source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java @ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess; MongoDBAccess now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, even though authentication should be working, since I'm following the online examples for using the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but if someone one day wants to run this against a mongodb where authentication is set up (the way TSG set it up for the mongodb they gave me access to), it still wouldn't work.
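For reference, a minimal sketch of an authenticated connection using the Credential-object pattern with the MongoDB Java driver's legacy API (3.6+). The host, port, user, password and database name below are placeholders, and this is not necessarily how MongoDBAccess does it. One common cause of an Unauthorized error on listCollections() is creating the credential against the wrong authentication database (the database the user was defined in, often "admin"), or the user lacking a role on the target database.

import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoDatabase;

public class MongoAuthSketch {
    public static void main(String[] args) {
        // The second argument is the authentication database: the database the
        // user was created in (often "admin"), not necessarily the one queried.
        MongoCredential credential = MongoCredential.createCredential(
            "crawluser", "admin", "change-me".toCharArray());

        MongoClient client = new MongoClient(
            new ServerAddress("localhost", 27017),
            credential,
            MongoClientOptions.builder().build());
        try {
            MongoDatabase database = client.getDatabase("crawl_data");
            // If the user lacks privileges on crawl_data, this is typically
            // where the Unauthorized error surfaces.
            for (String collectionName : database.listCollectionNames()) {
                System.out.println(collectionName);
            }
        } finally {
            client.close();
        }
    }
}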

File size: 13.1 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by Nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori-language
 * paragraphs contained in a page when the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text, as sketched below.
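 *
 * A rough sketch of that alternative (hypothetical, not implemented in this class;
 * it reuses MaoriTextDetector.isTextInMaori(BufferedReader) and assumes the body
 * text has already been split into "lines"):
 * <pre>
 *   String[] lines = text.split("\n");
 *   for (int i = 0; i + 1 < lines.length; i++) { // sliding window of 2 "lines"
 *       String pair = lines[i] + "\n" + lines[i + 1];
 *       if (maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
 *           // found a promising Maori paragraph candidate to store in the DB
 *       }
 *   }
 * </pre>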
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd maori-lang-detection/src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to the number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; // when the crawl of the site terminated

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; // 2-letter country code
    private boolean urlContainsLangCodeInPath = false; // whether any URL on this site contains /mi(/) in its path

    private String domainOfSite;
    private int numPagesInMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

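    // Illustrative (hypothetical) shape of a dump.txt record, since the real field
    // layout is handled by TextDumpPage: a record begins at the start of the file or
    // after a blank line, with a line starting with the page's URL,
    // e.g. "https://example.org/mi/page.html", followed by that page's metadata
    // fields and body text.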
    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or the very start of the file with a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }


    /** A NutchTextDumpToMongoDB instance processes the dump.txt for one site. */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;
            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // Only outside of a page's body text does an empty line mark the end of a page
                // in nutch's text dump of a site. Note that there can be an empty line (or more)
                // between the start and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish the old pageDump and begin a new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);

                        // record whether this page's URL contains /mi(/), so that every
                        // page is checked, not just the final one (see the matching check
                        // after this loop)
                        String url = page.getPageURL();
                        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                            this.urlContainsLangCodeInPath = true;
                        }
                        pageDump = null;
                    }

                    // begin a new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) { // guard against content before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers

                prevLine = line;
            }

            // process the final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // pageDump is null if the dump file contained no page records
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;

                // for every site, we just need to work out if any of its pages
                // contains /mi(/) in its URL (earlier pages were checked in the loop above)
                String url = page.getPageURL();
                if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                    this.urlContainsLangCodeInPath = true;
                }
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(mongodbAccess);
    }

    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // cumulative count of webpages across all sites
                countOfWebPagesWithBodyText++; // count for this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences);
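                // ("overlapping" presumably means consecutive-sentence pairs, in the
                // spirit of the two-"lines"-at-a-time idea in the class comment above;
                // see MaoriTextDetector for the exact definition)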

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER,
                                                                          SITE_COUNTER,
                                                                          isMRI,
                                                                          totalSentences,
                                                                          singleSentences,
                                                                          overlappingSentences);

                mongodbAccess.insertWebpageInfo(webpage);
            }
        }
    }

    /*
    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori");
            logger.info("But still contained sentences detected as Māori");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }
    */

    public void websiteDataToDB() {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //    LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //    date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
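        // (If re-enabled, the conversion above would also need imports for java.time.Instant,
        // java.time.ZoneId and java.time.format.DateTimeFormatter; only java.time.LocalDateTime
        // is imported at present.)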

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content
            // was not a near one-off occurrence on the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        //File geoLiteCityDatFile = new File(this.getClass().getResource("GeoLiteCity.dat").getFile());
        //this.geoLocationCountryCode = getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(SITE_COUNTER, this.siteID, this.domainOfSite,
                                              totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
                                              this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                                              this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        mongodbAccess.insertWebsiteInfo(website);
    }

    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN --------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;

        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {
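            // (Note: using MongoDBAccess in try-with-resources requires it to
            // implement java.lang.AutoCloseable.)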

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                            mongodb, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}