Context Navigation

NutchTextDumpToMongoDB.java@ 33909

Last change on this file since 33909 was 33909, checked in by ak19, 4 years ago

1. Implementing tables 3 to 5. 2. Rolled back the introduction of the basicDomain field (domain stripped of http/https and www prefixes) as the code can create and sort this field alphabetically, whereas it didn't sort properly in mongodb. 3. The code now does sort the domains stripped of protocol and www for the mongodb queries producing domain results and ensures the domain list is unique. 4. Split the MongoDBAccess class into 2, with the connection code in MongoDBAccess.java and the querying code in MongoDBQueryer (a subclass of MongoDBAccess) that is so far exclusively used by WebPageURLsListing.java.

File size: 16.0 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.lang.ArrayIndexOutOfBoundsException;
5	import java.time.LocalDateTime;
6	import java.util.ArrayList;
7	import java.util.Arrays;
8
9	import org.apache.commons.csv.*;
10	import org.apache.log4j.Logger;
11
12	//import org.bson.types.ObjectId;
13
14	import org.greenstone.atea.morphia.*;
15
16
17	/**
18	* Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
19	* Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
20	* This reads in the dump.txt file contained in each site folder within the input folder.
21	* (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
22	* Each dump.txt could contain the text contents for an entire site, or for individual pages.
23	* This class then uses class TextDumpPage to parse each webpage within a dump.txt,
24	* which parses out the actual text body content of each webpage's section within a dump.txt.
25	* Finally, MaoriTextDetector is run over that to determine whether the full body text is
26	* likely to be in Maori or not.
27	*
28	* Potential issues: since a web page's text is dumped out by nutch with neither paragraph
29	* nor even newline separator, it's hard to be sure that the entire page is in language.
30	* If it's in multiple languages, there's no way to be sure there aren't promising Maori language
31	* paragraphs contained in a page, if the majority/the remainder happen to be in English.
32	*
33	* So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
34	* the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
35	* instead of running it over the entire html body's text.
36	*
37	* TO COMPILE OR RUN, FIRST DO:
38	* cd maori-lang-detection/apache-opennlp-1.9.1
39	* export OPENNLP_HOME=`pwd`
40	* cd maori-lang-detection/src
41	*
42	* TO COMPILE:
43	* maori-lang-detection/src$
44	* javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
45	*
46	* TO RUN:
47	* maori-lang-detection/src$
48	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
49	*
50	* or:
51	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
52	*
53	*/
54	public class NutchTextDumpToMongoDB {
/** Logger shared by all instances of this class. */
static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

/** Gates the output of debugPageDump(). */
static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class

/** Counter for number of sites.
 * Should be equal to number of times NutchTextDumpToMongoDB constructor
 * is called: once per site.
 */
static private int SITE_COUNTER = 0;
/** Cumulative count, over all sites, of web pages that had body text. */
static private long WEBPAGE_COUNTER = 0;

/** Language detector used to classify pages and sentences of this site. */
private final MaoriTextDetector maoriTxtDetector;
/** Access to the MongoDB datastore that webpage and website records are saved to. */
private final MongoDBAccess mongodbAccess;

/** Name of the site folder, e.g. "00020". */
public final String siteID;
/** Value of the siteCrawlUnfinished argument handed to the constructor. */
public final boolean siteCrawlUnfinished;
/** When the crawl of the site terminated */
public final long siteCrawledTimestamp;

/** Number of pages of this site alone that had body text. */
private int countOfWebPagesWithBodyText = 0;

/** 2 letter country code */
private String geoLocationCountryCode = null;
/** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
private boolean urlContainsLangCodeInPath = false;

/** Domain of the site including its protocol prefix, or "UNKNOWN" if the site has no pages. */
private String domainOfSite;
//private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
/** Number of pages whose full body text was detected as Maori. */
private int numPagesInMRI = 0;
/** Number of pages with at least one sentence detected as Maori. */
private int numPagesContainingMRI = 0;

/** keep a list to store the text of each page */
private ArrayList<TextDumpPage> pages;



/** Number of language and confidence results to return for storing in MongoDB
 * MongoDB runs out of space if storing too many, as we store this info per sentence
 * and a long text document becomes a very large MongoDB document presumably */
private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
92
93
94	private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
95	// The start of a new web page's record in nutch's text dump of an entire site
96	// is denoted by a newline followed by a URL (protocol)
97	// or the very start of the file with a URL (protocol)
98	return ((prevLine == null \|\| prevLine.equals(""))
99	&& (line.startsWith("http://") \|\| line.startsWith("https://")));
100	}
101
102	public void debugPageDump(StringBuilder pageDump) {
103	if(DEBUG_MODE) {
104	// START DEBUG
105	logger.debug("__________________________________________");
106	logger.debug("@@@ Found page entry: ");
107	logger.debug("__________________________________________");
108	logger.debug(pageDump.toString());
109	logger.debug("------------------------------------------");
110	// END DEBUG
111	}
112	}
113
114	/** A NutchTextDumpToMongoDB processes the dump.txt for one site */
115	public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
116	MaoriTextDetector maoriTxtDetector, String siteID,
117	File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
118	throws IOException
119	{
120	// increment static counter of sites processed by a NutchTextDumpToMongoDB instance
121	SITE_COUNTER++;
122
123	// siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
124	this.siteID = siteID;
125	this.siteCrawlUnfinished = siteCrawlUnfinished;
126	this.siteCrawledTimestamp = lastModified;
127
128	this.maoriTxtDetector = maoriTxtDetector;
129	this.mongodbAccess = mongodbAccess;
130
131	pages = new ArrayList<TextDumpPage>();
132
133	String line = null;
134	StringBuilder pageDump = null;
135	try (
136	BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
137	) {
138
139	boolean readingText = false;
140	String prevLine = null;
141
142	while((line = reader.readLine()) != null) { // readLine removes newline separator
143	line = line.trim();
144	// iff outside of a page's body text, then an empty line marks the end of a page
145	// in nutch's text dump of a site.
146	// But note, there can be an empty line (or more?) between the start and end
147	// markers of a page's text, though.
148
149	if(isStartOfNewWebPageRecord(prevLine, line)) {
150
151	if(pageDump != null) { // should also be the case then: if(prevLine != null)
152	// finish old pageDump and begin new one
153
154	//debugPageDump(pageDump);
155
156	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
157	// parses the fields and body text of a webpage in nutch's txt dump of entire site
158	//page.parseFields();
159	//page.getText();
160	pages.add(page);
161	inspectPageURLPath(page);
162	pageDump = null;
163
164	}
165
166	// begin new webpage dump
167	pageDump = new StringBuilder();
168	pageDump.append(line);
169	pageDump.append("\n");
170
171	}
172	else if(!line.equals("")) {
173	pageDump.append(line);
174	pageDump.append("\n");
175
176	}
177	// can throw away any newlines between text start and end markers.
178
179	prevLine = line;
180	}
181
182	// process final webpage record:
183	//debugPageDump(pageDump);
184
185	if(pageDump == null) {
186	logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
187	} else {
188	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
189	pages.add(page);
190	inspectPageURLPath(page);
191	pageDump = null;
192	}
193
194
195	} catch (IOException ioe) {
196	logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
197	}
198
199	// Just do this once: get and store domain of site.
200	// Passing true to get domain with protocol prefix
201	if(pages.size() > 0) {
202	TextDumpPage firstPage = pages.get(0);
203	String url = firstPage.getPageURL();
204	this.domainOfSite = Utility.getDomainForURL(url, true);
205	//this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
206	}
207	else {
208	this.domainOfSite = "UNKNOWN";
209	//this.baseSiteDomain = "UNKNOWN";
210	}
211
212	/* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
213	// For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
214	for(TextDumpPage aPage : pages) {
215	inspectPageURLPath(aPage);
216	}
217	*/
218	prepareSiteStats(mongodbAccess);
219	}
220
221	/** for every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
222	* This method is called on each page of a site as the page is created. */
223	private void inspectPageURLPath(TextDumpPage page) {
224	String url = page.getPageURL();
225	//logger.debug("@@@@ pageURL: " + url);
226
227	if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
228	// check if this page of the site contains /mi(/) or http(s)://mi in its URL path
229	if(url.contains("/mi/") \|\| url.endsWith("/mi") \|\| url.startsWith("https://mi.") \|\| url.startsWith("http://mi.")) {
230	this.urlContainsLangCodeInPath = true;
231	}
232	}
233	}
234
235
236	private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {
237
238	TextDumpPage page = null;
239	for(int i = 0; i < pages.size(); i++) {
240
241	page = pages.get(i);
242
243	String text = page.getPageText();
244
245	if(text.equals("")) {
246	// don't care about empty pages
247	continue;
248	}
249	else {
250	WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
251	countOfWebPagesWithBodyText++; // of this site alone
252
253	boolean isMRI = maoriTxtDetector.isTextInMaori(text);
254	if(isMRI) {
255	numPagesInMRI++;
256	}
257
258	String[] sentences = maoriTxtDetector.getAllSentences(text);
259	int totalSentences = sentences.length;
260	int numSentencesInMRI = 0;
261	ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
262	ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);
263
264	WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/new ObjectId()/,
265	this.siteID/SITE_COUNTER/,
266	isMRI,
267	totalSentences,
268	singleSentences,
269	overlappingSentences);
270
271
272	for(SentenceInfo si : singleSentences) {
273	//LanguageInfo bestLanguage = si.languagesInfo[0];
274	//if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
275	if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
276	numSentencesInMRI++;
277	}
278	}
279
280
281	webpage.setMRISentenceCount(numSentencesInMRI);
282	webpage.setContainsMRI((numSentencesInMRI > 0));
283	if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
284	// Not sure if we can trust that a single sentence detected as Maori on a page is really Maori
285	// But if at least 5 sentences are detected as Maori, it is more likely to be the case to be MRI?
286	numPagesContainingMRI++;
287	}
288
289	//mongodbAccess.insertWebpageInfo(webpage);
290	// Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
291	mongodbAccess.datastore.save(webpage);
292	}
293	}
294	}
295
296
297	public void websiteDataToDB() {
298
299
300	// https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
301	// LocalDateTime date =
302	// LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
303	// String crawlTimestamp =
304	// date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
305
306	boolean redoCrawl = false;
307
308	if(this.siteCrawlUnfinished) {
309	// arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
310	if(this.numPagesInMRI > 2) {
311	redoCrawl = true;
312	}
313	}
314
315	File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
316	try {
317	if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
318	this.geoLocationCountryCode = "UNKNOWN";
319	} else {
320	this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
321	}
322	} catch(Exception e) {
323	logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);
324
325	//if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
326	//this.geoLocationCountryCode = "NZ";
327	//}
328
329	// Help along identification of domain's country by construing TLDs if 2 letters after last period mark
330	int periodIndex = domainOfSite.length()-3;
331	// .com\|org etc extensions that have 3 chars afte period mark will remain unknown
332	// 2 letter extensions will be considered TLD
333	if(periodIndex >=0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
334	// has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above
335	String TLD = domainOfSite.substring(periodIndex+1);
336	this.geoLocationCountryCode = TLD.toUpperCase();
337	} else {
338	this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
339	}
340	}
341
342	int totalPages = pages.size();
343
344	WebsiteInfo website = new WebsiteInfo(/SITE_COUNTER,/ this.siteID,
345	this.domainOfSite, //this.baseSiteDomain,
346	totalPages, this.countOfWebPagesWithBodyText,
347	this.numPagesInMRI, this.numPagesContainingMRI,
348	this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
349	this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
350
351	//mongodbAccess.insertWebsiteInfo(website);
352	// Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
353	mongodbAccess.datastore.save(website);
354	}
355
356
357	// --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
358
359	public static void printUsage() {
360	System.err.println("Run this program as:");
361	System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
362	}
363
364	public static void main(String[] args) {
365	if(args.length != 1) {
366	printUsage();
367	return;
368	}
369
370	File sitesDir = new File(args[0]);
371	if(!sitesDir.exists() \|\| !sitesDir.isDirectory()) {
372	logger.error("Error: " + args[0] + " does not exist or is not a directory");
373	return;
374	}
375
376	NutchTextDumpToMongoDB.DEBUG_MODE = false;
377
378
379	try (
380	MongoDBAccess mongodb = new MongoDBAccess();
381	) {
382
383	mongodb.connectToDB();
384	//mongodb.showCollections();
385
386	// print out the column headers for the websites csv file
387	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
388	// OPTIONAL TODO: creating collections can be done here if dropping and recreating
389
390	MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
391	File[] sites = sitesDir.listFiles();
392
393	// sort site folders in alphabetical order
394	// https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
395	Arrays.sort(sites);
396
397	for(File siteDir : sites) { // e.g. 00001
398	if(siteDir.isDirectory()) {
399	// look for dump.txt
400	File txtDumpFile = new File(siteDir, "dump.txt");
401	if(!txtDumpFile.exists()) {
402	logger.error("Text dump file " + txtDumpFile + " did not exist");
403	continue;
404	}
405
406	else {
407	File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");
408
409	String siteID = siteDir.getName();
410	if(siteID.contains("_")) {
411	logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
412	continue;
413	}
414
415	long lastModified = siteDir.lastModified();
416	logger.debug("@@@ Processing siteID: " + siteID);
417	NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
418	mongodb, mriTxtDetector,
419	siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
420	// now it's parsed all the web pages in the site's text dump
421
422	// Let's print stats on each web page's detected language being MRI or not
423	// and how many pages there were in the site in total.
424
425	//nutchTxtDump.printSiteStats();
426
427	nutchTxtDump.websiteDataToDB();
428	}
429	}
430
431	}
432
433	} catch(Exception e) {
434	// can get an exception when instantiating NutchTextDumpToMongoDB instance
435	// or with CSV file
436	logger.error(e.getMessage(), e);
437	}
438	}
439	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@ 33909

Download in other formats: