source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@33983

Last change on this file since 33983 was 33983, checked in by ak19, 4 years ago

More sensible name for a method that had too long kept its old name from when all it did was track site- and page-level statistics

File size: 16.1 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
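 *
 * For example, the input layout described above looks like:
 *    crawled/
 *        00001/dump.txt
 *        00002/dump.txt
 *        ...
 *        01465/dump.txt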
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text. (A hypothetical, unused sketch of
 * this idea appears below as anyMaoriParagraphs().)
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd maori-lang-detection/src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *     java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class

    /** Counter for number of sites.
     * Should be equal to the number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    private static int SITE_COUNTER = 0;
    private static long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private int countOfWebPagesWithBodyText = 0;

    /** 2 letter country code */
    private String geoLocationCountryCode = null;
    /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
    private boolean urlContainsLangCodeInPath = false;

    private String domainOfSite;
    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
    private int numPagesInMRI = 0;
    private int numPagesContainingMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;


    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if storing too many, as we store this info per sentence,
     * and a long text document presumably becomes a very large MongoDB document. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or by the very start of the file followed by a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
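
    /** A minimal, unused sketch of the idea raised in the class comment above: instead of
     * classifying a whole page's body text at once, run
     * MaoriTextDetector.isTextInMaori(BufferedReader) over two "lines" at a time, so that
     * a few Maori paragraphs aren't drowned out by a mostly-English page.
     * Hypothetical: nothing in this class calls it, and it assumes the
     * isTextInMaori(BufferedReader) overload mentioned in the class comment accepts a
     * reader over any fragment of body text. */
    private boolean anyMaoriParagraphs(String bodyText) throws IOException {
        String[] lines = bodyText.split("\n");
        // a single-"line" page is just checked on its own
        if(lines.length == 1) {
            return maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(lines[0])));
        }
        // slide a window of two consecutive "lines" over the body text
        for(int i = 0; i + 1 < lines.length; i++) {
            String pair = lines[i] + "\n" + lines[i+1];
            if(maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
                return true;
            }
        }
        return false;
    }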

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile))
        ) {
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // Only outside of a page's body text does an empty line mark the end of a page
                // in nutch's text dump of a site. Note that there can be one or more empty
                // lines between the start and end markers of a page's text, though.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        inspectPageURLPath(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                else if(pageDump != null && !line.equals("")) {
                    // the null check guards against a malformed dump that doesn't open with
                    // a URL line, which would otherwise leave pageDump null here
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                inspectPageURLPath(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
            //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
        }
        else {
            this.domainOfSite = "UNKNOWN";
            //this.baseSiteDomain = "UNKNOWN";
        }

        /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
        // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
        for(TextDumpPage aPage : pages) {
            inspectPageURLPath(aPage);
        }
        */
        webPageDataToMongoDB(mongodbAccess);
    }


    /** For every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
     * This method is called on each page of a site as the page is created. */
    private void inspectPageURLPath(TextDumpPage page) {
        String url = page.getPageURL();
        //logger.debug("@@@@ pageURL: " + url);

        if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
            // check if this page of the site contains /mi(/) or http(s)://mi in its URL path
            if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) {
                this.urlContainsLangCodeInPath = true;
            }
        }
    }


    private void webPageDataToMongoDB(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                        this.siteID/*SITE_COUNTER*/,
                        isMRI,
                        totalSentences,
                        singleSentences,
                        overlappingSentences);

                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }

                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));
                if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
                    // Not sure we can trust that a single sentence detected as Maori on a page really is Maori,
                    // but if at least 5 sentences are detected as Maori, the page is more likely to genuinely contain MRI.
                    numPagesContainingMRI++;
                }

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }


    public void websiteDataToDB() {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);

            //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
            //    this.geoLocationCountryCode = "NZ";
            //}

            // Help identify the domain's country by treating any 2 letters after the last
            // period mark as a TLD. Extensions with 3 chars after the period mark
            // (.com|.org etc) will remain UNKNOWN; 2 letter extensions are taken as the TLD.
            int periodIndex = domainOfSite.length()-3;
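            // Illustrative (hypothetical) domain values, recalling that domainOfSite keeps
            // its protocol prefix:
            //   "https://example.nz"  -> charAt(length-3) == '.' -> country code "NZ"
            //   "https://example.org" -> charAt(length-3) == 'o' -> stays "UNKNOWN"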
            if(periodIndex >= 0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
                // has a 2 letter TLD. Make it uppercase to match the return value of Utility.getCountryCodeOfDomain() above
                String TLD = domainOfSite.substring(periodIndex+1);
                this.geoLocationCountryCode = TLD.toUpperCase();
            } else {
                this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN, not null
            }
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
                this.domainOfSite, //this.baseSiteDomain,
                totalPages, this.countOfWebPagesWithBodyText,
                this.numPagesInMRI, this.numPagesContainingMRI,
                this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;

        try (
             MongoDBAccess mongodb = new MongoDBAccess()
        ) {

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                                mongodb, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Print stats on each web page's detected language being MRI or not,
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}