source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@ 33800

Last change on this file since 33800 was 33800, checked in by ak19, 4 years ago

Removed an adult site from crawled contents and added its url to blacklist conf file (for if ever anyone crawls our MRI set of common crawl sites again)

File size: 13.8 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
 * paragraphs contained in a page when the majority/remainder happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
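 * A minimal sketch of that two-sentences-at-a-time idea (hypothetical: not what this class
 * currently does, and the StringReader wrapping is an assumption about how to feed a String
 * into the detector's BufferedReader overload):
 *
 *    String[] sentences = maoriTxtDetector.getAllSentences(text);
 *    for(int i = 0; i < sentences.length - 1; i++) {
 *        String pair = sentences[i] + " " + sentences[i+1];
 *        // pass each consecutive pair of sentences through the detector
 *        if(maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
 *            // store this paragraph-sized chunk as likely Maori
 *        }
 *    }
 *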
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; /** 2 letter country code */
    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */

    private String domainOfSite;
    private int numPagesInMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;



    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if storing too many, as we store this info per sentence
     * and a long text document presumably becomes a very large MongoDB document. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
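    // Hypothetical illustration of why larger values blow up document size: with
    // NUM_TOP_LANGUAGES = 3, each stored sentence might carry something like
    //   { sentence: "...", languages: [ {"mri": 0.82}, {"eng": 0.11}, {"tgl": 0.03} ] }
    // and a page with hundreds of sentences multiplies that up. (Field names and shape
    // here are assumptions; the real shape is defined by SentenceInfo/LanguageInfo.)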


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
            MaoriTextDetector maoriTxtDetector, String siteID,
            File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // iff outside of a page's body text, an empty line marks the end of a page
                // in nutch's text dump of a site.
                // Note, though, that there can be an empty line (or more?) between the
                // start and end markers of a page's text.
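
                // Illustrative (hypothetical) shape of one page record in dump.txt, inferred
                // from the parsing logic below (exact marker strings are handled by TextDumpPage):
                //   https://site.example/some/page.html
                //   <metadata field lines>
                //   <text start marker>
                //   <body text, possibly containing empty lines>
                //   <text end marker>
                //   <empty line, then the next record's URL line>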

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;

                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) {
                    // (null check guards against a malformed dump.txt that doesn't open with a URL line)
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;

                // for every site, we just need to work out if any of its pages
                // contains /mi(/) in its URL
                String url = page.getPageURL();
                if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                    this.urlContainsLangCodeInPath = true;
                }
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
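        // For illustration (hypothetical URL): with the boolean set to true, a first-page
        // URL like https://www.example.nz/mi/index.html would presumably yield the
        // domain https://www.example.nz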
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }


        prepareSiteStats(mongodbAccess);
    }


    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);
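                // getAllOverlappingSentencesInfo presumably scores consecutive sentence pairs,
                // in the spirit of the class comment above about running the detector over
                // two "lines" at a time.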

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                        this.siteID/*SITE_COUNTER*/,
                        isMRI,
                        totalSentences,
                        singleSentences,
                        overlappingSentences);


                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }


                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }


    public void websiteDataToDB() {


        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // (if reinstated, this would also need imports for java.time.Instant, java.time.ZoneId
        // and java.time.format.DateTimeFormatter)
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content was not just a one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);
            this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so it should also be UNKNOWN, not null
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
                totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
                this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;


        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }

                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                                mongodb, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or when working with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}