source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java @ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified-time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 16.5 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by Nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori-language
 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
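 * A minimal sketch of that sliding-window idea (a hypothetical helper, not part of
 * this class; it assumes the MaoriTextDetector.isTextInMaori(String) overload that
 * this class already uses elsewhere):
 * <pre>{@code
 * // Collect overlapping two-"line" windows that the detector flags as Maori,
 * // so short Maori passages inside a mostly-English page are not missed.
 * private ArrayList<String> findMaoriWindows(ArrayList<String> lines) {
 *     ArrayList<String> hits = new ArrayList<String>();
 *     for (int i = 0; i + 1 < lines.size(); i++) {
 *         String window = lines.get(i) + " " + lines.get(i + 1);
 *         if (maoriTxtDetector.isTextInMaori(window)) {
 *             hits.add(window);
 *         }
 *     }
 *     return hits;
 * }
 * }</pre>
 *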
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpProcessor {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for the number of sites.
     * Should be equal to the number of times the NutchTextDumpProcessor constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;
    static private long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but which
     * nevertheless contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;

    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in Nutch's text dump of an entire site
        // is denoted by an empty line followed by a URL (with protocol),
        // or by a URL at the very start of the file.
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
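
    /* A hypothetical dump.txt fragment illustrating the record boundary that
     * isStartOfNewWebPageRecord() keys on (the URLs and the field/body content
     * below are made up; the actual fields are whatever Nutch's dump emits and
     * are parsed later by TextDumpPage):
     *
     *   http://example.org.nz/page1
     *   ...metadata fields and body text of page1...
     *
     *   http://example.org.nz/page2
     *   ...metadata fields and body text of page2...
     *
     * i.e. an empty line (or the start of the file) followed by a line beginning
     * with http:// or https:// starts a new record.
     */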

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpProcessor processes the dump.txt for one site */
    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment the static counter of sites processed by a NutchTextDumpProcessor instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // iff outside of a page's body text, an empty line marks the end of a page
                // in Nutch's text dump of a site.
                // Note, however, that there can be one or more empty lines between the start
                // and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish the old pageDump and begin a new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in Nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;

                    }

                    // begin a new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(!line.equals("") && pageDump != null) {
                    // the null check guards against stray content before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process the final webpage record, if any
            // (pageDump is still null if the dump file was empty):
            if(pageDump != null) {
                //debugPageDump(pageDump);

                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /** pageID: index into the pages array */
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }

    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                page.addMRILanguageStatus(isMRI);


                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // The first element of the returned ArrayList is always the total number of
                // sentences on the page; the remaining elements are the actual sentences
                // that were detected as being in Māori.
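                // A hypothetical example of that return shape (values made up):
                //   ["17", "He rā pai tēnei.", "Kia ora koutou."]
                // would give totalSentences == 17 and numSentencesInMRI == 2 below.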
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;

                // Add the page to the list of MRI pages if the page's body text overall
                // was detected as Māori.
                // Add the page to the list of pages containing MRI if >= 1 sentences in
                // the page were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not a primary key by itself,
                    // must be combined with siteID to form the primary key

                    // Capture the webpage ID before incrementing, so that the sentence
                    // records below reference the same webpageID as this page's row
                    // (using WEBPAGE_COUNTER after the increment would point at the next page).
                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                        SITE_COUNTER, /* alternative: this.siteID */
                        url,
                        //"origCharEncoding", "modifiedTime", "fetchTime",
                        page.getOriginalCharEncoding(),
                        page.getModifiedTime(),
                        page.getFetchTime(),
                        isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether from webpages that are MRI overall or only from those containing
                    // any sentences in MRI.
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element is not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form the primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence:");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori,");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }


    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // Arbitrary decision, but we need some indication that the MRI content was
            // not close to a one-off in the website.
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv CSV file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // nutchCrawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
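        // A hypothetical example of the resulting websites.csv row (all values made up):
        //   1,00020,http://example.org.nz,52,3,7,1573000000000,false,true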
    }


    // --------------- STATIC METHODS USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpProcessor.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
             CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
        ) {

            // print out the column headers for each csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID" /*site folder name*/,
                "domainURL", "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
                "origCharEncoding", "modifiedTime", "fetchTime",
                "isMRI", "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort the site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
                            webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Print stats on each web page's detected language being MRI or not,
                        // and how many pages there were in the site in total.
                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpProcessor instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}