source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33615

package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime; // used by the commented-out date conversion in writeSiteRecordToCSV()
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;
11
12
/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure a page doesn't contain promising
 * Maori-language paragraphs when the majority/the remainder of the page happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
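 *
 * A minimal sketch of that two-lines-at-a-time idea (a hypothetical helper, not used
 * anywhere in this class yet; it assumes the MaoriTextDetector.isTextInMaori(String)
 * overload that prepareSiteStats() below already uses):
 * <pre>{@code
 * // Returns true if any window of two consecutive lines is detected as being in Maori.
 * static boolean anyLinePairInMaori(MaoriTextDetector detector, java.util.List<String> lines) {
 *     for (int i = 0; i + 1 < lines.size(); i++) {
 *         String window = lines.get(i) + "\n" + lines.get(i + 1);
 *         if (detector.isTextInMaori(window)) {
 *             return true;
 *         }
 *     }
 *     return false;
 * }
 * }</pre>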
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd ../src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
 *
 * or:
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpProcessor {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for the number of sites.
     * Should be equal to the number of times the NutchTextDumpProcessor constructor
     * is called: once per site.
     */
    private static int SITE_COUNTER = 0;
    private static long WEBPAGE_COUNTER = 0;
    private static long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated. */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but nevertheless
     * contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;
81
    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or by the very start of the file followed by a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
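
    // Illustrative sketch of the record boundary detected by the method above (not a
    // verbatim dump.txt excerpt; the exact per-page metadata fields are parsed by TextDumpPage):
    //
    //   ...last line of the previous page's text
    //                                              <-- empty line: previous record has ended
    //   https://example.org/some-page.html ...     <-- protocol-prefixed line: new record starts
    //   ...metadata and body text of the new page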
89
    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }
    /** A NutchTextDumpProcessor processes the dump.txt for one site. */
    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpProcessor instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // Iff outside of a page's body text, an empty line marks the end of a page
                // in nutch's text dump of a site.
                // But note: there can be an empty line (or more?) between the start and end
                // markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) {
                    // null check guards against any stray text appearing before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process the final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // null check guards against an empty dump.txt
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /** pageID: id into pages array */
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }
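
    // Hypothetical usage sketch for the per-page accessors above (nothing in this file
    // calls them this way; shown only to illustrate the API):
    //   NutchTextDumpProcessor site = ...; // constructed as in main() below
    //   for (int i = 0; i < site.totalNumPages(); i++) {
    //       if (site.isPageInMaori(i)) {
    //           System.out.println(site.getURLForPage(i) + " was detected as being in Maori");
    //       }
    //   }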
230
    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                page.addMRILanguageStatus(isMRI);

                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // the first element of the ArrayList returned is always the total num sentences on the page;
                // the remaining elements are the actual sentences that were detected as being Māori
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;
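                // e.g. for a page with 10 sentences of which 2 were detected as Māori,
                // getAllSentencesInMaori() would return ["10", "<sentence A>", "<sentence B>"],
                // giving totalSentences = 10 and numSentencesInMRI = 2 (illustrative values).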

                // Add page to list of MRI pages if the page's body text overall was detected
                // as Māori.
                // Add page to list of pages containing MRI if >= 1 sentences in the page
                // were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not primary key by itself,
                    // must be combined with siteID to form primary key

                    // Capture the webpage ID before the post-increment so that the sentence
                    // records below reference the same webpageID as this webpage's own record.
                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                                                  SITE_COUNTER, /* alternative: this.siteID */
                                                  url, isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether from webpages that are MRI overall or only from those containing
                    // some sentences in MRI.
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element is not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following pages were detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence:");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info("   -----------   ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori,");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info("   -----------   ");
    }


    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // To print the epoch-millis timestamp as a human-readable date string instead
        // (would also need java.time.Instant, java.time.ZoneId and
        // java.time.format.DateTimeFormatter imports), see
        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content was not a near one-off in the website
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // crawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpProcessor.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
            CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
            CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
            CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
        ) {

            // print out the column headers for each csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID" /*site folder name*/,
                                           "domainURL", "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                                           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
                                           "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
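
            // Illustrative example of data rows that would follow these header rows
            // (made-up values, not output from a real crawl); note that each sentence row's
            // webpageID references the webpage row the sentence came from:
            //   websites.csv:      1,00001,https://example.org,52,3,7,1575158400000,true,true
            //   webpages.csv:      0,1,https://example.org/page.html,true,14,9
            //   mri-sentences.csv: 0,0,He rā whakahirahira tēnei.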
398
            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();
            if(sites == null) { // listFiles() returns null if an I/O error occurs
                logger.error("Unable to list the contents of directory " + sitesDir);
                return;
            }

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
                            webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.
                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating the MaoriTextDetector instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}