source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the examples online to use the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where the authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

File size: 16.5 KB
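Regarding the authentication failure described in the commit message above: a minimal sketch of an authenticated connection with the MongoDB 3.x legacy Java driver is given below. Host, port, user, password and database names are placeholders, not values from this repository. One common cause of the exact symptom described (the client "connects", then listCollections() fails with Unauthorized) is constructing the credential against the wrong authentication database: the second argument to createCredential() must name the database the user was created in (often "admin"), not the database being queried.

import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;

public class MongoAuthSketch {
    public static void main(String[] args) {
        // Second argument is the AUTHENTICATION database (where the user was
        // created, often "admin"), not the database being queried. Getting this
        // wrong still "connects", because the driver authenticates lazily, but
        // the first real command, e.g. listCollections(), fails as Unauthorized.
        MongoCredential credential = MongoCredential.createCredential(
                "someUser", "admin", "somePassword".toCharArray());

        MongoClient client = new MongoClient(
                new ServerAddress("localhost", 27017),
                credential, MongoClientOptions.builder().build());

        MongoDatabase database = client.getDatabase("someDatabase");
        for (Document collection : database.listCollections()) {
            System.out.println(collection.toJson());
        }
        client.close();
    }
}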
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single
 * language. If a page is in multiple languages, there's no way to be sure there aren't
 * promising Maori-language paragraphs contained in it when the majority, or the remainder,
 * of the page happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
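 * A minimal sketch of that two-lines-at-a-time idea (hypothetical, not implemented in
 * this class; it assumes the String overload isTextInMaori(String) that is used further
 * below, and a pageBodyText string holding one page's dumped text):
 *
 *   BufferedReader reader = new BufferedReader(new StringReader(pageBodyText));
 *   String line1, line2;
 *   while((line1 = reader.readLine()) != null) {
 *       line2 = reader.readLine();
 *       String pair = (line2 == null) ? line1 : line1 + "\n" + line2;
 *       if(maoriTxtDetector.isTextInMaori(pair)) {
 *           // store this promising pair of lines in the DB
 *       }
 *   }
 *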
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToCSV {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName());

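    /** When true, debugPageDump() logs each parsed page record in full. main() sets this to false. */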
    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to number of times NutchTextDumpToCSV constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;
    static private long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but nevertheless
     * contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;

    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

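    /** Logs the accumulated raw dump of a single web page record, but only when DEBUG_MODE is on. */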
    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToCSV processes the dump.txt for one site */
    public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                              MaoriTextDetector maoriTxtDetector, String siteID,
                              File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToCSV instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
             ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // Only when outside of a page's body text does an empty line mark the end
                // of a page in nutch's text dump of a site.
                // Note, though, that there can be one or more empty lines between the start
                // and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // the null check guards against dump files that don't begin with a URL line
                else if(pageDump != null && !line.equals("")) {
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // guard against an empty dump.txt
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /* UNUSED */
    /** pageID: id into pages array */
    /*
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }
    */

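    /**
     * Runs the Māori text detector over every parsed page of this site,
     * populates pagesInMRI and pagesContainingMRI, and, for each page with any
     * MRI content, writes one row to the webpages CSV plus one row per detected
     * MRI sentence to the mri-sentences CSV.
     */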
    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                //page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                //page.addMRILanguageStatus(isMRI);

                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // first element of ArrayList returned is always total num sentences on page;
                // remaining elements are the actual sentences that were detected as being Māori
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;

                // Add page to list of MRI pages if the page's body text overall was detected
                // as Māori.
                // Add page to list of pages containing MRI if >= 1 sentences in the page
                // were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not primary key by itself,
                    // must be combined with siteID to form primary key

                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                                                  SITE_COUNTER, /* alternative: this.siteID */
                                                  url,
                                                  //"origCharEncoding", "modifiedTime", "fetchTime",
                                                  page.getOriginalCharEncoding(),
                                                  page.getModifiedTime(),
                                                  page.getFetchTime(),
                                                  isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether they come from webpages that are MRI overall or only from those
                    // that contain some sentences in MRI
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        // reference this webpage's own ID, not the already-incremented WEBPAGE_COUNTER
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

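    /** Logs per-site statistics: total pages, pages detected as MRI overall,
     * and pages that merely contain some sentences in MRI. */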
    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }

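    /** Writes this site's one-row summary to websites.csv. If the site's crawl was left
     * unfinished but more than 2 pages were in MRI or contained MRI, the site is flagged
     * for re-crawling. */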
    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //    LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //    date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // arbitrary threshold, but we need some indication that the MRI content
            // in the website was more than a one-off
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv CSV file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // nutchCrawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>");
    }

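    /**
     * Expects one argument, the path to the "crawled" folder. Creates websites.csv,
     * webpages.csv and mri-sentences.csv in that folder, then processes each site
     * subfolder's dump.txt in alphabetical order, writing one websites.csv row per site.
     */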
    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToCSV.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
             CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
             ) {

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/,
                    "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                    "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
                    "origCharEncoding", "modifiedTime", "fetchTime",
                    "isMRI", "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV(
                                webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToCSV instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}