source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33615

package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime; // used by the commented-out date conversion in writeSiteRecordToCSV()
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;
11
12
/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure a page doesn't contain promising
 * Maori-language paragraphs when the majority/the remainder of the page happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
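 *
 * A minimal sketch of that two-lines-at-a-time idea (a hypothetical helper, not used
 * anywhere in this class yet; it assumes the MaoriTextDetector.isTextInMaori(String)
 * overload that prepareSiteStats() below already uses):
 * <pre>{@code
 * // Returns true if any window of two consecutive lines is detected as being in Maori.
 * static boolean anyLinePairInMaori(MaoriTextDetector detector, java.util.List<String> lines) {
 *     for (int i = 0; i + 1 < lines.size(); i++) {
 *         String window = lines.get(i) + "\n" + lines.get(i + 1);
 *         if (detector.isTextInMaori(window)) {
 *             return true;
 *         }
 *     }
 *     return false;
 * }
 * }</pre>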
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd ../src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
 *
 * or:
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpProcessor {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for the number of sites.
     * Should be equal to the number of times the NutchTextDumpProcessor constructor
     * is called: once per site.
     */
    private static int SITE_COUNTER = 0;
    private static long WEBPAGE_COUNTER = 0;
    private static long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated. */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but nevertheless
     * contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;
81
    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or by the very start of the file followed by a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
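
    // Illustrative sketch of the record boundary detected by the method above (not a
    // verbatim dump.txt excerpt; the exact per-page metadata fields are parsed by TextDumpPage):
    //
    //   ...last line of the previous page's text
    //                                              <-- empty line: previous record has ended
    //   https://example.org/some-page.html ...     <-- protocol-prefixed line: new record starts
    //   ...metadata and body text of the new page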
89
    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }
    /** A NutchTextDumpProcessor processes the dump.txt for one site. */
    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpProcessor instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // Iff outside of a page's body text, an empty line marks the end of a page
                // in nutch's text dump of a site.
                // But note: there can be an empty line (or more?) between the start and end
                // markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) {
                    // null check guards against any stray text appearing before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process the final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // null check guards against an empty dump.txt
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /** pageID: id into pages array */
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }
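
    // Hypothetical usage sketch for the per-page accessors above (nothing in this file
    // calls them this way; shown only to illustrate the API):
    //   NutchTextDumpProcessor site = ...; // constructed as in main() below
    //   for (int i = 0; i < site.totalNumPages(); i++) {
    //       if (site.isPageInMaori(i)) {
    //           System.out.println(site.getURLForPage(i) + " was detected as being in Maori");
    //       }
    //   }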
230
    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                page.addMRILanguageStatus(isMRI);

                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // the first element of the ArrayList returned is always the total num sentences on the page;
                // the remaining elements are the actual sentences that were detected as being Māori
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;
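                // e.g. for a page with 10 sentences of which 2 were detected as Māori,
                // getAllSentencesInMaori() would return ["10", "<sentence A>", "<sentence B>"],
                // giving totalSentences = 10 and numSentencesInMRI = 2 (illustrative values).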

                // Add page to list of MRI pages if the page's body text overall was detected
                // as Māori.
                // Add page to list of pages containing MRI if >= 1 sentences in the page
                // were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not primary key by itself,
                    // must be combined with siteID to form primary key

                    // Capture the webpage ID before the post-increment so that the sentence
                    // records below reference the same webpageID as this webpage's own record.
                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                                                  SITE_COUNTER, /* alternative: this.siteID */
                                                  url, isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether from webpages that are MRI overall or only from those containing
                    // some sentences in MRI.
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element is not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following pages were detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence:");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info("   -----------   ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori,");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info("   -----------   ");
    }


    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // To print the epoch-millis timestamp as a human-readable date string instead
        // (would also need java.time.Instant, java.time.ZoneId and
        // java.time.format.DateTimeFormatter imports), see
        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content was not a near one-off in the website
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // crawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpProcessor.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
            CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
            CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
            CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
        ) {

            // print out the column headers for each csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID" /*site folder name*/,
                                           "domainURL", "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                                           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
                                           "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
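
            // Illustrative example of data rows that would follow these header rows
            // (made-up values, not output from a real crawl); note that each sentence row's
            // webpageID references the webpage row the sentence came from:
            //   websites.csv:      1,00001,https://example.org,52,3,7,1575158400000,true,true
            //   webpages.csv:      0,1,https://example.org/page.html,true,14,9
            //   mri-sentences.csv: 0,0,He rā whakahirahira tēnei.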
398
            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();
            if(sites == null) { // listFiles() returns null if an I/O error occurs
                logger.error("Unable to list the contents of directory " + sitesDir);
                return;
            }

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
                            webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.
                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating the MaoriTextDetector instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}