source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java @ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified-time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's been reused. 3. MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 16.5 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by Nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori-language
 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
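 * A minimal sketch of that sliding-window idea (a hypothetical helper, not part of
 * this class; it assumes the MaoriTextDetector.isTextInMaori(String) overload that
 * this class already uses elsewhere):
 * <pre>{@code
 * // Collect overlapping two-"line" windows that the detector flags as Maori,
 * // so short Maori passages inside a mostly-English page are not missed.
 * private ArrayList<String> findMaoriWindows(ArrayList<String> lines) {
 *     ArrayList<String> hits = new ArrayList<String>();
 *     for (int i = 0; i + 1 < lines.size(); i++) {
 *         String window = lines.get(i) + " " + lines.get(i + 1);
 *         if (maoriTxtDetector.isTextInMaori(window)) {
 *             hits.add(window);
 *         }
 *     }
 *     return hits;
 * }
 * }</pre>
 *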
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpProcessor {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for the number of sites.
     * Should be equal to the number of times the NutchTextDumpProcessor constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;
    static private long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but which
     * nevertheless contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;

    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in Nutch's text dump of an entire site
        // is denoted by an empty line followed by a URL (with protocol),
        // or by a URL at the very start of the file.
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
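
    /* A hypothetical dump.txt fragment illustrating the record boundary that
     * isStartOfNewWebPageRecord() keys on (the URLs and the field/body content
     * below are made up; the actual fields are whatever Nutch's dump emits and
     * are parsed later by TextDumpPage):
     *
     *   http://example.org.nz/page1
     *   ...metadata fields and body text of page1...
     *
     *   http://example.org.nz/page2
     *   ...metadata fields and body text of page2...
     *
     * i.e. an empty line (or the start of the file) followed by a line beginning
     * with http:// or https:// starts a new record.
     */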

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpProcessor processes the dump.txt for one site */
    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment the static counter of sites processed by a NutchTextDumpProcessor instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // iff outside of a page's body text, an empty line marks the end of a page
                // in Nutch's text dump of a site.
                // Note, however, that there can be one or more empty lines between the start
                // and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish the old pageDump and begin a new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in Nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;

                    }

                    // begin a new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(!line.equals("") && pageDump != null) {
                    // the null check guards against stray content before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process the final webpage record, if any
            // (pageDump is still null if the dump file was empty):
            if(pageDump != null) {
                //debugPageDump(pageDump);

                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /** pageID: index into the pages array */
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }

    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                page.addMRILanguageStatus(isMRI);


                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // The first element of the returned ArrayList is always the total number of
                // sentences on the page; the remaining elements are the actual sentences
                // that were detected as being in Māori.
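                // A hypothetical example of that return shape (values made up):
                //   ["17", "He rā pai tēnei.", "Kia ora koutou."]
                // would give totalSentences == 17 and numSentencesInMRI == 2 below.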
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;

                // Add the page to the list of MRI pages if the page's body text overall
                // was detected as Māori.
                // Add the page to the list of pages containing MRI if >= 1 sentences in
                // the page were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not a primary key by itself,
                    // must be combined with siteID to form the primary key

                    // Capture the webpage ID before incrementing, so that the sentence
                    // records below reference the same webpageID as this page's row
                    // (using WEBPAGE_COUNTER after the increment would point at the next page).
                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                        SITE_COUNTER, /* alternative: this.siteID */
                        url,
                        //"origCharEncoding", "modifiedTime", "fetchTime",
                        page.getOriginalCharEncoding(),
                        page.getModifiedTime(),
                        page.getFetchTime(),
                        isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether from webpages that are MRI overall or only from those containing
                    // any sentences in MRI.
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element is not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form the primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence:");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori,");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }


    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // Arbitrary decision, but we need some indication that the MRI content was
            // not close to a one-off in the website.
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv CSV file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // nutchCrawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
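        // A hypothetical example of the resulting websites.csv row (all values made up):
        //   1,00020,http://example.org.nz,52,3,7,1573000000000,false,true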
    }


    // --------------- STATIC METHODS USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpProcessor.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
             CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
        ) {

            // print out the column headers for each csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID" /*site folder name*/,
                "domainURL", "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
                "origCharEncoding", "modifiedTime", "fetchTime",
                "isMRI", "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort the site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
                            webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Print stats on each web page's detected language being MRI or not,
                        // and how many pages there were in the site in total.
                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpProcessor instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}