source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the examples online to use the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where the authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

File size: 16.5 KB
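Regarding the authentication failure described in the commit message above: a minimal sketch of an authenticated connection with the MongoDB 3.x legacy Java driver is given below. Host, port, user, password and database names are placeholders, not values from this repository. One common cause of the exact symptom described (the client "connects", then listCollections() fails with Unauthorized) is constructing the credential against the wrong authentication database: the second argument to createCredential() must name the database the user was created in (often "admin"), not the database being queried.

import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;

public class MongoAuthSketch {
    public static void main(String[] args) {
        // Second argument is the AUTHENTICATION database (where the user was
        // created, often "admin"), not the database being queried. Getting this
        // wrong still "connects", because the driver authenticates lazily, but
        // the first real command, e.g. listCollections(), fails as Unauthorized.
        MongoCredential credential = MongoCredential.createCredential(
                "someUser", "admin", "somePassword".toCharArray());

        MongoClient client = new MongoClient(
                new ServerAddress("localhost", 27017),
                credential, MongoClientOptions.builder().build());

        MongoDatabase database = client.getDatabase("someDatabase");
        for (Document collection : database.listCollections()) {
            System.out.println(collection.toJson());
        }
        client.close();
    }
}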
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single
 * language. If a page is in multiple languages, there's no way to be sure there aren't
 * promising Maori-language paragraphs contained in it when the majority, or the remainder,
 * of the page happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
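 * A minimal sketch of that two-lines-at-a-time idea (hypothetical, not implemented in
 * this class; it assumes the String overload isTextInMaori(String) that is used further
 * below, and a pageBodyText string holding one page's dumped text):
 *
 *   BufferedReader reader = new BufferedReader(new StringReader(pageBodyText));
 *   String line1, line2;
 *   while((line1 = reader.readLine()) != null) {
 *       line2 = reader.readLine();
 *       String pair = (line2 == null) ? line1 : line1 + "\n" + line2;
 *       if(maoriTxtDetector.isTextInMaori(pair)) {
 *           // store this promising pair of lines in the DB
 *       }
 *   }
 *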
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToCSV {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName());

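    /** When true, debugPageDump() logs each parsed page record in full. main() sets this to false. */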
    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to number of times NutchTextDumpToCSV constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;
    static private long MRI_SENTENCE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    /** list of pages in this site which were detected as being in MRI */
    private ArrayList<MRIWebPageStats> pagesInMRI;
    /**
     * list of pages in this site which were NOT detected as being in MRI but nevertheless
     * contain one or more sentences in MRI
     */
    private ArrayList<MRIWebPageStats> pagesContainingMRI;

    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

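    /** Logs the accumulated raw dump of a single web page record, but only when DEBUG_MODE is on. */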
    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToCSV processes the dump.txt for one site */
    public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
                              MaoriTextDetector maoriTxtDetector, String siteID,
                              File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToCSV instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
             ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // Only when outside of a page's body text does an empty line mark the end
                // of a page in nutch's text dump of a site.
                // Note, though, that there can be one or more empty lines between the start
                // and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // the null check guards against dump files that don't begin with a URL line
                else if(pageDump != null && !line.equals("")) {
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // guard against an empty dump.txt
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    }

    /* UNUSED */
    /** pageID: id into pages array */
    /*
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }
    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }
    public int getNumPagesInMRI() {
        return pagesInMRI.size();
    }
    public int getNumPagesContainingMRI() {
        return pagesContainingMRI.size();
    }
    */

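    /**
     * Runs the Māori text detector over every parsed page of this site,
     * populates pagesInMRI and pagesContainingMRI, and, for each page with any
     * MRI content, writes one row to the webpages CSV plus one row per detected
     * MRI sentence to the mri-sentences CSV.
     */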
    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
        pagesInMRI = new ArrayList<MRIWebPageStats>();
        pagesContainingMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                //page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);

                //page.addMRILanguageStatus(isMRI);

                // Even if the entire page is not found to be overall in Māori,
                // let's still inspect the sentences of the page and count how many (if any)
                // are in te reo.
                ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
                // first element of ArrayList returned is always total num sentences on page;
                // remaining elements are the actual sentences that were detected as being Māori
                int totalSentences = Integer.parseInt(mriSentences.get(0));
                int numSentencesInMRI = mriSentences.size() - 1;

                // Add page to list of MRI pages if the page's body text overall was detected
                // as Māori.
                // Add page to list of pages containing MRI if >= 1 sentences in the page
                // were detected as being in MRI.
                if(isMRI || numSentencesInMRI >= 1) {
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
                                                                       totalSentences, numSentencesInMRI);
                    if(isMRI) {
                        pagesInMRI.add(MRIpageStats);
                    } else if(numSentencesInMRI >= 1) {
                        pagesContainingMRI.add(MRIpageStats);
                    }

                    // Only write to the webpages csv file for those pages that had any MRI
                    // language content.
                    // column headers:
                    // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
                    //int pageID = i; // not primary key by itself,
                    // must be combined with siteID to form primary key

                    long webpageID = WEBPAGE_COUNTER++;
                    webpageCSVPrinter.printRecord(webpageID,
                                                  SITE_COUNTER, /* alternative: this.siteID */
                                                  url,
                                                  //"origCharEncoding", "modifiedTime", "fetchTime",
                                                  page.getOriginalCharEncoding(),
                                                  page.getModifiedTime(),
                                                  page.getFetchTime(),
                                                  isMRI, totalSentences, numSentencesInMRI);

                    // Write the sentences that are in te reo into the mri-sentences CSV file,
                    // whether they come from webpages that are MRI overall or only from those
                    // that contain some sentences in MRI
                    for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence
                        //int sentenceID = j; // combine with siteID and pageID to form primary key
                        String mriSentence = mriSentences.get(j);
                        // sentenceID, pageID, sentence
                        //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
                        // reference this webpage's own ID, not the already-incremented WEBPAGE_COUNTER
                        mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, webpageID, mriSentence);
                    }
                }

            }
        }
    }

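    /** Logs per-site statistics: total pages, pages detected as MRI overall,
     * and pages that merely contain some sentences in MRI. */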
    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori");
            logger.info("but still contained sentences detected as Māori:");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }

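    /** Writes this site's one-row summary to websites.csv. If the site's crawl was left
     * unfinished but more than 2 pages were in MRI or contained MRI, the site is flagged
     * for re-crawling. */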
    public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //    LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //    date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;
        int numPagesInMRI = pagesInMRI.size();
        int numPagesContainingMRI = pagesContainingMRI.size();

        if(this.siteCrawlUnfinished) {
            // arbitrary threshold, but we need some indication that the MRI content
            // in the website was more than a one-off
            if(numPagesInMRI > 2 || numPagesContainingMRI > 2) {
                redoCrawl = true;
            }
        }

        // websites.csv CSV file row:
        // ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI,
        // nutchCrawlTimestamp, crawlUnfinished, redoCrawl
        websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite,
                                       pages.size(), numPagesInMRI, numPagesContainingMRI,
                                       this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>");
    }

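    /**
     * Expects one argument, the path to the "crawled" folder. Creates websites.csv,
     * webpages.csv and mri-sentences.csv in that folder, then processes each site
     * subfolder's dump.txt in alphabetical order, writing one websites.csv row per site.
     */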
    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToCSV.DEBUG_MODE = false;

        File websitesCSVFile = new File(sitesDir, "websites.csv");
        File webpagesCSVFile = new File(sitesDir, "webpages.csv");
        File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");

        try (
             CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
             CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
             ) {

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/,
                    "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
                    "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
            webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL",
                    "origCharEncoding", "modifiedTime", "fetchTime",
                    "isMRI", "numSentences", "numSentencesInMRI");
            mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV(
                                webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        nutchTxtDump.printSiteStats();
                        nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter);
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToCSV instance
            // or when writing to a CSV file
            logger.error(e.getMessage(), e);
        }
    }
}