source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@ 33909

Last change on this file since 33909 was 33909, checked in by ak19, 4 years ago
  1. Implementing tables 3 to 5.
  2. Rolled back the introduction of the basicDomain field (domain stripped of http/https and www prefixes), as the code can create and sort this field alphabetically, whereas it didn't sort properly in mongodb.
  3. The code now sorts the domains stripped of protocol and www for the mongodb queries producing domain results, and ensures the domain list is unique.
  4. Split the MongoDBAccess class into two, with the connection code in MongoDBAccess.java and the querying code in MongoDBQueryer (a subclass of MongoDBAccess), which is so far exclusively used by WebPageURLsListing.java.
File size: 16.0 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;


/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single
 * language. If a page is in multiple languages, there's no way to be sure it doesn't contain
 * promising Maori language paragraphs when the majority or remainder happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
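 *
 * A minimal sketch of that two-sentences-at-a-time idea (hypothetical code, not what this
 * class currently does; it only reuses detector methods that prepareSiteStats() below calls):
 * <pre>{@code
 * String[] sentences = maoriTxtDetector.getAllSentences(text);
 * for (int i = 0; i + 1 < sentences.length; i++) {
 *     String window = sentences[i] + " " + sentences[i+1];   // sliding pair of sentences
 *     if (maoriTxtDetector.isTextInMaori(window)) {
 *         // store this pair of sentences as a promising Maori passage
 *     }
 * }
 * }</pre>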
 *
 * TO COMPILE OR RUN, FIRST DO:
 *   cd maori-lang-detection/apache-opennlp-1.9.1
 *   export OPENNLP_HOME=`pwd`
 *   cd maori-lang-detection/src
 *
 * TO COMPILE:
 *   maori-lang-detection/src$
 *     javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *   maori-lang-detection/src$
 *     java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *     java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class

    /** Counter for number of sites.
     * Should be equal to number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; /** 2 letter country code */
    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */

    private String domainOfSite;
    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
    private int numPagesInMRI = 0;
    private int numPagesContainingMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;


    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if we store too many, since this info is stored per sentence
     * and a long text presumably becomes a very large MongoDB document. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
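
    // For illustration, a hypothetical dump.txt fragment matching the boundary rule above
    // (the example.org URLs and the field/body placeholders are made up, not from a real dump):
    //
    //   https://example.org/page1    <- record starts: file start (prevLine == null) + URL
    //   ...metadata fields and body text of page1...
    //                                <- empty line outside body text ends the record
    //   https://example.org/page2    <- empty prevLine followed by a URL: next record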

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // If (and only if) we're outside of a page's body text, an empty line marks
                // the end of a page in nutch's text dump of a site.
                // But note that there can be an empty line (or more?) between the start
                // and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        inspectPageURLPath(page);
                        pageDump = null;

                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) { // null guard in case dump.txt doesn't start with a URL
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                inspectPageURLPath(page);
                pageDump = null;
            }


        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
            //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
        }
        else {
            this.domainOfSite = "UNKNOWN";
            //this.baseSiteDomain = "UNKNOWN";
        }

        /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
        // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
        for(TextDumpPage aPage : pages) {
            inspectPageURLPath(aPage);
        }
        */
        prepareSiteStats(mongodbAccess);
    }

    /** For every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
     * This method is called on each page of a site as the page is created. */
    private void inspectPageURLPath(TextDumpPage page) {
        String url = page.getPageURL();
        //logger.debug("@@@@ pageURL: " + url);

        if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
            // check if this page of the site contains /mi(/) or http(s)://mi in its URL path
            if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) {
                this.urlContainsLangCodeInPath = true;
            }
        }
    }
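
    /* A more structural way to express the URL check above, sketched with java.net.URI
     * instead of raw string matching (hypothetical alternative; this class does not use it):
     *
     *   URI uri = new URI(url);
     *   String host = uri.getHost();                          // e.g. "mi.example.org"
     *   boolean langCodeInURL = (host != null && host.startsWith("mi."))
     *       || Arrays.asList(uri.getPath().split("/")).contains("mi");  // any /mi path segment
     */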


    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                        this.siteID/*SITE_COUNTER*/,
                        isMRI,
                        totalSentences,
                        singleSentences,
                        overlappingSentences);


                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }


                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));
                if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
                    // Not sure if we can trust that a single sentence detected as Maori on a page is really Maori.
                    // But if at least 5 sentences are detected as Maori, the page is more likely to genuinely contain MRI?
                    numPagesContainingMRI++;
                }

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }
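
    /* If single-sentence detections prove too noisy, the containsMRI test above could
     * require both a minimum count and a minimum proportion of MRI sentences; a sketch,
     * where the thresholds 5 and 0.1 are hypothetical (5 echoes the comment above):
     *
     *   boolean containsMRI = numSentencesInMRI >= 5
     *       && numSentencesInMRI >= 0.1 * totalSentences;
     */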


    public void websiteDataToDB() {


        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);

            //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
            //    this.geoLocationCountryCode = "NZ";
            //}

            // Help identify the domain's country by inferring a country-code TLD when there
            // are 2 letters after the last period mark
            int periodIndex = domainOfSite.length()-3;
            // .com|org etc extensions that have 3 chars after the period mark will remain unknown
            // 2 letter extensions will be considered TLDs
            if(periodIndex >= 0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
                // has a 2 letter TLD. Make it uppercase to match return value of Utility.getCountryCodeOfDomain() above
                String TLD = domainOfSite.substring(periodIndex+1);
                this.geoLocationCountryCode = TLD.toUpperCase();
            } else {
                this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN not null
            }
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
                this.domainOfSite, //this.baseSiteDomain,
                totalPages, this.countOfWebPagesWithBodyText,
                this.numPagesInMRI, this.numPagesContainingMRI,
                this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }
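
    /* For reference, the 2-letter-TLD fallback in websiteDataToDB() could also be written
     * with lastIndexOf (a sketch only; behaviour matches for the 2-letter case handled above):
     *
     *   int dot = domainOfSite.lastIndexOf('.');
     *   String tld = (dot >= 0) ? domainOfSite.substring(dot + 1) : "";
     *   geoLocationCountryCode = (tld.length() == 2) ? tld.toUpperCase() : "UNKNOWN";
     */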


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;


        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }

                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                            mongodb, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating NutchTextDumpToMongoDB instance
            // or with CSV file
            logger.error(e.getMessage(), e);
        }
    }
}