source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@33983

Last change on this file since 33983 was 33983, checked in by ak19, 4 years ago

More sensible name for a method that had too long kept its old name from when all it did was track site- and page-level statistics

File size: 16.1 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
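 *
 * For example, the input layout described above looks like:
 *    crawled/
 *        00001/dump.txt
 *        00002/dump.txt
 *        ...
 *        01465/dump.txt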
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text. (A hypothetical, unused sketch of
 * this idea appears below as anyMaoriParagraphs().)
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd maori-lang-detection/src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *     java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class

    /** Counter for number of sites.
     * Should be equal to the number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    private static int SITE_COUNTER = 0;
    private static long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    /** When the crawl of the site terminated */
    public final long siteCrawledTimestamp;

    private int countOfWebPagesWithBodyText = 0;

    /** 2 letter country code */
    private String geoLocationCountryCode = null;
    /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
    private boolean urlContainsLangCodeInPath = false;

    private String domainOfSite;
    //private String baseSiteDomain; // domainOfSite stripped of any http(s)://www.
    private int numPagesInMRI = 0;
    private int numPagesContainingMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;


    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if storing too many, as we store this info per sentence,
     * and a long text document presumably becomes a very large MongoDB document. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or by the very start of the file followed by a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
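
    /** A minimal, unused sketch of the idea raised in the class comment above: instead of
     * classifying a whole page's body text at once, run
     * MaoriTextDetector.isTextInMaori(BufferedReader) over two "lines" at a time, so that
     * a few Maori paragraphs aren't drowned out by a mostly-English page.
     * Hypothetical: nothing in this class calls it, and it assumes the
     * isTextInMaori(BufferedReader) overload mentioned in the class comment accepts a
     * reader over any fragment of body text. */
    private boolean anyMaoriParagraphs(String bodyText) throws IOException {
        String[] lines = bodyText.split("\n");
        // a single-"line" page is just checked on its own
        if(lines.length == 1) {
            return maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(lines[0])));
        }
        // slide a window of two consecutive "lines" over the body text
        for(int i = 0; i + 1 < lines.length; i++) {
            String pair = lines[i] + "\n" + lines[i+1];
            if(maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
                return true;
            }
        }
        return false;
    }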

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile))
        ) {
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // Only outside of a page's body text does an empty line mark the end of a page
                // in nutch's text dump of a site. Note that there can be one or more empty
                // lines between the start and end markers of a page's text, though.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        inspectPageURLPath(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                else if(pageDump != null && !line.equals("")) {
                    // the null check guards against a malformed dump that doesn't open with
                    // a URL line, which would otherwise leave pageDump null here
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                inspectPageURLPath(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
            //this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite);
        }
        else {
            this.domainOfSite = "UNKNOWN";
            //this.baseSiteDomain = "UNKNOWN";
        }

        /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
        // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
        for(TextDumpPage aPage : pages) {
            inspectPageURLPath(aPage);
        }
        */
        webPageDataToMongoDB(mongodbAccess);
    }


    /** For every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
     * This method is called on each page of a site as the page is created. */
    private void inspectPageURLPath(TextDumpPage page) {
        String url = page.getPageURL();
        //logger.debug("@@@@ pageURL: " + url);

        if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
            // check if this page of the site contains /mi(/) or http(s)://mi in its URL path
            if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) {
                this.urlContainsLangCodeInPath = true;
            }
        }
    }


    private void webPageDataToMongoDB(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                        this.siteID/*SITE_COUNTER*/,
                        isMRI,
                        totalSentences,
                        singleSentences,
                        overlappingSentences);

                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }

                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));
                if(numSentencesInMRI > 0) { // if(numSentencesInMRI >= 5) {
                    // Not sure we can trust that a single sentence detected as Maori on a page really is Maori,
                    // but if at least 5 sentences are detected as Maori, the page is more likely to genuinely contain MRI.
                    numPagesContainingMRI++;
                }

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }


    public void websiteDataToDB() {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);

            //if(this.domainOfSite.endsWith(".nz")) { // nz TLDs are worth counting
            //    this.geoLocationCountryCode = "NZ";
            //}

            // Help identify the domain's country by treating any 2 letters after the last
            // period mark as a TLD. Extensions with 3 chars after the period mark
            // (.com|.org etc) will remain UNKNOWN; 2 letter extensions are taken as the TLD.
            int periodIndex = domainOfSite.length()-3;
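            // Illustrative (hypothetical) domain values, recalling that domainOfSite keeps
            // its protocol prefix:
            //   "https://example.nz"  -> charAt(length-3) == '.' -> country code "NZ"
            //   "https://example.org" -> charAt(length-3) == 'o' -> stays "UNKNOWN"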
            if(periodIndex >= 0 && domainOfSite.charAt(periodIndex) == '.' && ((periodIndex+1) < domainOfSite.length())) {
                // has a 2 letter TLD. Make it uppercase to match the return value of Utility.getCountryCodeOfDomain() above
                String TLD = domainOfSite.substring(periodIndex+1);
                this.geoLocationCountryCode = TLD.toUpperCase();
            } else {
                this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so should also be UNKNOWN, not null
            }
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID,
                this.domainOfSite, //this.baseSiteDomain,
                totalPages, this.countOfWebPagesWithBodyText,
                this.numPagesInMRI, this.numPagesContainingMRI,
                this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;

        try (
             MongoDBAccess mongodb = new MongoDBAccess()
        ) {

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                                mongodb, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Print stats on each web page's detected language being MRI or not,
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}