source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java @ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess; MongoDBAccess now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, even though authentication should be working, since I'm following the online examples for using the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error, so nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but if someone one day wants to run this against a mongodb where authentication is set up (the way TSG set it up for the mongodb they gave me access to), it still wouldn't work.
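For reference, a minimal sketch of an authenticated connection using the Credential-object pattern with the MongoDB Java driver's legacy API (3.6+). The host, port, user, password and database name below are placeholders, and this is not necessarily how MongoDBAccess does it. One common cause of an Unauthorized error on listCollections() is creating the credential against the wrong authentication database (the database the user was defined in, often "admin"), or the user lacking a role on the target database.

import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoDatabase;

public class MongoAuthSketch {
    public static void main(String[] args) {
        // The second argument is the authentication database: the database the
        // user was created in (often "admin"), not necessarily the one queried.
        MongoCredential credential = MongoCredential.createCredential(
            "crawluser", "admin", "change-me".toCharArray());

        MongoClient client = new MongoClient(
            new ServerAddress("localhost", 27017),
            credential,
            MongoClientOptions.builder().build());
        try {
            MongoDatabase database = client.getDatabase("crawl_data");
            // If the user lacks privileges on crawl_data, this is typically
            // where the Unauthorized error surfaces.
            for (String collectionName : database.listCollectionNames()) {
                System.out.println(collectionName);
            }
        } finally {
            client.close();
        }
    }
}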

File size: 13.1 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by Nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori-language
 * paragraphs contained in a page when the majority/the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text, as sketched below.
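 *
 * A rough sketch of that alternative (hypothetical, not implemented in this class;
 * it reuses MaoriTextDetector.isTextInMaori(BufferedReader) and assumes the body
 * text has already been split into "lines"):
 * <pre>
 *   String[] lines = text.split("\n");
 *   for (int i = 0; i + 1 < lines.length; i++) { // sliding window of 2 "lines"
 *       String pair = lines[i] + "\n" + lines[i + 1];
 *       if (maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
 *           // found a promising Maori paragraph candidate to store in the DB
 *       }
 *   }
 * </pre>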
 *
 * TO COMPILE OR RUN, FIRST DO:
 *     cd maori-lang-detection/apache-opennlp-1.9.1
 *     export OPENNLP_HOME=`pwd`
 *     cd maori-lang-detection/src
 *
 * TO COMPILE:
 *     maori-lang-detection/src$
 *         javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *     maori-lang-detection/src$
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *         java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to the number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; // when the crawl of the site terminated

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; // 2-letter country code
    private boolean urlContainsLangCodeInPath = false; // whether any URL on this site contains /mi(/) in its path

    private String domainOfSite;
    private int numPagesInMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

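    // Illustrative (hypothetical) shape of a dump.txt record, since the real field
    // layout is handled by TextDumpPage: a record begins at the start of the file or
    // after a blank line, with a line starting with the page's URL,
    // e.g. "https://example.org/mi/page.html", followed by that page's metadata
    // fields and body text.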
    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or the very start of the file with a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }


    /** A NutchTextDumpToMongoDB instance processes the dump.txt for one site. */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
                                  MaoriTextDetector maoriTxtDetector, String siteID,
                                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;
            while((line = reader.readLine()) != null) { // readLine removes the newline separator
                line = line.trim();
                // Only outside of a page's body text does an empty line mark the end of a page
                // in nutch's text dump of a site. Note that there can be an empty line (or more)
                // between the start and end markers of a page's text.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish the old pageDump and begin a new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);

                        // record whether this page's URL contains /mi(/), so that every
                        // page is checked, not just the final one (see the matching check
                        // after this loop)
                        String url = page.getPageURL();
                        if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                            this.urlContainsLangCodeInPath = true;
                        }
                        pageDump = null;
                    }

                    // begin a new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) { // guard against content before the first URL line
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers

                prevLine = line;
            }

            // process the final webpage record:
            //debugPageDump(pageDump);

            if(pageDump != null) { // pageDump is null if the dump file contained no page records
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;

                // for every site, we just need to work out if any of its pages
                // contains /mi(/) in its URL (earlier pages were checked in the loop above)
                String url = page.getPageURL();
                if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                    this.urlContainsLangCodeInPath = true;
                }
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats(mongodbAccess);
    }

    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // cumulative count of webpages across all sites
                countOfWebPagesWithBodyText++; // count for this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences);
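                // ("overlapping" presumably means consecutive-sentence pairs, in the
                // spirit of the two-"lines"-at-a-time idea in the class comment above;
                // see MaoriTextDetector for the exact definition)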

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER,
                                                                          SITE_COUNTER,
                                                                          isMRI,
                                                                          totalSentences,
                                                                          singleSentences,
                                                                          overlappingSentences);

                mongodbAccess.insertWebpageInfo(webpage);
            }
        }
    }

    /*
    public void printSiteStats() {

        logger.info("------------- " + this.siteID + " SITE STATS -----------");

        logger.info("SITE DOMAIN: " + this.domainOfSite);
        logger.info("Total number of web pages in site: " + pages.size());
        logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());

        if(pagesInMRI.size() > 0) {
            logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
            for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
                logger.info(mriWebPageInfo.toString());
            }
        }

        logger.info(" ----------- ");
        if(pagesContainingMRI.size() > 0) {
            logger.info("The following pages weren't detected as primarily being in Māori");
            logger.info("But still contained sentences detected as Māori");
            for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
                logger.info(mriWebPageInfo.toString());
            }

        } else {
            logger.info("No further pages detected as containing any sentences in MRI");
        }
        logger.info(" ----------- ");
    }
    */

    public void websiteDataToDB() {

        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // LocalDateTime date =
        //    LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //    date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));
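        // (If re-enabled, the conversion above would also need imports for java.time.Instant,
        // java.time.ZoneId and java.time.format.DateTimeFormatter; only java.time.LocalDateTime
        // is imported at present.)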

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content
            // was not a near one-off occurrence on the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        //File geoLiteCityDatFile = new File(this.getClass().getResource("GeoLiteCity.dat").getFile());
        //this.geoLocationCountryCode = getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(SITE_COUNTER, this.siteID, this.domainOfSite,
                                              totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
                                              this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                                              this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        mongodbAccess.insertWebsiteInfo(website);
    }

    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN --------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;

        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {
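            // (Note: using MongoDBAccess in try-with-resources requires it to
            // implement java.lang.AutoCloseable.)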

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " does not exist");
                        continue;
                    }
                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        long lastModified = siteDir.lastModified();
                        logger.debug("Found siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                            mongodb, mriTxtDetector,
                            siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it has parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}