source: other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java@ 33800

Last change on this file since 33800 was 33800, checked in by ak19, 4 years ago

Removed an adult site from crawled contents and added its url to blacklist conf file (for if ever anyone crawls our MRI set of common crawl sites again)

File size: 13.8 KB
package org.greenstone.atea;

import java.io.*;
import java.lang.ArrayIndexOutOfBoundsException;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

//import org.bson.types.ObjectId;

import org.greenstone.atea.morphia.*;

/**
 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each containing a dump.txt.)
 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
 * which parses out the actual text body content of each webpage's section within a dump.txt.
 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in one language.
 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
 * paragraphs contained in a page when the majority/remainder happens to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text.
 *
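 * A minimal sketch of that two-sentences-at-a-time idea (hypothetical: not what this class
 * currently does, and the StringReader wrapping is an assumption about how to feed a String
 * into the detector's BufferedReader overload):
 *
 *    String[] sentences = maoriTxtDetector.getAllSentences(text);
 *    for(int i = 0; i < sentences.length - 1; i++) {
 *        String pair = sentences[i] + " " + sentences[i+1];
 *        // pass each consecutive pair of sentences through the detector
 *        if(maoriTxtDetector.isTextInMaori(new BufferedReader(new StringReader(pair)))) {
 *            // store this paragraph-sized chunk as likely Maori
 *        }
 *    }
 *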
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small
 *
 * or:
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToMongoDB ../crawled-small > ../crawled-small/bla.txt 2>&1
 *
 */
public class NutchTextDumpToMongoDB {
    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());

    static boolean DEBUG_MODE = true;

    /** Counter for number of sites.
     * Should be equal to number of times the NutchTextDumpToMongoDB constructor
     * is called: once per site.
     */
    static private int SITE_COUNTER = 0;
    static private long WEBPAGE_COUNTER = 0;

    private final MaoriTextDetector maoriTxtDetector;
    private final MongoDBAccess mongodbAccess;

    public final String siteID;
    public final boolean siteCrawlUnfinished;
    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */

    private int countOfWebPagesWithBodyText = 0;

    private String geoLocationCountryCode = null; /** 2 letter country code */
    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */

    private String domainOfSite;
    private int numPagesInMRI = 0;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;



    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if storing too many, as we store this info per sentence
     * and a long text document presumably becomes a very large MongoDB document. */
    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
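    // Hypothetical illustration of why larger values blow up document size: with
    // NUM_TOP_LANGUAGES = 3, each stored sentence might carry something like
    //   { sentence: "...", languages: [ {"mri": 0.82}, {"eng": 0.11}, {"tgl": 0.03} ] }
    // and a page with hundreds of sentences multiplies that up. (Field names and shape
    // here are assumptions; the real shape is defined by SentenceInfo/LanguageInfo.)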


    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol)
        // or the very start of the file with a URL (protocol)
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            logger.debug("__________________________________________");
            logger.debug("@@@ Found page entry: ");
            logger.debug("__________________________________________");
            logger.debug(pageDump.toString());
            logger.debug("------------------------------------------");
            // END DEBUG
        }
    }

    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
            MaoriTextDetector maoriTxtDetector, String siteID,
            File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
        throws IOException
    {
        // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
        SITE_COUNTER++;

        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.siteCrawlUnfinished = siteCrawlUnfinished;
        this.siteCrawledTimestamp = lastModified;

        this.maoriTxtDetector = maoriTxtDetector;
        this.mongodbAccess = mongodbAccess;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
            BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {

            boolean readingText = false;
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // iff outside of a page's body text, an empty line marks the end of a page
                // in nutch's text dump of a site.
                // Note, though, that there can be an empty line (or more?) between the
                // start and end markers of a page's text.
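
                // Illustrative (hypothetical) shape of one page record in dump.txt, inferred
                // from the parsing logic below (exact marker strings are handled by TextDumpPage):
                //   https://site.example/some/page.html
                //   <metadata field lines>
                //   <text start marker>
                //   <body text, possibly containing empty lines>
                //   <text end marker>
                //   <empty line, then the next record's URL line>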

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;

                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                else if(pageDump != null && !line.equals("")) {
                    // (null check guards against a malformed dump.txt that doesn't open with a URL line)
                    pageDump.append(line);
                    pageDump.append("\n");

                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process final webpage record:
            //debugPageDump(pageDump);

            if(pageDump == null) {
                logger.warn("siteID " + siteID + " had an empty dump.txt file. Reinspect site.");
            } else {
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;

                // for every site, we just need to work out if any of its pages
                // contains /mi(/) in its URL
                String url = page.getPageURL();
                if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
                    this.urlContainsLangCodeInPath = true;
                }
            }

        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get and store domain of site.
        // Passing true to get domain with protocol prefix
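        // For illustration (hypothetical URL): with the boolean set to true, a first-page
        // URL like https://www.example.nz/mi/index.html would presumably yield the
        // domain https://www.example.nz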
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = Utility.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }


        prepareSiteStats(mongodbAccess);
    }


    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            String text = page.getPageText();

            if(text.equals("")) {
                // don't care about empty pages
                continue;
            }
            else {
                WEBPAGE_COUNTER++; // count of cumulative total of webpages for all sites
                countOfWebPagesWithBodyText++; // of this site alone

                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                if(isMRI) {
                    numPagesInMRI++;
                }

                String[] sentences = maoriTxtDetector.getAllSentences(text);
                int totalSentences = sentences.length;
                int numSentencesInMRI = 0;
                ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
                ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);
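                // getAllOverlappingSentencesInfo presumably scores consecutive sentence pairs,
                // in the spirit of the class comment above about running the detector over
                // two "lines" at a time.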

                WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
                        this.siteID/*SITE_COUNTER*/,
                        isMRI,
                        totalSentences,
                        singleSentences,
                        overlappingSentences);


                for(SentenceInfo si : singleSentences) {
                    //LanguageInfo bestLanguage = si.languagesInfo[0];
                    //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                    if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
                        numSentencesInMRI++;
                    }
                }


                webpage.setMRISentenceCount(numSentencesInMRI);
                webpage.setContainsMRI((numSentencesInMRI > 0));

                //mongodbAccess.insertWebpageInfo(webpage);
                // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
                mongodbAccess.datastore.save(webpage);
            }
        }
    }


    public void websiteDataToDB() {


        // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds
        // (if reinstated, this would also need imports for java.time.Instant, java.time.ZoneId
        // and java.time.format.DateTimeFormatter)
        // LocalDateTime date =
        //     LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault());
        // String crawlTimestamp =
        //     date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss"));

        boolean redoCrawl = false;

        if(this.siteCrawlUnfinished) {
            // arbitrary decision, but we need some indication that the MRI content was not just a one-off in the website
            if(this.numPagesInMRI > 2) {
                redoCrawl = true;
            }
        }

        File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
        try {
            if(this.domainOfSite.equals("UNKNOWN")) { // for sites that had 0 webpages downloaded, we have no domain
                this.geoLocationCountryCode = "UNKNOWN";
            } else {
                this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
            }
        } catch(Exception e) {
            logger.error("*** For SiteID " + siteID + ", got exception: " + e.getMessage(), e);
            this.geoLocationCountryCode = "UNKNOWN"; // couldn't get the country code, so it should also be UNKNOWN, not null
        }

        int totalPages = pages.size();

        WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
                totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
                this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
                this.geoLocationCountryCode, this.urlContainsLangCodeInPath);

        //mongodbAccess.insertWebsiteInfo(website);
        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
        mongodbAccess.datastore.save(website);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tNutchTextDumpToMongoDB <path to 'crawled' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            logger.error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpToMongoDB.DEBUG_MODE = false;


        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {

            mongodb.connectToDB();
            //mongodb.showCollections();

            // print out the column headers for the websites csv file
            // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
            // OPTIONAL TODO: creating collections can be done here if dropping and recreating

            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();

            // sort site folders in alphabetical order
            // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
            Arrays.sort(sites);

            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        logger.error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }

                    else {
                        File UNFINISHED_FILE = new File(siteDir, "UNFINISHED");

                        String siteID = siteDir.getName();
                        if(siteID.contains("_")) {
                            logger.warn("*** Skipping site " + siteID + " as its dir name indicates it wasn't crawled properly.");
                            continue;
                        }

                        long lastModified = siteDir.lastModified();
                        logger.debug("@@@ Processing siteID: " + siteID);
                        NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
                                mongodb, mriTxtDetector,
                                siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not
                        // and how many pages there were in the site in total.

                        //nutchTxtDump.printSiteStats();

                        nutchTxtDump.websiteDataToDB();
                    }
                }

            }

        } catch(Exception e) {
            // can get an exception when instantiating a NutchTextDumpToMongoDB instance
            // or when working with the CSV file
            logger.error(e.getMessage(), e);
        }
    }
}