source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33582

Last change on this file since 33582 was 33582, checked in by ak19, 5 years ago

NutchTextDumpProcessor prints each crawled site's stats: the number of webpages per crawled site and how many of those were detected by OpenNLP as being in Maori (mri). This required making a reusable method in CCWETProcessor public and static.

File size: 36.9 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 * keep or discard or greyListed folder, and its url written into a keep, discard
23 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted, else blacklisted
26 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just the word count and the MAXIMUM number of characters
29 * (not minimum) allowed for a string to count as a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes; instead their
35 * format is:
36 * - precede a URL with ^ to filter urls that start with the given prefix
37 * - follow a URL with $ to filter urls that end with the given suffix
38 * - ^url$ will filter urls that match the given url exactly
39 * - without either ^ or $, urls containing the given url will be filtered
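 * For example (illustrative entries only, not taken from the actual conf files):
 *   ^https://example.com/shop           (filter urls starting with this prefix)
 *   .pdf$                               (filter urls ending with .pdf)
 *   ^https://example.com/index.html$    (filter exactly this url)
 *   jasmin                              (filter any url containing this string)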
40 *
41 * WETProcessor.java's current implementation gives explicit whitelisting precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. It then uses a single WETProcessor instance to process
49 * each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, pass the conf/ folder (containing log4j and other properties files) on the classpath:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
101 private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
102 private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
103 private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
104
105 /**
106 * Store url patterns as keys, with values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 error("Exception attempting to read properties from config.properties.", e);
153 }
154
155 if(configProperties.size() == 0) {
156 warn("*** Warning: no values read into config properties. Using defaults.");
157 }
158
159 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
160 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
161 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
162
163
164 this.discardFolder = new File(outFolder, "discard");
165 if(!discardFolder.exists()) {
166 discardFolder.mkdir();
167 }
168 this.keepFolder = new File(outFolder, "keep");
169 if(!keepFolder.exists()) {
170 keepFolder.mkdir();
171 }
172
173 this.greyListedFolder = new File(outFolder, "greylisted");
174 if(!greyListedFolder.exists()) {
175 greyListedFolder.mkdir();
176 }
177
178 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
179 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
180 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
181 }
182 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
183 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
184 throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
185 }
186 this.greyListedFile = new File(outFolder, "greyListed.txt");
187 if(greyListedFile.exists() && !greyListedFile.delete()) {
188 throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
189 }
190
191 // prepare our blacklist, greylist (for inspection) and whitelist
192 info("Loading blacklist.");
193 blackList = new HashMap<String, Integer>();
194 initURLFilterList(blackList, "url-blacklist-filter.txt");
195
196 info("Loading greylist.");
197 greyList = new HashMap<String, Integer>();
198 initURLFilterList(greyList, "url-greylist-filter.txt");
199
200 info("Loading whitelist.");
201 whiteList = new HashMap<String, Integer>();
202 initURLFilterList(whiteList, "url-whitelist-filter.txt");
203
204 // Create the map of topSites
205 info("Loading map of topsites with regex of allowable url patterns for each topsite.");
206 topSitesMap = new HashMap<String, String>();
207
208 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
209 // and put these in our topSitesMap
210 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
211 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
212 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
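 // Illustrative examples only (the real entries live in conf/sites-too-big-to-exhaustively-crawl.txt,
 // and the pairings below are assumptions, shown in the comma-delimited CSV format this parser expects):
 //   blogspot.com,SUBDOMAIN-COPY
 //   wikipedia.org,SINGLEPAGE
 //   docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE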
213 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
214 .withCommentMarker('#')
215 .withSkipHeaderRecord()
216 .withIgnoreSurroundingSpaces();
217
218 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
219 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
220 try (
221 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
222 ) {
223 for (CSVRecord csvRecord : parser) {
224 String topsite = csvRecord.get(0);
225 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
226 topSitesMap.put(topsite, allowed_url_pattern);
227
228 //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
229
230 }
231 } catch(Exception e) {
232 error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
233 }
234
235 //debug("Prematurely terminating for testing purposes.");
236 //System.exit(-1);
237 }
238
239 /** Work out the 'domain' for a given url.
240 * This retains any www. or subdomain prefix.
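 * e.g. (illustrative) getDomainForURL("https://mi.wikipedia.org/wiki/Page", true)
 * returns "https://mi.wikipedia.org", while withProtocol=false returns just "mi.wikipedia.org".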
241 */
242 public static String getDomainForURL(String url, boolean withProtocol) {
243 int startIndex = url.indexOf("//"); // for http:// or https:// prefix
244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
245 // and keep the protocol portion around in case param withProtocol=true
246 String protocol = (startIndex == 0) ? "" : url.substring(0, startIndex);
247
248 String domain = url.substring(startIndex);
249 int endIndex = domain.indexOf("/");
250 if(endIndex == -1) endIndex = domain.length();
251 domain = domain.substring(0, endIndex);
252
253 if(withProtocol) {
254 // now that we have the domain (everything to the first / when there is no protocol)
255 // can glue the protocol back on
256 domain = protocol + domain;
257 }
258
259 return domain;
260 }
261
262 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
263 private String escapeStringForRegex(String str) {
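	// e.g. (illustrative) "nutch.apache.org" becomes "nutch\.apache\.org",
	// so it can be embedded as a literal inside a regex-urlfilter.txt pattern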
264 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
265 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
266 str = str.replace(Character.toString(c), "\\"+c);
267 }
268 return str;
269 }
270
271 /**
272 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
273 * as output the URL seed list and regex-urlfilter text files required by nutch, see
274 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
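 * For example (illustrative, for a domain that does not match a topsite), a keepURLs.txt entry
 * of http://nutch.apache.org/about.html results in that same url being written to seedURLs.txt
 * and the filter line +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ being written to regex-urlfilter.txt.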
275 */
276 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
277 File domainURLsFile, File topSiteMatchesFile) {
278 // Maintain a Map of unique domains mapped to seed urls at that domain
279 // TreeSet: by default, "the elements are ordered using their natural ordering"
280 // (or by a Comparator provided at set creation time).
281 // Whereas HashSet doesn't guarantee ordering.
282 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
283 // Would be a similar distinction for Maps.
284 domainsToURLsMap = new TreeMap<String, Set<String>>();
285
286 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
287 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
288
289 try (
290 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
291 ) {
292
293 // read a URL at a time from urlsFile
294 String url = null;
295 String domainWithProtocol = null;
296 while((url = reader.readLine()) != null) { // readLine removes newline separator
297
298 // work out domain. This retains any www. or subdomain prefix
299 // passing true to further also retain the http(s) protocol
300 domainWithProtocol = getDomainForURL(url, true);
301
302 Set<String> urlsSet;
303 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
304 urlsSet = new TreeSet<String>();
305 urlsSet.add(url);
306 domainsToURLsMap.put(domainWithProtocol, urlsSet);
307 } else {
308 urlsSet = domainsToURLsMap.get(domainWithProtocol);
309 urlsSet.add(url);
310 }
311
312 }
313 } catch (IOException ioe) {
314 error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
315 }
316
317 // We'd have pruned out duplicates by now and have a sorted list of domains,
318 // each of which maps to seed URLs in the commoncrawl for that domain
319
320 int domainCount = 0;
321 File sitesFolder = new File(outputFolder, "sites");
322 if(!sitesFolder.exists()) {
323 sitesFolder.mkdir();
324 }
325 final String FORMATSTR = "%05d";
326
327 // write out each domain followed in sequence by all urls we found in that domain
328 // (urls with tab up front)
329 try (
330 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
331 // Also a global file listing any urls that matched top sites that didn't specify
332 // allowed regex patterns
333 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
334 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
335 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
336 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
337 ) {
338
339 // initialise topSiteMatchesFile with some instructional text.
340 topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n");
341 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
342 topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
343
344 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
345 Set<String> domainsSet = domainsToURLsMap.keySet();
346 Iterator<String> domainIterator = domainsSet.iterator();
347
348 /*
349 // DEBUG
350 String value = topSitesMap.get("wikipedia.org");
351 if(value == null) {
352 debug("### wikipedia.org had null value");
353 } else {
354 debug("### wikipedia.org had value: " + value);
355 } // DEBUG
356 */
357
358 while(domainIterator.hasNext()) {
359 String domainWithProtocol = domainIterator.next();
360 // Also get domain without protocol prefix
361 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
362 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
363 String domain = domainWithProtocol.substring(startIndex);
364
365 /*if(domain.contains("docs.google.com")) {
366 debug("domain with protocol: " + domainWithProtocol);
367 debug("domain: " + domain);
368 }*/
369
370 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
371 // If the domain is of a topsite for which no allowed URL pattern has been provided
372 // in sites-too-big-to-exhaustively-crawl.txt,
373 // then we don't know how to crawl the site. Warn the user by writing the affected
374 // domain and seedURLs to the topSiteMatchesFile.
375 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
376
377 // topsite, but we don't (yet) know what portion can be crawled
378 // Append the top site and url to a global/toplevel file that
379 // the user needs to check later and we're done with this domain as it
380 // won't go into any other file hereafter
381
382 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
383 for(String url : urlsForDomainSet) {
384 topSiteMatchesWriter.write("\t" + url + "\n");
385 }
386
387 continue; // done with this domain
388 }
389
390 // start counting the domains we're actually going to process
391 domainCount++;
392
393 String siteID = String.format(FORMATSTR, domainCount);
394 File domainFolder = new File(sitesFolder, siteID);
395 domainFolder.mkdir();
396
397 // write out the domain
398 //seedURLsWriter.write(domainWithProtocol + "\n");
399
400
401 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
402 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
403 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
404 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
405 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
406 try (
407 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
408 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
409 ) {
410
411 // write all sorted unique domains into global domains file
412 // Using the domain without protocol since the global domains file is for
413 // informational purposes
414 domainURLsWriter.write(domain + "\n");
415
416 // Only write urls and no domain into single global seedurls file
417 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
418 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
419 // If we ever run nutch on a single seedURLs listing containing
420 // all seed pages to crawl sites from, the above two files will work for that.
421
422 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
423 // also write into the global seeds file (with a tab prefixed to each?)
424 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
425 for(String url : urlsForDomainSet) {
426 seedURLsWriter.write(url + "\n"); // global seedURLs file
427 siteURLsWriter.write(url + "\n");
428 }
429
430
431 if(allowedURLPatternRegex == null) { // entire site can be crawled
432 siteURLsWriter.write(domainWithProtocol + "\n");
433
434 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
435 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
436 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
437 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
438 urlFilterWriter.write(regexed_domain + "\n"); //global file
439 siteRegexWriter.write(regexed_domain + "\n"); // site file
440 }
441 else { // domain belongs to a top site where only portion of site can be crawled
442
443 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
444 siteURLsWriter.write(domainWithProtocol + "\n");
445 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
446 // and not for all of blogspot.com
447
448 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
449 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
450 urlFilterWriter.write(regexed_domain + "\n");
451 siteRegexWriter.write(regexed_domain + "\n");
452
453 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
454 // don't write out domain. We want individual pages
455 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
456
457 // don't write out domain as a regex expression url filter either,
458 // write out the individual seed urls for the domain instead
459 // since we will only be downloading the single page
460
461 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
462 for(String urlInDomain : urlsForDomainSet) {
463 // don't append slash to end this time
464 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
465 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
466 urlFilterWriter.write(regexed_url + "\n");
467 siteRegexWriter.write(regexed_url + "\n");
468 }
469 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
470
471 // DON'T write out domain into siteURLs file,
472 // BUT DO write it into urlFilter file
473 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
474
475 urlFilterWriter.write(regexed_domain + "\n");
476 siteRegexWriter.write(regexed_domain + "\n");
477 } else { // allowedURLPatternRegex is a url-form - convert to regex
478 if(!allowedURLPatternRegex.endsWith("/")) {
479 allowedURLPatternRegex += "/";
480 }
481 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
482 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
483 siteURLsWriter.write(domainWithProtocol + "\n");
484 urlFilterWriter.write(regexed_pattern + "\n");
485 siteRegexWriter.write(regexed_pattern + "\n");
486
487 }
488 }
489
490 } catch (IOException ioe) {
491 error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
492 }
493
494 }
495
496 } catch (IOException ioe) {
497 error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
498 + "\n\t" + urlFilterFile
499 + "\n\t" + domainURLsFile
500 + "\n\t" + topSiteMatchesFile, ioe);
501 }
502
503 /*
504 // BEGIN DEBUG
505 debug("@@@@ TopSitesMap contains: ");
506 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
507 String topSite = entry.getKey();
508 String urlPattern = entry.getValue();
509 debug(topSite + " - " + urlPattern);
510 } // END DEBUG
511 */
512 }
513
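 /** Strips the first (sub)domain component off a domain string.
 * e.g. (illustrative) "mi.wikipedia.org" becomes "wikipedia.org", and "wikipedia.org" becomes "org".
 */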
514 private String stripSubDomain(String url) {
515 int index = url.indexOf(".");
516 if(index != -1) {
517 url = url.substring(index+1);
518 }
519 return url;
520 }
521
522
523 /**
524 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
525 * with or without www. prefix. This method tests for such a case, as it would be dangerous
526 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
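 * e.g. (illustrative) isExactDomainMatch("www.blogspot.com", "blogspot.com") returns true,
 * whereas isExactDomainMatch("pinky.blogspot.com", "blogspot.com") returns false.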
527 */
528 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
529 // check for an exact match as-is
530 if(seedURLDomain.equals(domain)) {
531 return true;
532 }
533
534 // else check if with or without a www. prefix we have an exact match with domain
535 if(seedURLDomain.startsWith("www.")) {
536 if(seedURLDomain.substring(4).equals(domain)) {
537 return true;
538 }
539 } else {
540 if(domain.equals("www."+seedURLDomain)) {
541 return true;
542 }
543 }
544
545 return false;
546 }
547
548
549 /**
550 * Check if the domain of the seedurl, either in its entirety or when stripped of
551 * www/subdomains, is in the list of top sites.
552 * If it is, and the given url matches the regex for that topsite, then add the url to the
553 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
554 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
555 * @return one of the following values:
556 * - This function returns null if the seedURL's domain does not match any of the topsites.
557 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
558 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
559 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
560 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
561 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
562 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE",
563 * "FOLLOW-LINKS-WITHIN-TOPSITE" or an allowed URL pattern.)
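 * e.g. (illustrative, assuming blogspot.com is listed in the topsites file with value SUBDOMAIN-COPY)
 * a fullSeedDomain of pinky.blogspot.com returns SUBDOMAIN-COPY, while www.blogspot.com returns ""
 * because it exactly matches the topsite itself.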
564 */
565 private String isURLinTopSitesMap(String fullSeedDomain) {
566 boolean keepLooping = true;
567
568 String domain = fullSeedDomain;
569
570 // The domain parameter will have retained www or subdomains, but is stripped of the protocol.
571
572 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
573 // if it does, return the value for that topsite domain in the topSitesMap
574 // If no match at all, return null.
575 do {
576
577 String allowed_url_pattern = topSitesMap.get(domain);
578 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
579 // there's an entry for the URL in the topSitesMap
580 debug("##### A top site matches URL domain " + domain);
581
582 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
583 // www prefix, should not exactly match the topSitesMap domain
584 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
585 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
586
587 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
588 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
589 }
590 return allowed_url_pattern;
591 }
592 // else, no entry for the URL in the topSitesMap
593 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
594
595 String newDomain = stripSubDomain(domain);
596 if(domain.equals(newDomain)) {
597 keepLooping = false;
598 } else {
599 domain = newDomain;
600 }
601 } while(keepLooping);
602
603 // url in entirety or stripped of subdomains did not match any of the topsites
604 return null;
605 }
606
607 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
608 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
609 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
610 //while(i.hasNext()) {
611 // Map.Entry<String, Integer> entry = i.next();
612 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
613 String urlPattern = entry.getKey();
614 Integer matchRule = entry.getValue();
615
616 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
617 return true;
618 }
619 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
620 return true;
621 }
622 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
623 return true;
624 }
625 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
626 return true;
627 }
628 // else check the rest of the filter list against this url
629 // before returning false to be certain it's not been listed in the filter list
630 }
631
632 return false;
633 }
634
635 /**
636 * Returns true if the url or pattern is found in the blacklist file.
637 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
638 * it won't get blacklisted after all. But that's not implemented here.
639 */
640 public boolean isBlacklisted(String url) {
641 boolean isBlackListed = isListedInFilterList(blackList, url);
642
643 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
644 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
645 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
646 if(!isBlackListed && url.contains("jasmin")) {
647 warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
648 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
649 }
650 return isBlackListed;
651 }
652
653 /**
654 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
655 * If the same URL pattern is also mentioned in the blacklist, the greylist entry eventually takes precedence.
656 * If it is also mentioned in the whitelist, the whitelist entry eventually takes precedence instead.
657 */
658 public boolean isGreylisted(String url) {
659 // auto-translated product sites
660 return isListedInFilterList(greyList, url);
661 }
662
663 /**
664 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
665 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
666 */
667 public boolean isWhitelisted(String url) {
668 return isListedInFilterList(whiteList, url);
669 }
670
671 /**
672 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
673 * whether it is in the mentioned black|grey|white list.
674 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
675 * By not having this method deal with actual regex for filters, this has the advantage that
676 * we don't have to remember to escape or double escape each filter to turn it into a regex.
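 * e.g. (illustrative) a filter line of ^https://example.org is stored as a LIST_ENTRY_STARTSWITH
 * rule, .pdf$ as LIST_ENTRY_ENDSWITH, ^https://example.org/index.html$ as LIST_ENTRY_MATCHES,
 * and a bare string such as jasmin as LIST_ENTRY_CONTAINS.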
677 */
678 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
679
680 // if filterListFilename does not exist in the conf folder, just return
681 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
682 warn("Filter list filename: " + filterListFilename + " does not exist");
683 return;
684 }
685
686 try (
687 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
688 ) {
689 String filter = null;
690 while((filter = reader.readLine()) != null) {
691 // skip comments and empty lines
692 filter = filter.trim();
693 if(filter.equals("") || filter.startsWith("#")) {
694 continue;
695 }
696
697 if(filter.startsWith("^") && filter.endsWith("$")) {
698 filter = filter.substring(1, filter.length()-1);
699 list.put(filter, LIST_ENTRY_MATCHES);
700 }
701 else if(filter.startsWith("^")) {
702 filter = filter.substring(1);
703 list.put(filter, LIST_ENTRY_STARTSWITH);
704 //debug("Match filter startswith: " + filter);
705 }
706 else if(filter.endsWith("$")) {
707 filter = filter.substring(0, filter.length()-1);
708 list.put(filter, LIST_ENTRY_ENDSWITH);
709 //debug("@@@ Match filter endswith: " + filter);
710 }
711 else {
712 list.put(filter, LIST_ENTRY_CONTAINS);
713 }
714 //debug("Got filter: " + filter);
715 }
716
717 } catch (IOException ioe) {
718 error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
719 }
720
721 }
722
723 /** Maintain a count of all WET files processed. */
724 public void setWETFileCount(int count) { this.wetFileCount = count; }
725
726 /** Maintain a count of all WET records processed. */
727 //public int getRecordCount() { return this.totalRecordCount; }
728 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
729 public void setRecordCount(int count) { this.totalRecordCount = count; }
730
731 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
732
733 // Will list all the warc.wet files in the input directory or else their gzipped versions
734 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
735
736 int wetRecordCount = 0;
737 int wetFileCount = 0;
738
739 for(int i = 0; i < WETFiles.length; i++) {
740 File WETFile = WETFiles[i];
741 debug("Processing WETfile: " + WETFile);
742
743 // Any .gz files listed means they haven't been unzipped yet. So unzip.
744 String WETFilename = WETFile.toString();
745 if(WETFilename.endsWith(".gz")) {
746 File GZippedWETFile = WETFile;
747 String WETGZippedFilename = WETFilename;
748 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
749
750 WETFile = new File(WETFilename);
751 Utility.unzipFile(GZippedWETFile, WETFile);
752 }
753 // hereafter all WETFiles should refer to the unzipped version
754 // Check the unzipped WETFile exists
755
756 if(!WETFile.exists() || !WETFile.isFile()) {
757 error("Error: " + WETFile + " does not exist (failure to unzip?)");
758 return;
759 }
760
761 // Finally, we can process this WETFile's records into the keep and discard pile
762 wetFileCount++;
763 debug("Off to process " + WETFile);
764 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
765 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
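	// e.g. (illustrative) a folder named CC-MAIN-2019-26-wet-files produces crawlID "2019-26"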
766 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
767 wetFileProcessor.processWETFile();
768 wetRecordCount += wetFileProcessor.getRecordCount();
769 }
770
771 // for information purposes
772 this.setWETFileCount(wetFileCount);
773 this.setRecordCount(wetRecordCount);
774 }
775
776
777 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
778 public static void info(String msg) {
779 System.err.println(msg);
780 logger.info(msg);
781 }
782 public static void debug(String msg) {
783 System.err.println(msg);
784 logger.debug(msg);
785 }
786 public static void warn(String msg) {
787 System.err.println(msg);
788 logger.warn(msg);
789 }
790 public static void error(String msg) {
791 System.err.println(msg);
792 logger.error(msg);
793 }
794 public static void error(String msg, Exception e) {
795 logger.error(msg, e);
796 System.err.println("\n"+msg);
797 e.printStackTrace();
798 }
799
800 public static void printUsage() {
801 info("Run this program as:");
802 info("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
803 }
804
805 /** Filename filter to only list warc.wet files or else warc.wet.gz files
806 * for which unzipped warc.wet equivalents don't yet exist.
807 */
808 private static class WETFilenameFilter implements FilenameFilter {
809
810 public boolean accept(File dir, String name) {
811 if(name.endsWith(".warc.wet")) {
812 debug("Will include " + name + " for processing.");
813 return true;
814 }
815
816 if(name.endsWith(".warc.wet.gz")) {
817 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
818 File unzippedVersion = new File(dir, nameWithoutGZext);
819 if(unzippedVersion.exists()) {
820 debug("--- Unzipped version " + unzippedVersion + " exists.");
821 debug("Skipping " + name);
822 return false; // don't count gzipped version if unzipped version exists.
823 }
824 else {
825 debug("Only zipped version " + name + " exists.");
826 return true; // No unzipped version, so have to work with gzipped version
827 }
828 }
829
830 // we're not even interested in any other file extensions
831 debug("Not a WET file. Skipping " + name);
832 return false;
833 }
834 }
835
836
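 /** Filename filter that accepts only the per-commoncrawl subfolders of the input directory,
 * i.e. directories named like CC-MAIN-YYYY-##-wet-files (e.g. CC-MAIN-2019-26-wet-files).
 */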
837 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
838
839 public boolean accept(File dir, String name) {
840 File f = new File (dir, name);
841 if(f.isDirectory()) {
842 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
843 return true;
844 }
845 }
846 else {
847 info("File " + f + " is not a directory");
848 }
849 return false;
850 }
851 }
852
853 public static void main(String[] args) {
854 if(args.length != 2) {
855 printUsage();
856 return;
857 }
858
859 File commoncrawlDir = new File(args[0]);
860 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
861 error("Error: " + args[0] + " does not exist or is not a directory");
862 return;
863 }
864
865 File outFolder = new File(args[1]);
866 if(!outFolder.exists() || !outFolder.isDirectory()) {
867 error("Error: " + args[1] + " does not exist or is not a directory.");
868 return;
869 }
870
871 try {
872 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
873
874 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
875
876 for(int i = 0; i < ccrawlFolders.length; i++) {
877 File ccrawlFolder = ccrawlFolders[i];
878 info("About to process commoncrawl WET files folder: " + ccrawlFolder);
879 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
880 }
881
882 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
883 // The domains file is the only purely global one; seedURLs and regex-urlfilters are
884 // also repeated on a per site/domain basis, stored in the sites folder
885 File seedURLsFile = new File(outFolder, "seedURLs.txt");
886 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
887 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
888 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
889
890 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
891
892 info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
893
894 info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
895
896
897 } catch(Exception e) {
898 // can get an exception when instantiating CCWETProcessor instance
899 error(e.getMessage(), e);
900 }
901
902 return;
903
904 }
905}