source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33615

Last change on this file since 33615 was 33615, checked in by ak19, 4 years ago
  1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some Mongodb links under MoreReading.
File size: 38.9 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep or discard or greyListed folder, and its url written into a keep, discard
 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted else blacklisted
26 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just word count and number of MAX characters
29 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - precede URL by ^ to blacklist urls that match the given prefix
37 * - succeed URL by $ to blacklist urls that match the given suffix
38 * - ^url$ will blacklist urls that match the given url completely
39 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
40 *
41 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
42 * over greylisting and which takes precedence over blacklisting in turn. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. Then use a single instance of the WETProcessor class to process
49 * each single unzipped warc.wet file.
50 *
51 * To compile, including the jars in lib/ for compiling.
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, passing the log4j and other properties files in conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
101 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
102 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
103 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
104
105 /**
106 * Store url patterns as keys and values indicated whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
    /**
     * Sets up a processor over the given commoncrawl input folder and output folder:
     * loads the word-count thresholds from config.properties (falling back to defaults),
     * creates the keep/discard/greylisted output subfolders, removes any pre-existing
     * keep/discard/greylisted URL listing files, loads the black/grey/white URL filter
     * lists from the conf folder, and parses the topsites CSV into topSitesMap.
     *
     * @param inFolder folder containing the commoncrawl subfolders of warc.wet(.gz) files
     * @param outFolder folder into which all output (sites, listings) will be written
     * @throws Exception if a pre-existing output URL listing file cannot be deleted,
     *         since appending to stale listings would corrupt the results
     */
    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
	this.commoncrawlDir = inFolder;
	this.outputFolder = outFolder;

	// load up the properties from the config file
	// (found on the classpath, since conf/ is passed via -cp when running)
	try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
	    configProperties = new Properties();
	    configProperties.load(infile);
	    //infile.close(); // not explicitly called in examples of try-with-resources

	} catch(Exception e) {
	    logger.error("Exception attempting to read properties from config.properties.", e);
	}

	if(configProperties.size() == 0) {
	    logger.warn("*** Warning: no values read into config properties. Using defaults.");
	}

	// word-count thresholds controlling whether a WET record has enough content to keep;
	// the second argument of getProperty is the default used when the key is absent
	MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
	MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
	MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


	// create the three output subfolders that WET records get filed into
	this.discardFolder = new File(outFolder, "discard");
	if(!discardFolder.exists()) {
	    discardFolder.mkdir();
	}
	this.keepFolder = new File(outFolder, "keep");
	if(!keepFolder.exists()) {
	    keepFolder.mkdir();
	}

	this.greyListedFolder = new File(outFolder, "greylisted");
	if(!greyListedFolder.exists()) {
	    greyListedFolder.mkdir();
	}

	// the URL listing files must start empty: delete any left over from a previous run,
	// and abort if deletion fails rather than mixing old and new results
	this.keepURLsFile = new File(outFolder, "keepURLs.txt");
	if(keepURLsFile.exists() && !keepURLsFile.delete()) {
	    throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
	}
	this.discardURLsFile = new File(outFolder, "discardURLs.txt");
	if(discardURLsFile.exists() && !discardURLsFile.delete()) {
	    throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
	}
	this.greyListedFile = new File(outFolder, "greyListed.txt");
	if(greyListedFile.exists() && !greyListedFile.delete()) {
	    throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
	}

	// prepare our blacklist, greylist (for inspection) and whitelist
	logger.info("Loading blacklist.");
	blackList = new HashMap<String, Integer>();
	initURLFilterList(blackList, "url-blacklist-filter.txt");

	logger.info("Loading greylist.");
	greyList = new HashMap<String, Integer>();
	initURLFilterList(greyList, "url-greylist-filter.txt");

	logger.info("Loading whitelist.");
	whiteList = new HashMap<String, Integer>();
	initURLFilterList(whiteList, "url-whitelist-filter.txt");

	// Create the map of topSites
	logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
	topSitesMap = new HashMap<String, String>();

	// Read in our csv file of topsites and what to do when one hits a match with a seedURL
	// and put these in our topSitesMap
	// https://commons.apache.org/proper/commons-csv/apidocs/index.html
	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
	CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
	    .withCommentMarker('#')
	    .withSkipHeaderRecord()
	    .withIgnoreSurroundingSpaces();

	File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
	// CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
	try (
	     CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
	     ) {
	    for (CSVRecord csvRecord : parser) {
		// column 0 is the topsite domain; optional column 1 is the allowed url pattern
		// (empty string when absent, meaning "don't know how to crawl this topsite yet")
		String topsite = csvRecord.get(0);
		String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
		topSitesMap.put(topsite, allowed_url_pattern);

		//logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);

	    }
	} catch(Exception e) {
	    logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
	}

	//logger.debug("Prematurely terminating for testing purposes.");
	//System.exit(-1);
    }
238
239 /** Work out the 'domain' for a given url.
240 * This retains any www. or subdomain prefix.
241 */
242 public static String getDomainForURL(String url, boolean withProtocol) {
243 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
245 // the keep the URL around in case param withProtocol=true
246 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
247
248 String domain = url.substring(startIndex);
249 int endIndex = domain.indexOf("/");
250 if(endIndex == -1) endIndex = domain.length();
251 domain = domain.substring(0, endIndex);
252
253 if(withProtocol) {
254 // now that we have the domain (everything to the first / when there is no protocol)
255 // can glue the protocol back on
256 domain = protocol + domain;
257 }
258
259 return domain;
260 }
261
262 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
263 private String escapeStringForRegex(String str) {
264 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
265 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
266 str = str.replace(Character.toString(c), "\\"+c);
267 }
268 return str;
269 }
270
    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     *
     * For each domain that survives the topsites check, a numbered sites/0000x/ subfolder
     * of the output folder is also created, containing that domain's own individual
     * seedURLs.txt and regex-urlfilter.txt.
     *
     * @param seedURLsFile global seed URL listing for nutch (one url per line)
     * @param urlFilterFile global regex-urlfilter.txt content for nutch
     * @param domainURLsFile informational listing of all unique domains (without protocol)
     * @param topSiteMatchesFile listing of urls whose domain matched a topsite for which
     *        no allowed url-pattern regex was specified; these need manual attention
     * @param possibleProductSitesFile listing of possibly auto-translated (product) sites
     *        flagged for later manual inspection
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
				    File domainURLsFile, File topSiteMatchesFile,
				    File possibleProductSitesFile) {
	// Maintain a Map of unique domains mapped to seed urls at that domain
	// TreeSet: by default, "the elements are ordered using their natural ordering"
	// (or by a Comparator provided at set creation time).
	// Whereas HashSet doesn't guarantee ordering.
	// So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
	// Would be a similar distinction for Maps.
	domainsToURLsMap = new TreeMap<String, Set<String>>();

	// regex-urlfilter.txt line prefixes: "+^https?://" accepts either protocol,
	// and FILTER_REGEX_PREFIX additionally accepts any chain of subdomains
	final String PROTOCOL_REGEX_PREFIX = "+^https?://";
	final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

	// keep an eye out on URLs we need to inspect later
	// (geoLiteCityDatFile feeds the currently commented-out country-code check below)
	Set<String> possibleProductDomains = new TreeSet<String>();
	File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());

	// PHASE 1: read keepURLs.txt and group its urls by domain (with protocol)
	try (
	     BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
	     BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
	     ) {

	    // read a URL at a time from urlsFile
	    String url = null;
	    String domainWithProtocol = null;
	    while((url = reader.readLine()) != null) { // readLine removes newline separator

		// work out domain. This retains any www. or subdomain prefix
		// passing true to further also retain the http(s) protocol
		domainWithProtocol = getDomainForURL(url, true);

		// add the url to its domain's set, creating the set on first sight of the domain
		Set<String> urlsSet;
		if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
		    urlsSet = new TreeSet<String>();
		    urlsSet.add(url);
		    domainsToURLsMap.put(domainWithProtocol, urlsSet);
		} else {
		    urlsSet = domainsToURLsMap.get(domainWithProtocol);
		    urlsSet.add(url);
		}

		// Dr Nichols said that a url that was located outside the country and
		// which had /mi/ URLs was more likely to be an autotranslated (product) site.
		// Following Dr Nichols' idea, let's keep a look out for more product sites:
		// if any URL contains /mi AND the tld of its domain is outside of New Zealand
		// then add that domain (if not already added) and that url into a file
		// for later manual inspection
		if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
		    /*
		    if(!possibleProductDomains.contains(domainWithProtocol)) {

			String countryCode = "";
			try {
			    // more expensive test, so do this only if above conditions are true:
			    countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
			    System.err.println("@@@@ Got country code: " + countryCode);
			} catch(Exception exceptObj) {
			    countryCode = ""; // forces domain to be included for inspection

			    logger.error("Could not check if domain " + domainWithProtocol
					 + " was in country: " + countryCode,
					 exceptObj);
			}

			boolean isInNZ = countryCode.toLowerCase().equals("nz");


			//if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
			if(!isInNZ) {
			    possibleProductDomains.add(domainWithProtocol);
			    // write both domain and a sample URL on that site out to file
			    possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
			    possibleProductSitesWriter.write("\t" + url + "\n");
			}
		    }*/ /*else {
			// already wrote out domain to file at some point, write just the URL out to file
			possibleProductSitesWriter.write("\t" + url + "\n");
		    }*/
		}
	    }
	} catch (IOException ioe) {
	    logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
	}

	// We'd have pruned out duplicates by now and have a sorted list of domains,
	// each of which maps to seed URLs in the commoncrawl for that domain

	// PHASE 2: walk the sorted domains, writing the global files and a numbered
	// sites/0000x/ folder (with its own seedURLs.txt and regex-urlfilter.txt) per domain
	int domainCount = 0;
	File sitesFolder = new File(outputFolder, "sites");
	if(!sitesFolder.exists()) {
	    sitesFolder.mkdir();
	}
	final String FORMATSTR = "%05d"; // zero-padded site folder names: 00001, 00002, ...

	// write out each domain followed in sequence by all urls we found in that domain
	// (urls with tab up front)
	try (
	     // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
	     // Also a global file listing any urls that matched top sites that didn't specify
	     // allowed regex patterns
	     BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
	     BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
	     BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
	     BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
	     ) {

	    // initialise topSiteMatchesFile with some instructional text.
	    topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
	    topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
	    topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");

	    //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
	    Set<String> domainsSet = domainsToURLsMap.keySet();
	    Iterator<String> domainIterator = domainsSet.iterator();

	    /*
	    // DEBUG
	    String value = topSitesMap.get("wikipedia.org");
	    if(value == null) {
		logger.debug("### wikipedia.org had null value");
	    } else {
		logger.debug("### wikipedia.org had value: " + value);
	    } // DEBUG
	    */

	    while(domainIterator.hasNext()) {
		String domainWithProtocol = domainIterator.next();
		// Also get domain without protocol prefix
		int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
		startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
		String domain = domainWithProtocol.substring(startIndex);

		/*if(domain.contains("docs.google.com")) {
		    logger.debug("domain with protocol: " + domainWithProtocol);
		    logger.debug("domain: " + domain);
		}*/

		String allowedURLPatternRegex = isURLinTopSitesMap(domain);
		// If the domain is of a topsite for which no allowed URL pattern has been provided
		// in sites-too-big-to-exhaustively-crawl.txt,
		// then we don't know how to crawl the site. Warn the user by writing the affected
		// domain and seedURLs to the topSiteMatchesFile.
		if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {

		    // topsite, but we don't (yet) know what portion can be crawled
		    // Append the top site and url to a global/toplevel file that
		    // the user needs to check later and we're done with this domain as it
		    // won't go into any other file hereafter

		    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
		    for(String url : urlsForDomainSet) {
			topSiteMatchesWriter.write("\t" + url + "\n");
		    }

		    continue; // done with this domain
		}

		// start counting the domains we're actually going to process
		domainCount++;

		String siteID = String.format(FORMATSTR, domainCount);
		File domainFolder = new File(sitesFolder, siteID);
		domainFolder.mkdir();

		// write out the domain
		//seedURLsWriter.write(domainWithProtocol + "\n");


		// for every domain, we need a sites/0000x/ folder, where x is domain#, containing
		// its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
		// We still have a global seedURLs.txt and regex-urlfilter.txt too.
		File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
		File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
		try (
		     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
		     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
		     ) {

		    // write all sorted unique domains into global domains file
		    // Using the domain without protocol since the global domains file is for
		    // informational purposes
		    domainURLsWriter.write(domain + "\n");

		    // Only write urls and no domain into single global seedurls file
		    // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
		    // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
		    // If we ever run nutch on a single seedURLs listing containing
		    // all seed pages to crawl sites from, the above two files will work for that.

		    // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
		    // also write into the global seeds file (with a tab prefixed to each?)
		    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
		    for(String url : urlsForDomainSet) {
			seedURLsWriter.write(url + "\n"); // global seedURLs file
			siteURLsWriter.write(url + "\n");
		    }

		    if(allowedURLPatternRegex == null) { // entire site can be crawled
			siteURLsWriter.write(domainWithProtocol + "\n");

			// Write out filter in the following form for a site, e.g. for nutch.apache.org:
			// nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
			String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
			//String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
			urlFilterWriter.write(regexed_domain + "\n"); //global file
			siteRegexWriter.write(regexed_domain + "\n"); // site file
		    }
		    else { // domain belongs to a top site where only portion of site can be crawled

			if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
			    siteURLsWriter.write(domainWithProtocol + "\n");
			    // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
			    // and not for all of blogspot.com

			    String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
			    //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
			    urlFilterWriter.write(regexed_domain + "\n");
			    siteRegexWriter.write(regexed_domain + "\n");

			} else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
			    // don't write out domain. We want individual pages
			    //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

			    // don't write out domain as a regex expression url filter either,
			    // write out the individual seed urls for the domain instead
			    // since we will only be downloading the single page

			    urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
			    for(String urlInDomain : urlsForDomainSet) {
				// don't append slash to end this time
				String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
				//String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
				urlFilterWriter.write(regexed_url + "\n");
				siteRegexWriter.write(regexed_url + "\n");
			    }
			} else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {

			    // DON'T write out domain into siteURLs file,
			    // BUT DO write it into urlFilter file
			    String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";

			    urlFilterWriter.write(regexed_domain + "\n");
			    siteRegexWriter.write(regexed_domain + "\n");
			} else { // allowedURLPatternRegex is a url-form - convert to regex
			    if(!allowedURLPatternRegex.endsWith("/")) {
				allowedURLPatternRegex += "/";
			    }
			    String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
			    //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
			    siteURLsWriter.write(domainWithProtocol + "\n");
			    urlFilterWriter.write(regexed_pattern + "\n");
			    siteRegexWriter.write(regexed_pattern + "\n");

			}
		    }

		} catch (IOException ioe) {
		    logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
		}

	    }

	} catch (IOException ioe) {
	    logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
			 + "\n\t" + urlFilterFile
			 + "\n\t" + domainURLsFile
			 + "\n\t" + topSiteMatchesFile, ioe);
	}

	/*
	// BEGIN DEBUG
	logger.debug("@@@@ TopSitesMap contains: ");
	for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
	    String topSite = entry.getKey();
	    String urlPattern = entry.getValue();
	    logger.debug(topSite + " - " + urlPattern);
	} // END DEBUG
	*/
    }
556
557 private String stripSubDomain(String url) {
558 int index = url.indexOf(".");
559 if(index != -1) {
560 url = url.substring(index+1);
561 }
562 return url;
563 }
564
565
566 /**
567 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
568 * with or without www. prefix. This method tests for such as case as it would be dangerous
569 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
570 */
571 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
572 // check for an exact match as-is
573 if(seedURLDomain.equals(domain)) {
574 return true;
575 }
576
577 // else check if with or without a www. prefix we have an exact match with domain
578 if(seedURLDomain.startsWith("www.")) {
579 if(seedURLDomain.substring(4).equals(domain)) {
580 return true;
581 }
582 } else {
583 if(domain.equals("www."+seedURLDomain)) {
584 return true;
585 }
586 }
587
588 return false;
589 }
590
591
592 /**
593 * Check if the domain of the seedurl, either in its entirety or when stripped of
594 * www/subdomains, is in the list of top sites.
595 * If it is, and the given url matches the regex for that topsite, then add the url to the
596 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
597 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
598 * @return one of the following values:
599 * - This function returns null if the seedURL's domain does not match any of the topsites.
600 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
601 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
602 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
603 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
604 * - A non-emptry String is returned if the seedURL's domain matched a topsite and a value
605 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
606 * URL pattern.
607 */
608 private String isURLinTopSitesMap(String fullSeedDomain) {
609 boolean keepLooping = true;
610
611 String domain = fullSeedDomain;
612
613 // domain aprameter will have retained www or subdomains, but is stripped of protocol
614
615 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
616 // if it does, return the value for that topsite domain in the topSitesMap
617 // If no match at all, return null.
618 do {
619
620 String allowed_url_pattern = topSitesMap.get(domain);
621 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
622 // there's an entry for the URL in the topSitesMap
623 logger.debug("##### A top site matches URL domain " + domain);
624
625 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
626 // www prefix, should not exactly match the topSitesMap domain
627 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
628 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
629
630 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
631 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
632 }
633 return allowed_url_pattern;
634 }
635 // else, no entry for the URL in the topSitesMap
636 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
637
638 String newDomain = stripSubDomain(domain);
639 if(domain.equals(newDomain)) {
640 keepLooping = false;
641 } else {
642 domain = newDomain;
643 }
644 } while(keepLooping);
645
646 // url in entirety or stripped of subdomains did not match any of the topsites
647 return null;
648 }
649
650 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
651 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
652 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
653 //while(i.hasNext()) {
654 // Map.Entry<String, Integer> entry = i.next();
655 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
656 String urlPattern = entry.getKey();
657 Integer matchRule = entry.getValue();
658
659 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
660 return true;
661 }
662 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
663 return true;
664 }
665 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
666 return true;
667 }
668 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
669 return true;
670 }
671 // else check the rest of the filter list against this url
672 // before returning false to be certain it's not been listed in the filter list
673 }
674
675 return false;
676 }
677
678 /**
679 * Returns true if the url or pattern is found in the blacklist file.
680 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
681 * it won't get blacklisted after all. But that's not implemented here.
682 */
683 public boolean isBlacklisted(String url) {
684 boolean isBlackListed = isListedInFilterList(blackList, url);
685
686 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
687 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
688 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
689 if(!isBlackListed && url.contains("jasmin")) {
690 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
691 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
692 }
693 return isBlackListed;
694 }
695
696 /**
697 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
698 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
699 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
700 */
701 public boolean isGreylisted(String url) {
702 // auto-translated product sites
703 return isListedInFilterList(greyList, url);
704 }
705
706 /**
707 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
708 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
709 */
710 public boolean isWhitelisted(String url) {
711 return isListedInFilterList(whiteList, url);
712 }
713
714 /**
715 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
716 * whether it is in the mentioned black|grey|white list.
717 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
718 * By not having this method deal with actual regex for filters, this has the advantage that
719 * we don't have to remember to escape or double escape each filter to turn it into a regex.
720 */
721 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
722
723 // if filterListFilename does not exist in the conf folder, just return
724 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
725 logger.warn("Filter list filename: " + filterListFilename + " does not exist");
726 return;
727 }
728
729 try (
730 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
731 ) {
732 String filter = null;
733 while((filter = reader.readLine()) != null) {
734 // skip comments and empty lines
735 filter = filter.trim();
736 if(filter.equals("") || filter.startsWith("#")) {
737 continue;
738 }
739
740 if(filter.startsWith("^") && filter.endsWith("$")) {
741 filter = filter.substring(1, filter.length()-1);
742 list.put(filter, LIST_ENTRY_MATCHES);
743 }
744 else if(filter.startsWith("^")) {
745 filter = filter.substring(1);
746 list.put(filter, LIST_ENTRY_STARTSWITH);
747 //logger.debug("Match filter startswith: " + filter);
748 }
749 else if(filter.endsWith("$")) {
750 filter = filter.substring(0, filter.length()-1);
751 list.put(filter, LIST_ENTRY_ENDSWITH);
752 //logger.debug("@@@ Match filter endswith: " + filter);
753 }
754 else {
755 list.put(filter, LIST_ENTRY_CONTAINS);
756 }
757 //logger.debug("Got filter: " + filter);
758 }
759
760 } catch (IOException ioe) {
761 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
762 }
763
764 }
765
766 /** Maintain a count of all WET files processed. */
767 public void setWETFileCount(int count) { this.wetFileCount = count; }
768
769 /** Maintain a count of all WET records processed. */
770 //public int getRecordCount() { return this.totalRecordCount; }
771 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
772 public void setRecordCount(int count) { this.totalRecordCount = count; }
773
774 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
775
776 // Will list all the warc.wet files in the input directory or else their gzipped versions
777 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
778
779 int wetRecordCount = 0;
780 int wetFileCount = 0;
781
782 for(int i = 0; i < WETFiles.length; i++) {
783 File WETFile = WETFiles[i];
784 logger.debug("Processing WETfile: " + WETFile);
785
786 // Any .gz files listed means they haven't been unzipped yet. So unzip.
787 String WETFilename = WETFile.toString();
788 if(WETFilename.endsWith(".gz")) {
789 File GZippedWETFile = WETFile;
790 String WETGZippedFilename = WETFilename;
791 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
792
793 WETFile = new File(WETFilename);
794 Utility.unzipFile(GZippedWETFile, WETFile);
795 }
796 // hereafter all WETFiles should refer to the unzipped version
797 // Check the unzipped WETFile exists
798
799 if(!WETFile.exists() || !WETFile.isFile()) {
800 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
801 return;
802 }
803
804 // Finally, we can process this WETFile's records into the keep and discard pile
805 wetFileCount++;
806 logger.debug("Off to process " + WETFile);
807 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
808 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
809 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
810 wetFileProcessor.processWETFile();
811 wetRecordCount += wetFileProcessor.getRecordCount();
812 }
813
814 // for information purposes
815 this.setWETFileCount(wetFileCount);
816 this.setRecordCount(wetRecordCount);
817 }
818
819
820 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
821 public static void printUsage() {
822 System.err.println("Run this program as:");
823 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
824 }
825
826 /** Filename filter to only list warc.wet files or else warc.wet.gz files
827 * for which unzipped warc.wet equivalents don't yet exist.
828 */
829 private static class WETFilenameFilter implements FilenameFilter {
830
831 public boolean accept(File dir, String name) {
832 if(name.endsWith(".warc.wet")) {
833 logger.debug("Will include " + name + " for processing.");
834 return true;
835 }
836
837 if(name.endsWith(".warc.wet.gz")) {
838 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
839 File unzippedVersion = new File(dir, nameWithoutGZext);
840 if(unzippedVersion.exists()) {
841 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
842 logger.debug("Skipping " + name);
843 return false; // don't count gzipped version if unzipped version exists.
844 }
845 else {
846 logger.debug("Only zipped version " + name + " exists.");
847 return true; // No unzipped version, so have to work with gzipped version
848 }
849 }
850
851 // we're not even interested in any other file extensions
852 logger.debug("Not a WET file. Skipping " + name);
853 return false;
854 }
855 }
856
857
858 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
859
860 public boolean accept(File dir, String name) {
861 File f = new File (dir, name);
862 if(f.isDirectory()) {
863 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
864 return true;
865 }
866 }
867 else {
868 logger.info("File " + f + " is not a directory");
869 }
870 return false;
871 }
872 }
873
874 public static void main(String[] args) {
875 if(args.length != 2) {
876 printUsage();
877 return;
878 }
879
880 File commoncrawlDir = new File(args[0]);
881 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
882 logger.error("Error: " + args[0] + " does not exist or is not a directory");
883 return;
884 }
885
886 File outFolder = new File(args[1]);
887 if(!outFolder.exists() || !outFolder.isDirectory()) {
888 logger.error("Error: " + args[1] + " does not exist or is not a directory.");
889 return;
890 }
891
892 try {
893 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
894
895 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
896
897 for(int i = 0; i < ccrawlFolders.length; i++) {
898 File ccrawlFolder = ccrawlFolders[i];
899 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
900 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
901 }
902
903 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
904 // The former is the only unique one. seedURLs and regex-urlfilters are
905 // repeated on a per site/domain basis too, stored in the sites folder
906 File seedURLsFile = new File(outFolder, "seedURLs.txt");
907 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
908 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
909 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
910 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
911
912 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
913
914 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
915
916 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
917
918
919 } catch(Exception e) {
920 // can get an exception when instantiating CCWETProcessor instance
921 logger.error(e.getMessage(), e);
922 }
923
924 return;
925
926 }
927}
// Note: See TracBrowser for help on using the repository browser.