source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33568

Last change on this file since 33568 was 33568, checked in by ak19, 5 years ago
  1. More sites greylisted and blacklisted, discovered as I attempted to crawl them and afterwards learnt to investigate sites first. Should all .ru and .pl domains be on the greylist? 2. Adjusted instruction comments in CCWETProcessor for compiling and running
File size: 36.7 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 keep, discard or greyListed folder, and its url is written into a keep, discard
23 or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted else blacklisted
26 * 2. and, whether explicitly whitelisted or else neither greylisted nor blacklisted, if there's
27 enough content. Formerly, content-length and number of lines were used to determine if
28 the content was sufficient. Now it's just the word count, where a MAXIMUM number of characters
29 (not a minimum) determines whether a string counts as a word. These settings can be adjusted
30 in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - precede a URL with ^ to match urls that start with the given prefix
37 * - follow a URL with $ to match urls that end with the given suffix
38 * - ^url$ will match the given url exactly
39 * - without either the ^ or $ symbol, urls containing the given string will match
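 *
 * For example, a url-blacklist-filter.txt might contain hypothetical entries like the following
 * (urls illustrative only):
 * - ^http://example.com/ads/ matches urls starting with that prefix
 * - .pdf$ matches urls ending in .pdf
 * - ^http://example.com/index.html$ matches exactly that url
 * - /translated/ matches any url containing that substring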
40 *
41 * In the current implementation, explicit whitelisting takes precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. A WETProcessor instance is then used to process
49 * each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, passing the log4j and other properties files in conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
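 // These values appear in the second column of sites-too-big-to-exhaustively-crawl.txt in place of
 // an allowed-url-pattern, and are handled in createSeedURLsFiles() below:
 // - SUBDOMAIN-COPY: crawl only the seedURL's own (sub)domain, e.g. pinky.blogspot.com rather than all of blogspot.com
 // - SINGLEPAGE: crawl only the individual seed page(s), with no domain-wide url filter
 // - FOLLOW-LINKS-WITHIN-TOPSITE: don't seed the domain itself, but let the url filter allow links within it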
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
101 private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
102 private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
103 private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
104
105 /**
106 * Store url patterns as keys and values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 System.err.println("Exception attempting to read properties from config.properties.");
153 logger.error("Exception attempting to read properties from config.properties.");
154 e.printStackTrace();
155 }
156
157 if(configProperties.size() == 0) {
158 System.err.println("*** Warning: no values read into config properties. Using defaults.");
159 }
160
161 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
162 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
163 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
164
165
166 this.discardFolder = new File(outFolder, "discard");
167 if(!discardFolder.exists()) {
168 discardFolder.mkdir();
169 }
170 this.keepFolder = new File(outFolder, "keep");
171 if(!keepFolder.exists()) {
172 keepFolder.mkdir();
173 }
174
175 this.greyListedFolder = new File(outFolder, "greylisted");
176 if(!greyListedFolder.exists()) {
177 greyListedFolder.mkdir();
178 }
179
180 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
181 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
182 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
183 }
184 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
185 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
186 throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
187 }
188 this.greyListedFile = new File(outFolder, "greyListed.txt");
189 if(greyListedFile.exists() && !greyListedFile.delete()) {
190 throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
191 }
192
193 // prepare our blacklist, greylist (for inspection) and whitelist
194 System.err.println("Loading blacklist.");
195 blackList = new HashMap<String, Integer>();
196 initURLFilterList(blackList, "url-blacklist-filter.txt");
197
198 System.err.println("Loading greylist.");
199 greyList = new HashMap<String, Integer>();
200 initURLFilterList(greyList, "url-greylist-filter.txt");
201
202 System.err.println("Loading whitelist.");
203 whiteList = new HashMap<String, Integer>();
204 initURLFilterList(whiteList, "url-whitelist-filter.txt");
205
206 // Create the map of topSites
207 System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
208 topSitesMap = new HashMap<String, String>();
209
210 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
211 // and put these in our topSitesMap
212 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
213 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
214 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
215 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
216 .withCommentMarker('#')
217 .withSkipHeaderRecord()
218 .withIgnoreSurroundingSpaces();
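 // Illustrative (hypothetical) rows, assuming the comma-delimited CSVFormat.DEFAULT configured above:
 //   wikipedia.org,mi.wikipedia.org   -> topSitesMap maps "wikipedia.org" to "mi.wikipedia.org"
 //   blogspot.com,SUBDOMAIN-COPY      -> topSitesMap maps "blogspot.com" to "SUBDOMAIN-COPY"
 //   example-topsite.com              -> topSitesMap maps "example-topsite.com" to ""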
219
220 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
221 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
222 try (
223 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
224 ) {
225 for (CSVRecord csvRecord : parser) {
226 String topsite = csvRecord.get(0);
227 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
228 topSitesMap.put(topsite, allowed_url_pattern);
229
230 //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
231
232 }
233 } catch(Exception e) {
234 e.printStackTrace();
235 System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
236 }
237
238
239
240 //System.err.println("Prematurely terminating for testing purposes.");
241 //System.exit(-1);
242 }
243
244 /** Work out the 'domain' for a given url.
245 * This retains any www. or subdomain prefix.
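 * e.g. (illustrative) https://mi.wikipedia.org/wiki/SomePage gives mi.wikipedia.org,
 * or https://mi.wikipedia.org when withProtocol is true.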
246 */
247 private String getDomainForURL(String url, boolean withProtocol) {
248 int startIndex = url.indexOf("//"); // for http:// or https:// prefix
249 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
250 // keep the protocol portion around in case param withProtocol=true
251 String protocol = url.substring(0, startIndex); // empty string when there was no protocol
252
253 String domain = url.substring(startIndex);
254 int endIndex = domain.indexOf("/");
255 if(endIndex == -1) endIndex = domain.length();
256 domain = domain.substring(0, endIndex);
257
258 if(withProtocol) {
259 // now that we have the domain (everything to the first / when there is no protocol)
260 // can glue the protocol back on
261 domain = protocol + domain;
262 }
263
264 return domain;
265 }
266
267 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
268 private String escapeStringForRegex(String str) {
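 // e.g. (illustrative) "nutch.apache.org" becomes "nutch\.apache\.org", so the domain can be
 // embedded literally in the regex lines written to regex-urlfilter.txt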
269 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
270 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
271 str = str.replace(Character.toString(c), "\\"+c);
272 }
273 return str;
274 }
275
276 /**
277 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
278 * as output the URL seed list and regex-urlfilter text files required by nutch, see
279 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
280 */
281 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
282 File domainURLsFile, File topSiteMatchesFile) {
283 // Maintain a Map of unique domains mapped to seed urls at that domain
284 // TreeSet: by default, "the elements are ordered using their natural ordering"
285 // (or by a Comparator provided at set creation time).
286 // Whereas HashSet doesn't guarantee ordering.
287 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
288 // Would be a similar distinction for Maps.
289 domainsToURLsMap = new TreeMap<String, Set<String>>();
290
291 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
292 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
293
294 try (
295 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
296 ) {
297
298 // read a URL at a time from urlsFile
299 String url = null;
300 String domainWithProtocol = null;
301 while((url = reader.readLine()) != null) { // readLine removes newline separator
302
303 // work out domain. This retains any www. or subdomain prefix
304 // passing true to further also retain the http(s) protocol
305 domainWithProtocol = getDomainForURL(url, true);
306
307 Set<String> urlsSet;
308 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
309 urlsSet = new TreeSet<String>();
310 urlsSet.add(url);
311 domainsToURLsMap.put(domainWithProtocol, urlsSet);
312 } else {
313 urlsSet = domainsToURLsMap.get(domainWithProtocol);
314 urlsSet.add(url);
315 }
316
317 }
318 } catch (IOException ioe) {
319 ioe.printStackTrace();
320 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
321 }
322
323 // We'd have pruned out duplicates by now and have a sorted list of domains,
324 // each of which maps to seed URLs in the commoncrawl for that domain
325
326 int domainCount = 0;
327 File sitesFolder = new File(outputFolder, "sites");
328 if(!sitesFolder.exists()) {
329 sitesFolder.mkdir();
330 }
331 final String FORMATSTR = "%05d";
332
333 // write out each domain followed in sequence by all urls we found in that domain
334 // (urls with tab up front)
335 try (
336 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
337 // Also a global file listing any urls that matched top sites that didn't specify
338 // allowed regex patterns
339 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
340 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
341 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
342 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
343 ) {
344
345 // initialise topSiteMatchesFile with some instructional text.
346 topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n");
347 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
348 topSiteMatchesWriter.write("Specify one for each such domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
349
350 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
351 Set<String> domainsSet = domainsToURLsMap.keySet();
352 Iterator<String> domainIterator = domainsSet.iterator();
353
354 /*
355 // DEBUG
356 String value = topSitesMap.get("wikipedia.org");
357 if(value == null) {
358 System.err.println("### wikipedia.org had null value");
359 } else {
360 System.err.println("### wikipedia.org had value: " + value);
361 } // DEBUG
362 */
363
364 while(domainIterator.hasNext()) {
365 String domainWithProtocol = domainIterator.next();
366 // Also get domain without protocol prefix
367 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
368 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
369 String domain = domainWithProtocol.substring(startIndex);
370
371 /*if(domain.contains("docs.google.com")) {
372 System.err.println("domain with protocol: " + domainWithProtocol);
373 System.err.println("domain: " + domain);
374 }*/
375
376 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
377 // If the domain is of a topsite for which no allowed URL pattern has been provided
378 // in sites-too-big-to-exhaustively-crawl.txt,
379 // then we don't know how to crawl the site. Warn the user by writing the affected
380 // domain and seedURLs to the topSiteMatchesFile.
381 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
382
383 // topsite, but we don't (yet) know what portion can be crawled
384 // Append the top site and url to a global/toplevel file that
385 // the user needs to check later and we're done with this domain as it
386 // won't go into any other file hereafter
387
388 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
389 for(String url : urlsForDomainSet) {
390 topSiteMatchesWriter.write("\t" + url + "\n");
391 }
392
393 continue; // done with this domain
394 }
395
396 // start counting the domains we're actually going to process
397 domainCount++;
398
399 String siteID = String.format(FORMATSTR, domainCount);
400 File domainFolder = new File(sitesFolder, siteID);
401 domainFolder.mkdir();
402
403 // write out the domain
404 //seedURLsWriter.write(domainWithProtocol + "\n");
405
406
407 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
408 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
409 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
410 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
411 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
412 try (
413 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
414 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
415 ) {
416
417 // write all sorted unique domains into global domains file
418 // Using the domain without protocol since the global domains file is for
419 // informational purposes
420 domainURLsWriter.write(domain + "\n");
421
422 // Only write urls and no domain into single global seedurls file
423 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
424 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
425 // If we ever run nutch on a single seedURLs listing containing
426 // all seed pages to crawl sites from, the above two files will work for that.
427
428 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
429 // also write into the global seeds file (with a tab prefixed to each?)
430 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
431 for(String url : urlsForDomainSet) {
432 seedURLsWriter.write(url + "\n"); // global seedURLs file
433 siteURLsWriter.write(url + "\n");
434 }
435
436
437 if(allowedURLPatternRegex == null) { // entire site can be crawled
438 siteURLsWriter.write(domainWithProtocol + "\n");
439
440 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
441 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
442 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
443 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
444 urlFilterWriter.write(regexed_domain + "\n"); //global file
445 siteRegexWriter.write(regexed_domain + "\n"); // site file
446 }
447 else { // domain belongs to a top site where only portion of site can be crawled
448
449 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
450 siteURLsWriter.write(domainWithProtocol + "\n");
451 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
452 // and not for all of blogspot.com
453
454 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
455 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
456 urlFilterWriter.write(regexed_domain + "\n");
457 siteRegexWriter.write(regexed_domain + "\n");
458
459 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
460 // don't write out domain. We want individual pages
461 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
462
463 // don't write out domain as a regex expression url filter either,
464 // write out the individual seed urls for the domain instead
465 // since we will only be downloading the single page
466
467 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
468 for(String urlInDomain : urlsForDomainSet) {
469 // don't append slash to end this time
470 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
471 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
472 urlFilterWriter.write(regexed_url + "\n");
473 siteRegexWriter.write(regexed_url + "\n");
474 }
475 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
476
477 // DON'T write out domain into siteURLs file,
478 // BUT DO write it into urlFilter file
479 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
480
481 urlFilterWriter.write(regexed_domain + "\n");
482 siteRegexWriter.write(regexed_domain + "\n");
483 } else { // allowedURLPatternRegex is a url-form - convert to regex
484 if(!allowedURLPatternRegex.endsWith("/")) {
485 allowedURLPatternRegex += "/";
486 }
487 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
488 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
489 siteURLsWriter.write(domainWithProtocol + "\n");
490 urlFilterWriter.write(regexed_pattern + "\n");
491 siteRegexWriter.write(regexed_pattern + "\n");
492
493 }
494 }
495
496 } catch (IOException ioe) {
497 ioe.printStackTrace();
498 System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
499 }
500
501 }
502
503 } catch (IOException ioe) {
504 ioe.printStackTrace();
505 System.err.println("\n@@@@@@@@@ Error writing to one of: ");
506 System.err.println("\t" + seedURLsFile);
507 System.err.println("\t" + urlFilterFile);
508 System.err.println("\t" + domainURLsFile);
509 System.err.println("\t" + topSiteMatchesFile);
510 }
511
512 /*
513 // BEGIN DEBUG
514 System.err.println("@@@@ TopSitesMap contains: ");
515 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
516 String topSite = entry.getKey();
517 String urlPattern = entry.getValue();
518 System.err.println(topSite + " - " + urlPattern);
519 } // END DEBUG
520 */
521 }
522
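 /** Strips the leading (sub)domain segment off a domain string,
 * e.g. (illustrative) pinky.blogspot.com becomes blogspot.com, and blogspot.com becomes com.
 * Returns the string unchanged once there is no further "." in it.
 */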
523 private String stripSubDomain(String url) {
524 int index = url.indexOf(".");
525 if(index != -1) {
526 url = url.substring(index+1);
527 }
528 return url;
529 }
530
531
532 /**
533 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
534 * with or without the www. prefix. This method tests for such a case, as it would be dangerous
535 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
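 * e.g. (illustrative) www.blogspot.com vs the topsite blogspot.com counts as an exact match,
 * whereas pinky.blogspot.com vs blogspot.com does not.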
536 */
537 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
538 // check for an exact match as-is
539 if(seedURLDomain.equals(domain)) {
540 return true;
541 }
542
543 // else check if with or without a www. prefix we have an exact match with domain
544 if(seedURLDomain.startsWith("www.")) {
545 if(seedURLDomain.substring(4).equals(domain)) {
546 return true;
547 }
548 } else {
549 if(domain.equals("www."+seedURLDomain)) {
550 return true;
551 }
552 }
553
554 return false;
555 }
556
557
558 /**
559 * Check if the domain of the seedurl, either in its entirety or when stripped of
560 * www/subdomains, is in the list of top sites.
561 * If it is, and the given url matches the regex for that topsite, then add the url to the
562 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
563 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
564 * @return one of the following values:
565 * - This function returns null if the seedURL's domain does not match any of the topsites.
566 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
567 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
568 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
569 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
570 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
571 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
572 * URL pattern.)
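 * e.g. (illustrative) a seed domain pinky.blogspot.com matches no topsite as-is; once its subdomain
 * is stripped it matches blogspot.com, and if that entry's value is SUBDOMAIN-COPY that value is
 * returned, unless the seed domain had exactly matched blogspot.com (with or without www.), in
 * which case "" is returned and the site goes into the unprocessed-topsite-matches file instead.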
573 */
574 private String isURLinTopSitesMap(String fullSeedDomain) {
575 boolean keepLooping = true;
576
577 String domain = fullSeedDomain;
578
579 // the domain parameter will have retained www. or subdomains, but is stripped of the protocol
580
581 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
582 // if it does, return the value for that topsite domain in the topSitesMap
583 // If no match at all, return null.
584 do {
585
586 String allowed_url_pattern = topSitesMap.get(domain);
587 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
588 // there's an entry for the URL in the topSitesMap
589 System.err.println("##### A top site matches URL domain " + domain);
590
591 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
592 // www prefix, should not exactly match the topSitesMap domain
593 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
594 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
595
596 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
597 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
598 }
599 return allowed_url_pattern;
600 }
601 // else, no entry for the URL in the topSitesMap
602 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
603
604 String newDomain = stripSubDomain(domain);
605 if(domain.equals(newDomain)) {
606 keepLooping = false;
607 } else {
608 domain = newDomain;
609 }
610 } while(keepLooping);
611
612 // url in entirety or stripped of subdomains did not match any of the topsites
613 return null;
614 }
615
616 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
617 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
618 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
619 //while(i.hasNext()) {
620 // Map.Entry<String, Integer> entry = i.next();
621 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
622 String urlPattern = entry.getKey();
623 Integer matchRule = entry.getValue();
624
625 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
626 return true;
627 }
628 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
629 return true;
630 }
631 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
632 return true;
633 }
634 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
635 return true;
636 }
637 // else check the rest of the filter list against this url
638 // before returning false to be certain it's not been listed in the filter list
639 }
640
641 return false;
642 }
643
644 /**
645 * Returns true if the url or pattern is found in the blacklist file.
646 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
647 * it won't get blacklisted after all. But that's not implemented here.
648 */
649 public boolean isBlacklisted(String url) {
650 return isListedInFilterList(blackList, url);
651 }
652
653 /**
654 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
655 * A greylist mention takes precedence over a mention of the same URL pattern in the blacklist,
656 * but is itself overridden if the pattern is also mentioned in the whitelist.
657 */
658 public boolean isGreylisted(String url) {
659 // auto-translated product sites
660 return isListedInFilterList(greyList, url);
661 }
662
663 /**
664 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
665 * A whitelist mention overrides any mention in the blacklist or greylist.
666 */
667 public boolean isWhitelisted(String url) {
668 return isListedInFilterList(whiteList, url);
669 }
670
671 /**
672 * Loads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the given map,
673 * recording whether a url must equal the filter, start with it, end with it, or merely contain it.
674 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
675 * By not having this method deal with actual regex for filters, this has the advantage that
676 * we don't have to remember to escape or double escape each filter to turn it into a regex.
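 * For example, hypothetical filter lines (urls illustrative) would be stored as:
 *   ^http://example.com/index.html$  -> LIST_ENTRY_MATCHES (url must equal the filter)
 *   ^http://example.com/             -> LIST_ENTRY_STARTSWITH
 *   .pdf$                            -> LIST_ENTRY_ENDSWITH
 *   /translated/                     -> LIST_ENTRY_CONTAINS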
677 */
678 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
679
680 // if filterListFilename does not exist in the conf folder, just return
681 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
682 System.err.println(filterListFilename + " does not exist");
683 return;
684 }
685
686 try (
687 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
688 ) {
689 String filter = null;
690 while((filter = reader.readLine()) != null) {
691 // skip comments and empty lines
692 filter = filter.trim();
693 if(filter.equals("") || filter.startsWith("#")) {
694 continue;
695 }
696
697 if(filter.startsWith("^") && filter.endsWith("$")) {
698 filter = filter.substring(1, filter.length()-1);
699 list.put(filter, LIST_ENTRY_MATCHES);
700 }
701 else if(filter.startsWith("^")) {
702 filter = filter.substring(1);
703 list.put(filter, LIST_ENTRY_STARTSWITH);
704 System.err.println("Match filter startswith: " + filter);
705 }
706 else if(filter.endsWith("$")) {
707 filter = filter.substring(0, filter.length()-1);
708 list.put(filter, LIST_ENTRY_ENDSWITH);
709 }
710 else {
711 list.put(filter, LIST_ENTRY_CONTAINS);
712 }
713 //System.err.println("Got filter: " + filter);
714 }
715
716 } catch (IOException ioe) {
717 ioe.printStackTrace();
718 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
719 }
720
721 }
722
723 /** Maintain a count of all WET files processed. */
724 public void setWETFileCount(int count) { this.wetFileCount = count; }
725
726 /** Maintain a count of all WET records processed. */
727 //public int getRecordCount() { return this.totalRecordCount; }
728 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
729 public void setRecordCount(int count) { this.totalRecordCount = count; }
730
731 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
732
733 // Will list all the warc.wet files in the input directory or else their gzipped versions
734 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
735
736 int wetRecordCount = 0;
737 int wetFileCount = 0;
738
739 for(int i = 0; i < WETFiles.length; i++) {
740 File WETFile = WETFiles[i];
741 logger.debug("Processing WETfile: " + WETFile);
742
743 // Any .gz files listed means they haven't been unzipped yet. So unzip.
744 String WETFilename = WETFile.toString();
745 if(WETFilename.endsWith(".gz")) {
746 File GZippedWETFile = WETFile;
747 String WETGZippedFilename = WETFilename;
748 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
749
750 WETFile = new File(WETFilename);
751 Utility.unzipFile(GZippedWETFile, WETFile);
752 }
753 // hereafter all WETFiles should refer to the unzipped version
754 // Check the unzipped WETFile exists
755
756 if(!WETFile.exists() || !WETFile.isFile()) {
757 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
758 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
759 return;
760 }
761
762 // Finally, we can process this WETFile's records into the keep and discard pile
763 wetFileCount++;
764 logger.debug("Off to process " + WETFile);
765 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
766 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
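 // e.g. (illustrative) a folder named CC-MAIN-2019-26-wet-files gives crawlID 2019-26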
767 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
768 wetFileProcessor.processWETFile();
769 wetRecordCount += wetFileProcessor.getRecordCount();
770 }
771
772 // for information purposes
773 this.setWETFileCount(wetFileCount);
774 this.setRecordCount(wetRecordCount);
775 }
776
777
778 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
779 public static void printUsage() {
780 System.err.println("Run this program as:");
781 System.err.println("\tCCWETProcessor <folder containing commoncrawl subfolders of warc.wet(.gz) files> <output folder path>");
782 }
783
784 /** Filename filter to only list warc.wet files or else warc.wet.gz files
785 * for which unzipped warc.wet equivalents don't yet exist.
786 */
787 private static class WETFilenameFilter implements FilenameFilter {
788
789 public boolean accept(File dir, String name) {
790 if(name.endsWith(".warc.wet")) {
791 logger.debug("Will include " + name + " for processing.");
792 return true;
793 }
794
795 if(name.endsWith(".warc.wet.gz")) {
796 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
797 File unzippedVersion = new File(dir, nameWithoutGZext);
798 if(unzippedVersion.exists()) {
799 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
800 logger.debug("Skipping " + name);
801 return false; // don't count gzipped version if unzipped version exists.
802 }
803 else {
804 logger.debug("Only zipped version " + name + " exists.");
805 return true; // No unzipped version, so have to work with gzipped version
806 }
807 }
808
809 // we're not even interested in any other file extensions
810 logger.debug("Not a WET file. Skipping " + name);
811 return false;
812 }
813 }
814
815
816 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
817
818 public boolean accept(File dir, String name) {
819 File f = new File (dir, name);
820 if(f.isDirectory()) {
821 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
822 return true;
823 }
824 }
825 else {
826 System.err.println("File " + f + " is not a directory");
827 }
828 return false;
829 }
830 }
831
832 public static void main(String[] args) {
833 if(args.length != 2) {
834 printUsage();
835 return;
836 }
837
838 File commoncrawlDir = new File(args[0]);
839 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
840 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
841 return;
842 }
843
844 File outFolder = new File(args[1]);
845 if(!outFolder.exists() || !outFolder.isDirectory()) {
846 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
847 return;
848 }
849
850 try {
851 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
852
853 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
854
855 for(int i = 0; i < ccrawlFolders.length; i++) {
856 File ccrawlFolder = ccrawlFolders[i];
857 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
858 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
859 }
860
861 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
862 // The former is the only unique one. seedURLs and regex-urlfilters are
863 // repeated on a per site/domain basis too, stored in the sites folder
864 File seedURLsFile = new File(outFolder, "seedURLs.txt");
865 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
866 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
867 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
868
869 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
870
871 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
872
873 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");
874
875
876 } catch(Exception e) {
877 // can get an exception when instantiating CCWETProcessor instance
878 e.printStackTrace();
879 System.err.println(e.getMessage());
880 }
881
882 return;
883
884 }
885}