source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33565

Last change on this file since 33565 was 33565, checked in by ak19, 5 years ago

CCWETProcessor: 1. domain url now goes in as a seedURL after the individual seedURLs, after Dr Bainbridge explained why the original ordering didn't make sense. 2. conf: we inspected the first site to be crawled. It was a non-top site, but we still wanted to control the crawling of it in the same way we control topsites. 3. Documented use of the nutch command for testing which urls pass and fail the existing regex-urlfilter checks.

File size: 36.6 KB
package org.greenstone.atea;


import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
import org.apache.log4j.Logger;


/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greylisted folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, where a MAX (not MIN) number of
 * characters determines whether a string counts as a word. These settings can be adjusted
 * in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included or excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is (see the illustrative example below):
 * - precede a URL pattern with ^ to blacklist urls that start with that pattern
 * - follow a URL pattern with $ to blacklist urls that end with that pattern
 * - ^url$ will blacklist urls that match the given url exactly
 * - without either ^ or $, urls containing the given pattern will get blacklisted
 *
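 * For example, a url-blacklist-filter.txt could contain entries like the following
 * (purely illustrative URLs, not taken from the actual conf folder):
 *   ^http://ads.example.com           - blacklist any url starting with this prefix
 *   .jpg$                             - blacklist any url ending with this suffix
 *   ^http://example.com/landing.html$ - blacklist exactly this url
 *   autotranslate                     - blacklist any url containing this substring
 *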
 * In WETProcessor.java's current implementation, explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also for nutch.
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. It then uses a single instance of the WETProcessor class to
 * process each unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
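    // How each value is acted upon when building seed and filter files (summarised from
    // createSeedURLsFiles() below):
    //   SUBDOMAIN-COPY: use the seedURL's own (sub)domain as the crawl filter, rather than
    //                   opening up the entire topsite (e.g. pinky.blogspot.com, not blogspot.com)
    //   SINGLEPAGE: crawl only the individual seed page(s); no domain-wide filter is written
    //   FOLLOW-LINKS-WITHIN-TOPSITE: allow following links anywhere within the topsite domain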
    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
    public final String SINGLEPAGE = "SINGLEPAGE";
    public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";

    /**
     * Characters that need escaping if used as a string literal in a regex
     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
     * https://www.regular-expressions.info/refcharacters.html
     * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
     * as would happen for any chars appearing earlier in this list than \\
     */
    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
    //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File commoncrawlDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);

    /**
     * Store url patterns as keys and values indicating whether a url should
     * match it exactly, start/end with it, or contain it
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of topsites with allowable regexes: sites too big to exhaustively crawl,
     * each with an optional regex defining allowed exceptions, like subdomains or url suffixes
     * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
     * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
     * would be relevant.
     * The map stores the top site domain suffix and an optional regex string for allowable
     * url patterns.
     */
    private HashMap<String, String> topSitesMap;

    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
     * while a HashMap has no notion of ordering, because we just need to store urls with
     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records that all WETProcessors instantiated
    // by our main method combined have processed
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        // Create the map of topSites
        System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
        topSitesMap = new HashMap<String, String>();

        // Read in our csv file of topsites and what to do when one hits a match with a seedURL
        // and put these in our topSitesMap
        // https://commons.apache.org/proper/commons-csv/apidocs/index.html
        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
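        // Each record pairs a topsite domain with an optional second field: either an allowed
        // url pattern or one of the fixed values defined above. Illustrative records (not taken
        // from the actual conf file) could be ("wikipedia.org", "mi.wikipedia.org"),
        // ("blogspot.com", SUBDOMAIN-COPY) or ("docs.google.com", SINGLEPAGE).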
        CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
            .withCommentMarker('#')
            .withSkipHeaderRecord()
            .withIgnoreSurroundingSpaces();

        File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
        // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
        try (
             CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
             ) {
            for (CSVRecord csvRecord : parser) {
                String topsite = csvRecord.get(0);
                String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
                topSitesMap.put(topsite, allowed_url_pattern);

                //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);

            }
        } catch(Exception e) {
            e.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
        }



        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /** Work out the 'domain' for a given url.
     * This retains any www. or subdomain prefix.
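     * For example (illustrative input, not from the codebase):
     *   getDomainForURL("https://mi.example.org/rangi/page.html", true) returns "https://mi.example.org"
     *   getDomainForURL("https://mi.example.org/rangi/page.html", false) returns "mi.example.org"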
     */
    private String getDomainForURL(String url, boolean withProtocol) {
        int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
        // then keep the protocol portion around in case param withProtocol=true
        String protocol = url.substring(0, startIndex); // empty string when there was no protocol

        String domain = url.substring(startIndex);
        int endIndex = domain.indexOf("/");
        if(endIndex == -1) endIndex = domain.length();
        domain = domain.substring(0, endIndex);

        if(withProtocol) {
            // now that we have the domain (everything up to the first / after any protocol)
            // we can glue the protocol back on
            domain = protocol + domain;
        }

        return domain;
    }

    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
    private String escapeStringForRegex(String str) {
        for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
            char c = ESCAPE_CHARS_FOR_RE.charAt(i);
            str = str.replace(Character.toString(c), "\\"+c);
        }
        return str;
    }
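    // Illustrative behaviour of escapeStringForRegex (hypothetical inputs, shown only for clarity):
    //   escapeStringForRegex("nutch.apache.org") returns "nutch\.apache\.org"
    //   escapeStringForRegex("example.com/page?id=1") returns "example\.com/page\?id=1"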

    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
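     *
     * For example (illustrative url), a keepURLs.txt line of https://mi.example.org/page.html
     * contributes that url to seedURLs.txt, its domain mi.example.org to the domains file, and
     * a filter line of the form +^https?://([a-z0-9-]+\.)*mi\.example\.org/ to regex-urlfilter.txt.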
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                                    File domainURLsFile, File topSiteMatchesFile) {
        // Maintain a Map of unique domains mapped to seed urls at that domain
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
        // Would be a similar distinction for Maps.
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String PROTOCOL_REGEX_PREFIX = "+^https?://";
        final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        try (
             BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
             ) {

            // read a URL at a time from urlsFile
            String url = null;
            String domainWithProtocol = null;
            while((url = reader.readLine()) != null) { // readLine removes newline separator

                // work out domain. This retains any www. or subdomain prefix
                // passing true to further also retain the http(s) protocol
                domainWithProtocol = getDomainForURL(url, true);

                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domainWithProtocol, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domainWithProtocol);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";

        // write out each domain followed in sequence by all urls we found in that domain
        // (urls with tab up front)
        try (
             // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
             // Also a global file listing any urls that matched top sites that didn't specify
             // allowed regex patterns
             BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
             BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
             BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
             ) {

            // initialise topSiteMatchesFile with some instructional text.
            topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n");
            topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
            topSiteMatchesWriter.write("Specify one for each domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");

            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            /*
            // DEBUG
            String value = topSitesMap.get("wikipedia.org");
            if(value == null) {
                System.err.println("### wikipedia.org had null value");
            } else {
                System.err.println("### wikipedia.org had value: " + value);
            } // DEBUG
            */

            while(domainIterator.hasNext()) {
                String domainWithProtocol = domainIterator.next();
                // Also get domain without protocol prefix
                int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                String domain = domainWithProtocol.substring(startIndex);

                /*if(domain.contains("docs.google.com")) {
                    System.err.println("domain with protocol: " + domainWithProtocol);
                    System.err.println("domain: " + domain);
                }*/

                String allowedURLPatternRegex = isURLinTopSitesMap(domain);
                // If the domain is of a topsite for which no allowed URL pattern has been provided
                // in sites-too-big-to-exhaustively-crawl.txt,
                // then we don't know how to crawl the site. Warn the user by writing the affected
                // domain and seedURLs to the topSiteMatchesFile.
                if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {

                    // topsite, but we don't (yet) know what portion can be crawled
                    // Append the top site and url to a global/toplevel file that
                    // the user needs to check later and we're done with this domain as it
                    // won't go into any other file hereafter

                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    for(String url : urlsForDomainSet) {
                        topSiteMatchesWriter.write("\t" + url + "\n");
                    }

                    continue; // done with this domain
                }

                // start counting the domains we're actually going to process
                domainCount++;

                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                //seedURLsWriter.write(domainWithProtocol + "\n");


                // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {

                    // write all sorted unique domains into the global domains file
                    // Using the domain without protocol since the global domains file is for
                    // informational purposes
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into single global seedurls file
                    // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
                    // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.

                    // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
                    // also write into the global seeds file (with a tab prefixed to each?)
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    for(String url : urlsForDomainSet) {
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write(url + "\n");
                    }


                    if(allowedURLPatternRegex == null) { // entire site can be crawled
                        siteURLsWriter.write(domainWithProtocol + "\n");

                        // Write out filter in the following form for a site, e.g. for nutch.apache.org:
                        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                        String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
                        //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                        urlFilterWriter.write(regexed_domain + "\n"); //global file
                        siteRegexWriter.write(regexed_domain + "\n"); // site file
                    }
                    else { // domain belongs to a top site where only portion of site can be crawled

                        if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                            // and not for all of blogspot.com

                            String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
                            //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");

                        } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                            // don't write out domain. We want individual pages
                            //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

                            // don't write out domain as a regex expression url filter either,
                            // write out the individual seed urls for the domain instead
                            // since we will only be downloading the single page

                            urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                            for(String urlInDomain : urlsForDomainSet) {
                                // don't append slash to end this time
                                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
                                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                                urlFilterWriter.write(regexed_url + "\n");
                                siteRegexWriter.write(regexed_url + "\n");
                            }
                        } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {

                            // DON'T write out domain into siteURLs file,
                            // BUT DO write it into urlFilter file
                            String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";

                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");
                        } else { // allowedURLPatternRegex is a url-form - convert to regex
                            if(!allowedURLPatternRegex.endsWith("/")) {
                                allowedURLPatternRegex += "/";
                            }
                            String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
                            //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            urlFilterWriter.write(regexed_pattern + "\n");
                            siteRegexWriter.write(regexed_pattern + "\n");

                        }
                    }

                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
                }

            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to one of: ");
            System.err.println("\t" + seedURLsFile);
            System.err.println("\t" + urlFilterFile);
            System.err.println("\t" + domainURLsFile);
            System.err.println("\t" + topSiteMatchesFile);
        }

        /*
        // BEGIN DEBUG
        System.err.println("@@@@ TopSitesMap contains: ");
        for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
            String topSite = entry.getKey();
            String urlPattern = entry.getValue();
            System.err.println(topSite + " - " + urlPattern);
        } // END DEBUG
        */
    }

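    // Strips the leading label off a dotted hostname,
    // e.g. (illustrative) stripSubDomain("mi.wikipedia.org") returns "wikipedia.org".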
    private String stripSubDomain(String url) {
        int index = url.indexOf(".");
        if(index != -1) {
            url = url.substring(index+1);
        }
        return url;
    }


    /**
     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
     * with or without the www. prefix. This method tests for such a case, as it would be dangerous
     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
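     * For example (illustrative): isExactDomainMatch("www.blogspot.com", "blogspot.com") returns
     * true, whereas isExactDomainMatch("pinky.blogspot.com", "blogspot.com") returns false.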
     */
    private boolean isExactDomainMatch(String seedURLDomain, String domain) {
        // check for an exact match as-is
        if(seedURLDomain.equals(domain)) {
            return true;
        }

        // else check if with or without a www. prefix we have an exact match with domain
        if(seedURLDomain.startsWith("www.")) {
            if(seedURLDomain.substring(4).equals(domain)) {
                return true;
            }
        } else {
            if(domain.equals("www."+seedURLDomain)) {
                return true;
            }
        }

        return false;
    }


    /**
     * Check if the domain of the seedurl, either in its entirety or when stripped of
     * www/subdomains, is in the list of top sites.
     * The caller uses the returned value to decide how (or whether) the matched topsite
     * should be crawled and which entries to write to the url regex filter file.
     * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
     * @return one of the following values:
     * - This function returns null if the seedURL's domain does not match any of the topsites.
     * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
     * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE",
     * "FOLLOW-LINKS-WITHIN-TOPSITE" or an allowed URL pattern.)
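     *
     * For example (hypothetical map entry): if topSitesMap maps "wikipedia.org" to
     * "mi.wikipedia.org", then a fullSeedDomain of "mi.wikipedia.org" is looked up as-is first,
     * then stripped to "wikipedia.org", where it matches and the value "mi.wikipedia.org" is returned.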
     */
    private String isURLinTopSitesMap(String fullSeedDomain) {
        boolean keepLooping = true;

        String domain = fullSeedDomain;

        // the domain parameter will have retained www or subdomains, but is stripped of protocol

        // keep looping, stripping subdomains from url and checking if it matches a topsite domain
        // if it does, return the value for that topsite domain in the topSitesMap
        // If no match at all, return null.
        do {

            String allowed_url_pattern = topSitesMap.get(domain);
            if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
                // there's an entry for the URL in the topSitesMap
                System.err.println("##### A top site matches URL domain " + domain);

                // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
                // www prefix, should not exactly match the topSitesMap domain
                // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
                // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.

                if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
                    return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
                }
                return allowed_url_pattern;
            }
            // else, no entry for the URL in the topSitesMap
            // We're not done yet: strip subDomain from URL and check it against topSitesMap again

            String newDomain = stripSubDomain(domain);
            if(domain.equals(newDomain)) {
                keepLooping = false;
            } else {
                domain = newDomain;
            }
        } while(keepLooping);

        // url in entirety or stripped of subdomains did not match any of the topsites
        return null;
    }

    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        //while(i.hasNext()) {
        //  Map.Entry<String, Integer> entry = i.next();
        for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is eventually found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * A greylist mention eventually takes precedence over a mention of the same URL pattern in the blacklist.
     * It is in turn pre-empted by the whitelist if the pattern is mentioned there as well.
     */
    public boolean isGreylisted(String url) {
        // auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Checks the URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
     * whether it is in the mentioned black|grey|white list.
     * Filters aren't actual regexes; only ^ and $ are treated specially, as start and end anchors.
     * By not having this method deal with actual regexes for filters, we have the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
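     *
     * For example (illustrative filter lines): "^https://example.org" is stored with
     * LIST_ENTRY_STARTSWITH, "translate.example.org$" with LIST_ENTRY_ENDSWITH,
     * "^https://example.org/page.html$" with LIST_ENTRY_MATCHES, and a bare "translate"
     * with LIST_ENTRY_CONTAINS.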
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
             BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
             ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {

        // Will list all the warc.wet files in the input directory or else their gzipped versions
        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());

        int wetRecordCount = 0;
        int wetFileCount = 0;

        for(int i = 0; i < WETFiles.length; i++) {
            File WETFile = WETFiles[i];
            logger.debug("Processing WETfile: " + WETFile);

            // Any .gz file listed means it hasn't been unzipped yet. So unzip.
            String WETFilename = WETFile.toString();
            if(WETFilename.endsWith(".gz")) {
                File GZippedWETFile = WETFile;
                String WETGZippedFilename = WETFilename;
                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                WETFile = new File(WETFilename);
                Utility.unzipFile(GZippedWETFile, WETFile);
            }
            // hereafter all WETFiles should refer to the unzipped version
            // Check the unzipped WETFile exists

            if(!WETFile.exists() || !WETFile.isFile()) {
                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                return;
            }

            // Finally, we can process this WETFile's records into the keep and discard pile
            wetFileCount++;
            logger.debug("Off to process " + WETFile);
            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
            wetFileProcessor.processWETFile();
            wetRecordCount += wetFileProcessor.getRecordCount();
        }

        // for information purposes
        this.setWETFileCount(wetFileCount);
        this.setRecordCount(wetRecordCount);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count gzipped version if unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            File f = new File (dir, name);
            if(f.isDirectory()) {
                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
                    return true;
                }
            }
            else {
                System.err.println("File " + f + " is not a directory");
            }
            return false;
        }
    }

    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }

        File commoncrawlDir = new File(args[0]);
        if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);

            File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

            for(int i = 0; i < ccrawlFolders.length; i++) {
                File ccrawlFolder = ccrawlFolders[i];
                System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
                ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
            }

            // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
            // The former is the only unique one. seedURLs and regex-urlfilters are
            // repeated on a per site/domain basis too, stored in the sites folder
            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
            File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");

            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

            System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");


        } catch(Exception e) {
            // can get an exception when instantiating CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}