source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33560

Last change on this file since 33560 was 33560, checked in by ak19, 5 years ago
  1. Incorporated Dr Bainbridge's suggested improvements: only when there is a subdomain to a seed URL's domain should SUBDOMAIN-COPY be active, otherwise it should be deactivated on topsites match. For example if seedURL's domain is pinky.blogspot.com, then SUBDOMAIN-COPY can crawl that site as it's not all of blogspot. But if the seedURL domain was blogspot.com it would still match the topsite blogspot.com for which SUBDOMAIN-COPY is the value, but the value should be overridden so as not to crawl the site. 2. More complete regex escaping for the regex-urlfilter.txt file. 3. domainToURLs map now contains the domain WITH protocol prefix, which required adjustments to be made in the rest of the code. 4. Together with the changes to the blacklist, whitelist and topsites file (sites-too-big-to-exhaustively crawl file), I think the code is dealing with all the known wanted urls among the topsites now and generating the correct output for the seedURLs and regex-urlfilter file.
File size: 35.7 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeMap;
12import java.util.TreeSet;
13
14import org.apache.log4j.Logger;
15
16/**
17 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
18 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep or discard or greyListed folder, and its url written into a keep, discard
 * or greylisted text file, based on
21 *
22 * 1. whether it's whitelisted, else greylisted else blacklisted
23 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
24 * enough content. Formerly, content-length and number of lines were used to determine if
25 * the content was sufficient. Now it's just word count and number of MAX characters
26 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
27 * in conf/config.properties.
28 *
29 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
30 * into the conf folder to control any url patterns that are explicitly included or excluded or
31 * set aside for inspecting later. These filter text files don't use regexes, instead their
32 * format is:
33 * - precede URL by ^ to blacklist urls that match the given prefix
34 * - succeed URL by $ to blacklist urls that match the given suffix
35 * - ^url$ will blacklist urls that match the given url completely
36 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
37 *
38 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
39 * over greylisting and which takes precedence over blacklisting in turn. However, even
40 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
41 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
42 * also for nutch.
43 *
44 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
45 * in the given input folder. Then use a single instance of the WETProcessor class to process
46 * each single unzipped warc.wet file.
47 *
48 * To compile, including the jars in lib/ for compiling.
49 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
50 *
51 * To run, passing the log4j and other properties files in conf/ folder:
52 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
53 *
54 * e.g.
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
56 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
57 *
58*/
59
60public class CCWETProcessor {
61 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
62
63 // Properties shared across WETProcessor instances
64 public final int MAX_WORD_LENGTH;
65 public final int MIN_NUM_WORDS;
66 public final int MAX_WORDS_CAMELCASE;
67
68 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
69 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
70 public final String SINGLEPAGE = "SINGLEPAGE";
71
72 /**
73 * Characters that need escaping if used as a string literal in a regex
74 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
75 * https://www.regular-expressions.info/refcharacters.html
76 */
77 //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];
    // put the \\ at the start so we don't re-escape the backslashes inserted for chars escaped earlier
79 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
80
81 private Properties configProperties = new Properties();
82
83 // File paths shared across WETProcessor instances
84 public final File commoncrawlDir;
85 public final File outputFolder;
86 public final File discardFolder;
87 public final File keepFolder;
88 public final File greyListedFolder;
89 public final File keepURLsFile;
90 public final File discardURLsFile;
91 public final File greyListedFile;
92
93 /** Possible values stored in the blackList/whiteList/greyList Maps */
94 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
95 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
96 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
97 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
98
99 /**
100 * Store url patterns as keys and values indicated whether a url should
101 * match it exactly, start/end with it, or contain it
102 */
103 private HashMap<String, Integer> blackList;
104 private HashMap<String, Integer> greyList;
105 private HashMap<String, Integer> whiteList;
106
107 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
108 * with optional regex defining allowed exceptions, like subdomains or url suffixes
109 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
110 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
111 * would be relevant.
112 * The map would store top site domain suffix and an optional regex string for allowable
113 * url patterns.
114 */
115 private HashMap<String, String> topSitesMap;
116
117 /** Map of domains we keep and the full urls we're keeping that are of that domain.
118 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
119 * while a HashMap has no notion of ordering, because we just need to store urls with
120 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
121 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
122 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
123 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
124 */
125 private Map<String, Set<String>> domainsToURLsMap;
126
127 // Keep a count of all the records that all WETProcessors instantiated
128 // by our main method combined have processed
129 private int totalRecordCount = 0;
130
131 private int wetFileCount = 0;
132
    /**
     * Sets up the shared state for one processing run over commoncrawl WET files.
     * Loads the word-count thresholds from conf/config.properties (falling back to the
     * defaults given below if the file is missing or empty), creates the discard/keep/
     * greylisted output subfolders, deletes any keepURLs.txt, discardURLs.txt and
     * greyListed.txt left over from a previous run, loads the black/grey/white URL
     * filter lists, and reads the topsites map from
     * conf/sites-too-big-to-exhaustively-crawl.txt.
     * @param inFolder folder containing the warc.wet(.gz) files of a commoncrawl
     * @param outFolder folder into which all output files and subfolders are written
     * @throws Exception if a stale keepURLs/discardURLs/greyListed file exists and
     *         cannot be deleted (proceeding would append to old results)
     */
    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        // (loaded off the classpath, so conf/ must be on the classpath when running)
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            // also catches the NullPointerException thrown when the resource is absent
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        // word-count thresholds used by WETProcessor instances; defaults in the 2nd arg
        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        // create the three output folders that WET records get sorted into
        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        // the three URL listing files must start empty for this run:
        // delete any previous versions, refusing to proceed if deletion fails
        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        // Create the map of topSites
        System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
        topSitesMap = new HashMap<String, String>();
        //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");

        // each non-comment line is either "domain" alone, or "domain<TAB>allowed-url-pattern"
        try (
            BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
        ) {

            String str = null;
            while((str = reader.readLine()) != null) {
                str = str.trim();
                if(str.equals("") || str.startsWith("#")) {
                    continue;
                }

                int tabindex = str.indexOf("\t");
                if(tabindex == -1) {
                    // no allowed-url-pattern for this topsite: empty string means
                    // "we don't know how to crawl this site" downstream
                    topSitesMap.put(str, "");
                } else {
                    String topsite = str.substring(0, tabindex).trim();
                    String allowed_url_pattern = str.substring(tabindex+1).trim();
                    topSitesMap.put(topsite, allowed_url_pattern);
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
        }

        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }
231
232 /** Work out the 'domain' for a given url.
233 * This retains any www. or subdomain prefix.
234 */
235 private String getDomainForURL(String url, boolean withProtocol) {
236 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
237 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
238 // the keep the URL around in case param withProtocol=true
239 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
240
241 String domain = url.substring(startIndex);
242 int endIndex = domain.indexOf("/");
243 if(endIndex == -1) endIndex = domain.length();
244 domain = domain.substring(0, endIndex);
245
246 if(withProtocol) {
247 // now that we have the domain (everything to the first / when there is no protocol)
248 // can glue the protocol back on
249 domain = protocol + domain;
250 }
251
252 return domain;
253 }
254
255 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
256 private String escapeStringForRegex(String str) {
257 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
258 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
259 str = str.replace(Character.toString(c), "\\"+c);
260 }
261 return str;
262 }
263
    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     * Additionally writes: a global listing of unique domains; per-site
     * sites/0000x/seedURLs.txt and sites/0000x/regex-urlfilter.txt files; and a file
     * listing seedURLs whose domains matched a topsite with no allowed-url-pattern
     * configured (these domains are excluded from all other output).
     * @param seedURLsFile global nutch seed URL list to write
     * @param urlFilterFile global nutch regex-urlfilter file to write
     * @param domainURLsFile file to write all unique kept domains (without protocol) into
     * @param topSiteMatchesFile file to write urls needing manual inspection into
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                                    File domainURLsFile, File topSiteMatchesFile) {
        // Maintain a Map of unique domains mapped to seed urls at that domain
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
        // Would be a similar distinction for Maps.
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String PROTOCOL_REGEX_PREFIX = "+^https?://";
        final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        // Phase 1: group every kept url under its domain (domain retains protocol)
        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
        ) {

            // read a URL at a time from urlsFile
            String url = null;
            String domainWithProtocol = null;
            while((url = reader.readLine()) != null) { // readLine removes newline separator

                // work out domain. This retains any www. or subdomain prefix
                // passing true to further also retain the http(s) protocol
                domainWithProtocol = getDomainForURL(url, true);

                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domainWithProtocol, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domainWithProtocol);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        // zero-padded 5-digit site folder names, e.g. sites/00001
        final String FORMATSTR = "%05d";

        // Phase 2: write out each domain followed in sequence by all urls we found
        // in that domain (urls with tab up front)
        try (
            // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
            // Also a global file listing any urls that matched top sites that didn't specify
            // allowed regex patterns
            BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
            BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
            BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
            BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
        ) {

            // initialise topSiteMatchesFile with some instructional text.
            topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
            topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
            topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");

            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            /*
            // DEBUG
            String value = topSitesMap.get("wikipedia.org");
            if(value == null) {
                System.err.println("### wikipedia.org had null value");
            } else {
                System.err.println("### wikipedia.org had value: " + value);
            } // DEBUG
            */

            while(domainIterator.hasNext()) {
                String domainWithProtocol = domainIterator.next();
                // strip the protocol off to get the bare domain for topsite matching
                int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                String domain = domainWithProtocol.substring(startIndex);

                System.err.println("domain with protocol: " + domainWithProtocol);
                System.err.println("domain: " + domain);

                // null = not a topsite; "" = topsite with no usable pattern;
                // otherwise SUBDOMAIN-COPY, SINGLEPAGE or an allowed url pattern
                String allowedURLPatternRegex = isURLinTopSitesMap(domain);
                // If the domain is of a topsite for which no allowed URL pattern has been provided
                // in sites-too-big-to-exhaustively-crawl.txt,
                // then we don't know how to crawl the site. Warn the user by writing the affected
                // domain and seedURLs to the topSiteMatchesFile.
                if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {

                    // topsite, but we don't (yet) know what portion can be crawled
                    // Append the top site and url to a global/toplevel file that
                    // the user needs to check later and we're done with this domain as it
                    // won't go into any other file hereafter

                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        topSiteMatchesWriter.write("\t" + url + "\n");
                    }

                    continue; // done with this domain
                }

                // start counting the domains we're actually going to process
                domainCount++;

                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                //seedURLsWriter.write(domainWithProtocol + "\n");


                // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                    BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                    BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                ) {

                    // write all sorted unique domains into global domains file
                    // Using the domain without protocol since the global domains file is for
                    // informational purposes
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into single global seedurls file
                    // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
                    // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.

                    if(allowedURLPatternRegex == null) { // entire site can be crawled
                        siteURLsWriter.write(domainWithProtocol + "\n");

                        // Write out filter in the following form for a site, e.g. for nutch.apache.org:
                        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                        String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
                        //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                        urlFilterWriter.write(regexed_domain + "\n"); //global file
                        siteRegexWriter.write(regexed_domain + "\n"); // site file
                    }
                    else { // domain belongs to a top site where only portion of site can be crawled

                        if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                            // and not for all of blogspot.com
                            // (note: no ([a-z0-9-]+\.)* subdomain wildcard prefix here)

                            String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
                            //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");

                        } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                            // don't write out domain. We want individual pages
                            //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

                            // don't write out domain as a regex expression url filter either,
                            // write out the individual seed urls for the domain instead
                            // since we will only be downloading the single page

                            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                            for(String urlInDomain : urlsForDomainSet) {
                                // don't append slash to end this time
                                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
                                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                                urlFilterWriter.write(regexed_url + "\n");
                                siteRegexWriter.write(regexed_url + "\n");
                            }
                        } else { // allowedURLPatternRegex is a url-form - convert to regex
                            if(!allowedURLPatternRegex.endsWith("/")) {
                                allowedURLPatternRegex += "/";
                            }
                            String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
                            //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            urlFilterWriter.write(regexed_pattern + "\n");
                            siteRegexWriter.write(regexed_pattern + "\n");

                        }
                    }

                    // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
                    // also write into the global seeds file (with a tab prefixed to each?)
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write(url + "\n");
                    }
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
                }

            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to one of: ");
            System.err.println("\t" + seedURLsFile);
            System.err.println("\t" + urlFilterFile);
            System.err.println("\t" + domainURLsFile);
            System.err.println("\t" + topSiteMatchesFile);
        }

        /*
        // BEGIN DEBUG
        System.err.println("@@@@ TopSitesMap contains: ");
        for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
            String topSite = entry.getKey();
            String urlPattern = entry.getValue();
            System.err.println(topSite + " - " + urlPattern);
        } // END DEBUG
        */
    }
501
502 private String stripSubDomain(String url) {
503 int index = url.indexOf(".");
504 if(index != -1) {
505 url = url.substring(index+1);
506 }
507 return url;
508 }
509
510
511 /**
512 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
513 * with or without www. prefix. This method tests for such as case as it would be dangerous
514 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
515 */
516 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
517 // check for an exact match as-is
518 if(seedURLDomain.equals(domain)) {
519 return true;
520 }
521
522 // else check if with or without a www. prefix we have an exact match with domain
523 if(seedURLDomain.startsWith("www.")) {
524 if(seedURLDomain.substring(4).equals(domain)) {
525 return true;
526 }
527 } else {
528 if(domain.equals("www."+seedURLDomain)) {
529 return true;
530 }
531 }
532
533 return false;
534 }
535
536
537 /**
538 * Check if the domain of the seedurl, either in its entirety or when stripped of
539 * www/subdomains, is in the list of top sites.
540 * If it is, and the given url matches the regex for that topsite, then add the url to the
541 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
542 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
543 * @return one of the following values:
544 * - This function returns null if the seedURL's domain does not match any of the topsites.
545 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
546 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
547 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
548 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
549 * - A non-emptry String is returned if the seedURL's domain matched a topsite and a value
550 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
551 * URL pattern.
552 */
553 private String isURLinTopSitesMap(String fullSeedDomain) {
554 boolean keepLooping = true;
555
556 String domain = fullSeedDomain;
557
558 // domain aprameter will have retained www or subdomains, but is stripped of protocol
559
560 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
561 // if it does, return the value for that topsite domain in the topSitesMap
562 // If no match at all, return null.
563 do {
564
565 String allowed_url_pattern = topSitesMap.get(domain);
566 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
567 // there's an entry for the URL in the topSitesMap
568 System.err.println("##### A top site matches URL domain " + domain);
569
570 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
571 // www prefix, should not exactly match the topSitesMap domain
572 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
573 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
574
575 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
576 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
577 }
578 return allowed_url_pattern;
579 }
580 // else, no entry for the URL in the topSitesMap
581 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
582
583 String newDomain = stripSubDomain(domain);
584 if(domain.equals(newDomain)) {
585 keepLooping = false;
586 } else {
587 domain = newDomain;
588 }
589 } while(keepLooping);
590
591 // url in entirety or stripped of subdomains did not match any of the topsites
592 return null;
593 }
594
595 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
596 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
597 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
598 //while(i.hasNext()) {
599 // Map.Entry<String, Integer> entry = i.next();
600 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
601 String urlPattern = entry.getKey();
602 Integer matchRule = entry.getValue();
603
604 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
605 return true;
606 }
607 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
608 return true;
609 }
610 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
611 return true;
612 }
613 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
614 return true;
615 }
616 // else check the rest of the filter list against this url
617 // before returning false to be certain it's not been listed in the filter list
618 }
619
620 return false;
621 }
622
    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if eventually the same url pattern is found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here — this method
     * only consults the blacklist; callers implement the precedence.
     * @param url the url to test against the blacklist patterns
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }
631
    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
     * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
     * (Precedence itself is implemented by callers, not here.)
     * @param url the url to test against the greylist patterns
     */
    public boolean isGreylisted(String url) {
        // auto-translated product sites
        return isListedInFilterList(greyList, url);
    }
641
    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist
     * (that override is implemented by callers, not here).
     * @param url the url to test against the whitelist patterns
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }
649
    /**
     * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
     * whether it is in the mentioned black|grey|white list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
     * @param list the (initially empty) map to populate with filter pattern -> LIST_ENTRY_* rule
     * @param filterListFilename name of the filter file, looked up on the classpath (conf folder)
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
            BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
        ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                // map the ^/$ terminators onto a match rule, stripping them off
                // so the stored key is the bare pattern text
                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    // NOTE(review): debug output only for this branch — presumably a leftover; confirm
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }
701
    /** Maintain a count of all WET files processed.
     * @param count the running total of WET files processed so far */
    public void setWETFileCount(int count) { this.wetFileCount = count; }
704
705 /** Maintain a count of all WET records processed. */
706 //public int getRecordCount() { return this.totalRecordCount; }
707 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
708 public void setRecordCount(int count) { this.totalRecordCount = count; }
709
710 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
711
712 // Will list all the warc.wet files in the input directory or else their gzipped versions
713 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
714
715 int wetRecordCount = 0;
716 int wetFileCount = 0;
717
718 for(int i = 0; i < WETFiles.length; i++) {
719 File WETFile = WETFiles[i];
720 logger.debug("Processing WETfile: " + WETFile);
721
722 // Any .gz files listed means they haven't been unzipped yet. So unzip.
723 String WETFilename = WETFile.toString();
724 if(WETFilename.endsWith(".gz")) {
725 File GZippedWETFile = WETFile;
726 String WETGZippedFilename = WETFilename;
727 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
728
729 WETFile = new File(WETFilename);
730 Utility.unzipFile(GZippedWETFile, WETFile);
731 }
732 // hereafter all WETFiles should refer to the unzipped version
733 // Check the unzipped WETFile exists
734
735 if(!WETFile.exists() || !WETFile.isFile()) {
736 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
737 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
738 return;
739 }
740
741 // Finally, we can process this WETFile's records into the keep and discard pile
742 wetFileCount++;
743 logger.debug("Off to process " + WETFile);
744 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
745 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
746 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
747 wetFileProcessor.processWETFile();
748 wetRecordCount += wetFileProcessor.getRecordCount();
749 }
750
751 // for information purposes
752 this.setWETFileCount(wetFileCount);
753 this.setRecordCount(wetRecordCount);
754 }
755
756
    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
758 public static void printUsage() {
759 System.err.println("Run this program as:");
760 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
761 }
762
763 /** Filename filter to only list warc.wet files or else warc.wet.gz files
764 * for which unzipped warc.wet equivalents don't yet exist.
765 */
766 private static class WETFilenameFilter implements FilenameFilter {
767
768 public boolean accept(File dir, String name) {
769 if(name.endsWith(".warc.wet")) {
770 logger.debug("Will include " + name + " for processing.");
771 return true;
772 }
773
774 if(name.endsWith(".warc.wet.gz")) {
775 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
776 File unzippedVersion = new File(dir, nameWithoutGZext);
777 if(unzippedVersion.exists()) {
778 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
779 logger.debug("Skipping " + name);
780 return false; // don't count gzipped version if unzipped version exists.
781 }
782 else {
783 logger.debug("Only zipped version " + name + " exists.");
784 return true; // No unzipped version, so have to work with gzipped version
785 }
786 }
787
788 // we're not even interested in any other file extensions
789 logger.debug("Not a WET file. Skipping " + name);
790 return false;
791 }
792 }
793
794
795 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
796
797 public boolean accept(File dir, String name) {
798 File f = new File (dir, name);
799 if(f.isDirectory()) {
800 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
801 return true;
802 }
803 }
804 else {
805 System.err.println("File " + f + " is not a directory");
806 }
807 return false;
808 }
809 }
810
811 public static void main(String[] args) {
812 if(args.length != 2) {
813 printUsage();
814 return;
815 }
816
817 File commoncrawlDir = new File(args[0]);
818 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
819 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
820 return;
821 }
822
823 File outFolder = new File(args[1]);
824 if(!outFolder.exists() || !outFolder.isDirectory()) {
825 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
826 return;
827 }
828
829 try {
830 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
831
832 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
833
834 for(int i = 0; i < ccrawlFolders.length; i++) {
835 File ccrawlFolder = ccrawlFolders[i];
836 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
837 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
838 }
839
840
841 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
842 // The former is the only unique one. seedURLs and regex-urlfilters are
843 // repeated on a per site/domain basis too, stored in the sites folder
844 File seedURLsFile = new File(outFolder, "seedURLs.txt");
845 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
846 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
847 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
848
849 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
850
851 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
852
853 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
854
855
856 } catch(Exception e) {
857 // can get an exception when instantiating CCWETProcessor instance
858 e.printStackTrace();
859 System.err.println(e.getMessage());
860 }
861
862 return;
863
864 }
865}
Note: See TracBrowser for help on using the repository browser.