source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33561

Last change on this file since 33561 was 33561, checked in by ak19, 5 years ago
  1. sites-too-big-to-exhaustively-crawl.txt is now a comma-separated list.
  2. After the discussion with Dr Bainbridge that SINGLEPAGE is not what we want for docs.google.com, I found that the tentative switch to SUBDOMAIN-COPY for docs.google.com will not work, precisely because of the important change we had to make yesterday: with SUBDOMAIN-COPY, only SUBdomains are copied, not root domains. If a root domain is given SUBDOMAIN-COPY, the seedURL gets written out to unprocessed-topsite-matches.txt and its site doesn't get crawled.
  3. This revealed a lacuna in the possible list of values in sites-too-big-to-exhaustively-crawl.txt, and I had to invent a new value, which I introduce and have tested with this commit: FOLLOW_LINKS_WITHIN_TOPSITE. This value so far applies only to docs.google.com and will keep following any links originating in a seedURL on docs.google.com, but only as long as they stay within that topsite domain (docs.google.com).
  4. Tidied some old-fashioned use of Iterator, replaced with the newer style of for loop that works with types. Committing before updating the code to use the Apache CSV API.
File size: 36.2 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeMap;
12import java.util.TreeSet;
13
14import org.apache.log4j.Logger;
15
16/**
17 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
18 * the WET records in each, putting each WET record into a file. Each file is put into a
19 * keep, discard or greylisted folder, and its url is written into a keep, discard
20 * or greylisted text file, based on
21 *
22 * 1. whether it's whitelisted, else greylisted else blacklisted
23 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
24 * enough content. Formerly, content-length and number of lines were used to determine if
25 * the content was sufficient. Now it's just the word count that matters, where a string
26 * only counts as a word if it is within a MAXIMUM (not a minimum) number of characters. These settings can be adjusted
27 * in conf/config.properties.
28 *
29 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
30 * into the conf folder to control any url patterns that are explicitly included or excluded or
31 * set aside for inspecting later. These filter text files don't use regexes, instead their
32 * format is:
33 * - precede the URL with ^ to match urls that start with the given prefix
34 * - follow the URL with $ to match urls that end with the given suffix
35 * - ^url$ will match the given url exactly
36 * - without either the ^ or $ symbol, urls containing the given string will match
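 * For example, a url-blacklist-filter.txt might contain lines like the following
 * (hypothetical entries, purely for illustration; the real conf files may differ):
 *   ^https://example.com/ads/       excludes urls starting with this prefix
 *   .pdf$                           excludes urls ending with this suffix
 *   ^https://example.com/spam.html$ excludes exactly this url
 *   translate.example.com           excludes any url containing this string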
37 *
38 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
39 * over greylisting, which in turn takes precedence over blacklisting. However, even
40 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
41 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
42 * also used for nutch.
43 *
44 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
45 * in the given input folder. Then use a single instance of the WETProcessor class to process
46 * each single unzipped warc.wet file.
47 *
48 * To compile, include the jars in lib/ on the classpath:
49 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
50 *
51 * To run, passing the log4j and other properties files in conf/ folder:
52 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
53 *
54 * e.g.
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
56 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
57 *
58*/
59
60public class CCWETProcessor {
61 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
62
63 // Properties shared across WETProcessor instances
64 public final int MAX_WORD_LENGTH;
65 public final int MIN_NUM_WORDS;
66 public final int MAX_WORDS_CAMELCASE;
67
68 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
69 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
70 public final String SINGLEPAGE = "SINGLEPAGE";
71 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
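 // Illustrative sketch (not copied from the actual conf file, which may differ): each line of
 // sites-too-big-to-exhaustively-crawl.txt pairs a topsite domain with an optional second
 // comma-separated value, either one of the constants above or an allowed url pattern, e.g.
 //    docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE
 //    blogspot.com,SUBDOMAIN-COPY
 // A topsite listed without a second value means no allowed url pattern has been specified for it.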
72
73 /**
74 * Characters that need escaping if used as a string literal in a regex
75 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
76 * https://www.regular-expressions.info/refcharacters.html
77 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
78 * as would happen for any chars appearing earlier in this list than \\
79 */
80 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
81 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
82
83 private Properties configProperties = new Properties();
84
85 // File paths shared across WETProcessor instances
86 public final File commoncrawlDir;
87 public final File outputFolder;
88 public final File discardFolder;
89 public final File keepFolder;
90 public final File greyListedFolder;
91 public final File keepURLsFile;
92 public final File discardURLsFile;
93 public final File greyListedFile;
94
95 /** Possible values stored in the blackList/whiteList/greyList Maps */
96 private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
97 private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
98 private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
99 private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
100
101 /**
102 * Store url patterns as keys, with values indicating whether a url should
103 * match it exactly, start/end with it, or contain it
104 */
105 private HashMap<String, Integer> blackList;
106 private HashMap<String, Integer> greyList;
107 private HashMap<String, Integer> whiteList;
108
109 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
110 * with optional regex defining allowed exceptions, like subdomains or url suffixes
111 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
112 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
113 * would be relevant.
114 * The map would store top site domain suffix and an optional regex string for allowable
115 * url patterns.
116 */
117 private HashMap<String, String> topSitesMap;
118
119 /** Map of domains we keep and the full urls we're keeping that are of that domain.
120 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
121 * while a HashMap has no notion of ordering, because we just need to store urls with
122 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
123 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
124 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
125 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
126 */
127 private Map<String, Set<String>> domainsToURLsMap;
128
129 // Keep a count of all the records processed, combined across all the WETProcessors
130 // instantiated by our main method
131 private int totalRecordCount = 0;
132
133 private int wetFileCount = 0;
134
135 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
136 this.commoncrawlDir = inFolder;
137 this.outputFolder = outFolder;
138
139 // load up the properties from the config file
140 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
141 configProperties = new Properties();
142 configProperties.load(infile);
143 //infile.close(); // not explicitly called in examples of try-with-resources
144
145 } catch(Exception e) {
146 System.err.println("Exception attempting to read properties from config.properties.");
147 logger.error("Exception attempting to read properties from config.properties.");
148 e.printStackTrace();
149 }
150
151 if(configProperties.size() == 0) {
152 System.err.println("*** Warning: no values read into config properties. Using defaults.");
153 }
154
155 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
156 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
157 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
158
159
160 this.discardFolder = new File(outFolder, "discard");
161 if(!discardFolder.exists()) {
162 discardFolder.mkdir();
163 }
164 this.keepFolder = new File(outFolder, "keep");
165 if(!keepFolder.exists()) {
166 keepFolder.mkdir();
167 }
168
169 this.greyListedFolder = new File(outFolder, "greylisted");
170 if(!greyListedFolder.exists()) {
171 greyListedFolder.mkdir();
172 }
173
174 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
175 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
176 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
177 }
178 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
179 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
180 throw new Exception ("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
181 }
182 this.greyListedFile = new File(outFolder, "greyListed.txt");
183 if(greyListedFile.exists() && !greyListedFile.delete()) {
184 throw new Exception ("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
185 }
186
187 // prepare our blacklist, greylist (for inspection) and whitelist
188 System.err.println("Loading blacklist.");
189 blackList = new HashMap<String, Integer>();
190 initURLFilterList(blackList, "url-blacklist-filter.txt");
191
192 System.err.println("Loading greylist.");
193 greyList = new HashMap<String, Integer>();
194 initURLFilterList(greyList, "url-greylist-filter.txt");
195
196 System.err.println("Loading whitelist.");
197 whiteList = new HashMap<String, Integer>();
198 initURLFilterList(whiteList, "url-whitelist-filter.txt");
199
200 // Create the map of topSites
201 System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
202 topSitesMap = new HashMap<String, String>();
203 //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
204
205 try (
206 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
207 ) {
208
209 String str = null;
210 while((str = reader.readLine()) != null) {
211 str = str.trim();
212 if(str.equals("") || str.startsWith("#")) {
213 continue;
214 }
215
216 // comma separated list of values
217 int splitindex = str.indexOf(",");
218 if(splitindex == -1) {
219 topSitesMap.put(str, "");
220 } else {
221 String topsite = str.substring(0, splitindex).trim();
222 String allowed_url_pattern = str.substring(splitindex+1).trim();
223 topSitesMap.put(topsite, allowed_url_pattern);
224 }
225 }
226 } catch (IOException ioe) {
227 ioe.printStackTrace();
228 System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
229 }
230
231 //System.err.println("Prematurely terminating for testing purposes.");
232 //System.exit(-1);
233 }
234
235 /** Work out the 'domain' for a given url.
236 * This retains any www. or subdomain prefix.
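 * For example (illustrative): given "https://www.example.com/path/index.html", this returns
 * "https://www.example.com" when withProtocol is true, and "www.example.com" when it is false.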
237 */
238 private String getDomainForURL(String url, boolean withProtocol) {
239 int startIndex = url.indexOf("//"); // for http:// or https:// prefix
240 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
241 // keep the protocol portion around in case param withProtocol=true
242 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
243
244 String domain = url.substring(startIndex);
245 int endIndex = domain.indexOf("/");
246 if(endIndex == -1) endIndex = domain.length();
247 domain = domain.substring(0, endIndex);
248
249 if(withProtocol) {
250 // now that we have the domain (everything to the first / when there is no protocol)
251 // can glue the protocol back on
252 domain = protocol + domain;
253 }
254
255 return domain;
256 }
257
258 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
259 private String escapeStringForRegex(String str) {
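	// For example (illustrative): "nutch.apache.org" becomes "nutch\.apache\.org",
	// so the dots are treated as literal characters rather than regex wildcards.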
260 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
261 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
262 str = str.replace(Character.toString(c), "\\"+c);
263 }
264 return str;
265 }
266
267 /**
268 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
269 * as output the URL seed list and regex-urlfilter text files required by nutch, see
270 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
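 * A sketch of the expected output layout (the actual file names are passed in by main()):
 *   seedURLs.txt                    - global list of seed urls across all domains
 *   regex-urlfilter.txt             - global list of regex url filters
 *   all-domain-urls.txt             - all unique domains, one per line
 *   unprocessed-topsite-matches.txt - seedURLs on topsites that still need manual attention
 *   sites/00001/seedURLs.txt and sites/00001/regex-urlfilter.txt - per-site equivalents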
271 */
272 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
273 File domainURLsFile, File topSiteMatchesFile) {
274 // Maintain a Map of unique domains mapped to seed urls at that domain
275 // TreeSet: by default, "the elements are ordered using their natural ordering"
276 // (or by a Comparator provided at set creation time).
277 // Whereas HashSet doesn't guarantee ordering.
278 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
279 // Would be a similar distinction for Maps.
280 domainsToURLsMap = new TreeMap<String, Set<String>>();
281
282 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
283 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
284
285 try (
286 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
287 ) {
288
289 // read a URL at a time from urlsFile
290 String url = null;
291 String domainWithProtocol = null;
292 while((url = reader.readLine()) != null) { // readLine removes newline separator
293
294 // work out domain. This retains any www. or subdomain prefix
295 // passing true to also retain the http(s) protocol
296 domainWithProtocol = getDomainForURL(url, true);
297
298 Set<String> urlsSet;
299 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
300 urlsSet = new TreeSet<String>();
301 urlsSet.add(url);
302 domainsToURLsMap.put(domainWithProtocol, urlsSet);
303 } else {
304 urlsSet = domainsToURLsMap.get(domainWithProtocol);
305 urlsSet.add(url);
306 }
307
308 }
309 } catch (IOException ioe) {
310 ioe.printStackTrace();
311 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
312 }
313
314 // We'd have pruned out duplicates by now and have a sorted list of domains,
315 // each of which maps to seed URLs in the commoncrawl for that domain
316
317 int domainCount = 0;
318 File sitesFolder = new File(outputFolder, "sites");
319 if(!sitesFolder.exists()) {
320 sitesFolder.mkdir();
321 }
322 final String FORMATSTR = "%05d";
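	// e.g. (illustrative) domainCount 1 gives siteID "00001", so that domain's files go into sites/00001/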
323
324 // write out each domain followed in sequence by all urls we found in that domain
325 // (urls with tab up front)
326 try (
327 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
328 // Also a global file listing any urls that matched top sites that didn't specify
329 // allowed regex patterns
330 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
331 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
332 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
333 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
334 ) {
335
336 // initialise topSiteMatchesFile with some instructional text.
337 topSiteMatchesWriter.write("The following domains with seedURLs are on major/top-500 sites\n");
338 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
339 topSiteMatchesWriter.write("Specify one for each such domain in the comma-separated sites-too-big-to-exhaustively-crawl.txt file\n");
340
341 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
342 Set<String> domainsSet = domainsToURLsMap.keySet();
343 Iterator<String> domainIterator = domainsSet.iterator();
344
345 /*
346 // DEBUG
347 String value = topSitesMap.get("wikipedia.org");
348 if(value == null) {
349 System.err.println("### wikipedia.org had null value");
350 } else {
351 System.err.println("### wikipedia.org had value: " + value);
352 } // DEBUG
353 */
354
355 while(domainIterator.hasNext()) {
356 String domainWithProtocol = domainIterator.next();
357 // Also get domain without protocol prefix
358 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
359 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
360 String domain = domainWithProtocol.substring(startIndex);
361
362 /*if(domain.contains("docs.google.com")) {
363 System.err.println("domain with protocol: " + domainWithProtocol);
364 System.err.println("domain: " + domain);
365 }*/
366
367 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
368 // If the domain is of a topsite for which no allowed URL pattern has been provided
369 // in sites-too-big-to-exhaustively-crawl.txt,
370 // then we don't know how to crawl the site. Warn the user by writing the affected
371 // domain and seedURLs to the topSiteMatchesFile.
372 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
373
374 // topsite, but we don't (yet) know what portion can be crawled
375 // Append the top site and url to a global/toplevel file that
376 // the user needs to check later and we're done with this domain as it
377 // won't go into any other file hereafter
378
379 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
380 for(String url : urlsForDomainSet) {
381 topSiteMatchesWriter.write("\t" + url + "\n");
382 }
383
384 continue; // done with this domain
385 }
386
387 // start counting the domains we're actually going to process
388 domainCount++;
389
390 String siteID = String.format(FORMATSTR, domainCount);
391 File domainFolder = new File(sitesFolder, siteID);
392 domainFolder.mkdir();
393
394 // write out the domain
395 //seedURLsWriter.write(domainWithProtocol + "\n");
396
397
398 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
399 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
400 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
401 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
402 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
403 try (
404 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
405 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
406 ) {
407
408 // write all sorted unique domains into global domains file
409 // Using the domain without protocol since the global domains file is for
410 // informational purposes
411 domainURLsWriter.write(domain + "\n");
412
413 // Only write urls and no domain into single global seedurls file
414 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
415 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
416 // If we ever run nutch on a single seedURLs listing containing
417 // all seed pages to crawl sites from, the above two files will work for that.
418
419 if(allowedURLPatternRegex == null) { // entire site can be crawled
420 siteURLsWriter.write(domainWithProtocol + "\n");
421
422 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
423 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
424 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
425 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
426 urlFilterWriter.write(regexed_domain + "\n"); //global file
427 siteRegexWriter.write(regexed_domain + "\n"); // site file
428 }
429 else { // domain belongs to a top site where only portion of site can be crawled
430
431 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
432 siteURLsWriter.write(domainWithProtocol + "\n");
433 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
434 // and not for all of blogspot.com
435
436 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
437 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
438 urlFilterWriter.write(regexed_domain + "\n");
439 siteRegexWriter.write(regexed_domain + "\n");
440
441 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
442 // don't write out domain. We want individual pages
443 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
444
445 // don't write out domain as a regex expression url filter either,
446 // write out the individual seed urls for the domain instead
447 // since we will only be downloading the single page
448
449 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
450 for(String urlInDomain : urlsForDomainSet) {
451 // don't append slash to end this time
452 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
453 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
454 urlFilterWriter.write(regexed_url + "\n");
455 siteRegexWriter.write(regexed_url + "\n");
456 }
457 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
458
459 // DON'T write out domain into siteURLs file,
460 // BUT DO write it into urlFilter file
461 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
462
463 urlFilterWriter.write(regexed_domain + "\n");
464 siteRegexWriter.write(regexed_domain + "\n");
465 } else { // allowedURLPatternRegex is a url-form - convert to regex
466 if(!allowedURLPatternRegex.endsWith("/")) {
467 allowedURLPatternRegex += "/";
468 }
469 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
470 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
471 siteURLsWriter.write(domainWithProtocol + "\n");
472 urlFilterWriter.write(regexed_pattern + "\n");
473 siteRegexWriter.write(regexed_pattern + "\n");
474
475 }
476 }
477
478 // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
479 // also write into the global seeds file (with a tab prefixed to each?)
480 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
481 for(String url : urlsForDomainSet) {
482 seedURLsWriter.write(url + "\n"); // global seedURLs file
483 siteURLsWriter.write(url + "\n");
484 }
485
486 } catch (IOException ioe) {
487 ioe.printStackTrace();
488 System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
489 }
490
491 }
492
493 } catch (IOException ioe) {
494 ioe.printStackTrace();
495 System.err.println("\n@@@@@@@@@ Error writing to one of: ");
496 System.err.println("\t" + seedURLsFile);
497 System.err.println("\t" + urlFilterFile);
498 System.err.println("\t" + domainURLsFile);
499 System.err.println("\t" + topSiteMatchesFile);
500 }
501
502 /*
503 // BEGIN DEBUG
504 System.err.println("@@@@ TopSitesMap contains: ");
505 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
506 String topSite = entry.getKey();
507 String urlPattern = entry.getValue();
508 System.err.println(topSite + " - " + urlPattern);
509 } // END DEBUG
510 */
511 }
512
513 private String stripSubDomain(String url) {
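	// Strips the first domain component, e.g. (illustrative) "mi.wikipedia.org" -> "wikipedia.org".
	// If there is no '.' in the url, it is returned unchanged.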
514 int index = url.indexOf(".");
515 if(index != -1) {
516 url = url.substring(index+1);
517 }
518 return url;
519 }
520
521
522 /**
523 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
524 * with or without www. prefix. This method tests for such a case as it would be dangerous
525 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
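 * For example (illustrative): isExactDomainMatch("www.blogspot.com", "blogspot.com") and
 * isExactDomainMatch("blogspot.com", "blogspot.com") both return true, whereas
 * isExactDomainMatch("pinky.blogspot.com", "blogspot.com") returns false.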
526 */
527 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
528 // check for an exact match as-is
529 if(seedURLDomain.equals(domain)) {
530 return true;
531 }
532
533 // else check if with or without a www. prefix we have an exact match with domain
534 if(seedURLDomain.startsWith("www.")) {
535 if(seedURLDomain.substring(4).equals(domain)) {
536 return true;
537 }
538 } else {
539 if(domain.equals("www."+seedURLDomain)) {
540 return true;
541 }
542 }
543
544 return false;
545 }
546
547
548 /**
549 * Check if the domain of the seedurl, either in its entirety or when stripped of
550 * www/subdomains, is in the list of top sites.
551 * If it is, and the given url matches the regex for that topsite, then add the url to the
552 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
553 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
554 * @return one of the following values:
555 * - This function returns null if the seedURL's domain does not match any of the topsites.
556 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
557 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
558 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
559 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
560 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
561 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE",
562 * "FOLLOW-LINKS-WITHIN-TOPSITE" or an allowed URL pattern.)
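 * For example (illustrative, assuming the topSitesMap maps "wikipedia.org" to some allowed
 * url pattern): a fullSeedDomain of "mi.wikipedia.org" does not match directly, but after
 * the "mi." subdomain is stripped it matches "wikipedia.org" and that entry's value is returned.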
563 */
564 private String isURLinTopSitesMap(String fullSeedDomain) {
565 boolean keepLooping = true;
566
567 String domain = fullSeedDomain;
568
569 // the domain parameter will have retained www or subdomains, but is stripped of protocol
570
571 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
572 // if it does, return the value for that topsite domain in the topSitesMap
573 // If no match at all, return null.
574 do {
575
576 String allowed_url_pattern = topSitesMap.get(domain);
577 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
578 // there's an entry for the URL in the topSitesMap
579 System.err.println("##### A top site matches URL domain " + domain);
580
581 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
582 // www prefix, should not exactly match the topSitesMap domain
583 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
584 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
585
586 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
587 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
588 }
589 return allowed_url_pattern;
590 }
591 // else, no entry for the URL in the topSitesMap
592 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
593
594 String newDomain = stripSubDomain(domain);
595 if(domain.equals(newDomain)) {
596 keepLooping = false;
597 } else {
598 domain = newDomain;
599 }
600 } while(keepLooping);
601
602 // url in entirety or stripped of subdomains did not match any of the topsites
603 return null;
604 }
605
606 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
607 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
608 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
609 //while(i.hasNext()) {
610 // Map.Entry<String, Integer> entry = i.next();
611 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
612 String urlPattern = entry.getKey();
613 Integer matchRule = entry.getValue();
614
615 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
616 return true;
617 }
618 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
619 return true;
620 }
621 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
622 return true;
623 }
624 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
625 return true;
626 }
627 // else check the rest of the filter list against this url
628 // before returning false to be certain it's not been listed in the filter list
629 }
630
631 return false;
632 }
633
634 /**
635 * Returns true if the url or pattern is found in the blacklist file.
636 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
637 * it won't get blacklisted after all. But that's not implemented here.
638 */
639 public boolean isBlacklisted(String url) {
640 return isListedInFilterList(blackList, url);
641 }
642
643 /**
644 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
645 * A greylist mention will eventually take precedence over a blacklist mention of the same URL pattern,
646 * and will itself eventually be superseded if the pattern is also mentioned in the whitelist.
647 */
648 public boolean isGreylisted(String url) {
649 // auto-translated product sites
650 return isListedInFilterList(greyList, url);
651 }
652
653 /**
654 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
655 * A whitelist mention moreover overrides any mention in the blacklist and greylist.
656 */
657 public boolean isWhitelisted(String url) {
658 return isListedInFilterList(whiteList, url);
659 }
660
661 /**
662 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
663 * whether it is in the mentioned black|grey|white list.
664 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
665 * By not having this method deal with actual regex for filters, this has the advantage that
666 * we don't have to remember to escape or double escape each filter to turn it into a regex.
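 * For example (illustrative): a filter line of "^foo" is stored with the LIST_ENTRY_STARTSWITH rule,
 * "foo$" with LIST_ENTRY_ENDSWITH, "^foo$" with LIST_ENTRY_MATCHES and a bare "foo" with
 * LIST_ENTRY_CONTAINS, with the ^ and $ terminators stripped off before storing.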
667 */
668 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
669
670 // if filterListFilename does not exist in the conf folder, just return
671 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
672 System.err.println(filterListFilename + " does not exist");
673 return;
674 }
675
676 try (
677 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
678 ) {
679 String filter = null;
680 while((filter = reader.readLine()) != null) {
681 // skip comments and empty lines
682 filter = filter.trim();
683 if(filter.equals("") || filter.startsWith("#")) {
684 continue;
685 }
686
687 if(filter.startsWith("^") && filter.endsWith("$")) {
688 filter = filter.substring(1, filter.length()-1);
689 list.put(filter, LIST_ENTRY_MATCHES);
690 }
691 else if(filter.startsWith("^")) {
692 filter = filter.substring(1);
693 list.put(filter, LIST_ENTRY_STARTSWITH);
694 System.err.println("Match filter startswith: " + filter);
695 }
696 else if(filter.endsWith("$")) {
697 filter = filter.substring(0, filter.length()-1);
698 list.put(filter, LIST_ENTRY_ENDSWITH);
699 }
700 else {
701 list.put(filter, LIST_ENTRY_CONTAINS);
702 }
703 //System.err.println("Got filter: " + filter);
704 }
705
706 } catch (IOException ioe) {
707 ioe.printStackTrace();
708 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
709 }
710
711 }
712
713 /** Maintain a count of all WET files processed. */
714 public void setWETFileCount(int count) { this.wetFileCount = count; }
715
716 /** Maintain a count of all WET records processed. */
717 //public int getRecordCount() { return this.totalRecordCount; }
718 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
719 public void setRecordCount(int count) { this.totalRecordCount = count; }
720
721 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
722
723 // Will list all the warc.wet files in the input directory or else their gzipped versions
724 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
725
726 int wetRecordCount = 0;
727 int wetFileCount = 0;
728
729 for(int i = 0; i < WETFiles.length; i++) {
730 File WETFile = WETFiles[i];
731 logger.debug("Processing WETfile: " + WETFile);
732
733 // Any .gz files listed means they haven't been unzipped yet. So unzip.
734 String WETFilename = WETFile.toString();
735 if(WETFilename.endsWith(".gz")) {
736 File GZippedWETFile = WETFile;
737 String WETGZippedFilename = WETFilename;
738 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
739
740 WETFile = new File(WETFilename);
741 Utility.unzipFile(GZippedWETFile, WETFile);
742 }
743 // hereafter all WETFiles should refer to the unzipped version
744 // Check the unzipped WETFile exists
745
746 if(!WETFile.exists() || !WETFile.isFile()) {
747 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
748 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
749 return;
750 }
751
752 // Finally, we can process this WETFile's records into the keep and discard pile
753 wetFileCount++;
754 logger.debug("Off to process " + WETFile);
755 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
756 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
757 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
758 wetFileProcessor.processWETFile();
759 wetRecordCount += wetFileProcessor.getRecordCount();
760 }
761
762 // for information purposes
763 this.setWETFileCount(wetFileCount);
764 this.setRecordCount(wetRecordCount);
765 }
766
767
768 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
769 public static void printUsage() {
770 System.err.println("Run this program as:");
771 System.err.println("\tCCWETProcessor <folder containing warc.wet(.gz) files> <output folder path>");
772 }
773
774 /** Filename filter to only list warc.wet files or else warc.wet.gz files
775 * for which unzipped warc.wet equivalents don't yet exist.
776 */
777 private static class WETFilenameFilter implements FilenameFilter {
778
779 public boolean accept(File dir, String name) {
780 if(name.endsWith(".warc.wet")) {
781 logger.debug("Will include " + name + " for processing.");
782 return true;
783 }
784
785 if(name.endsWith(".warc.wet.gz")) {
786 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
787 File unzippedVersion = new File(dir, nameWithoutGZext);
788 if(unzippedVersion.exists()) {
789 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
790 logger.debug("Skipping " + name);
791 return false; // don't count gzipped version if unzipped version exists.
792 }
793 else {
794 logger.debug("Only zipped version " + name + " exists.");
795 return true; // No unzipped version, so have to work with gzipped version
796 }
797 }
798
799 // we're not even interested in any other file extensions
800 logger.debug("Not a WET file. Skipping " + name);
801 return false;
802 }
803 }
804
805
806 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
807
808 public boolean accept(File dir, String name) {
809 File f = new File (dir, name);
810 if(f.isDirectory()) {
811 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
812 return true;
813 }
814 }
815 else {
816 System.err.println("File " + f + " is not a directory");
817 }
818 return false;
819 }
820 }
821
822 public static void main(String[] args) {
823 if(args.length != 2) {
824 printUsage();
825 return;
826 }
827
828 File commoncrawlDir = new File(args[0]);
829 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
830 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
831 return;
832 }
833
834 File outFolder = new File(args[1]);
835 if(!outFolder.exists() || !outFolder.isDirectory()) {
836 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
837 return;
838 }
839
840 try {
841 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
842
843 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
844
845 for(int i = 0; i < ccrawlFolders.length; i++) {
846 File ccrawlFolder = ccrawlFolders[i];
847 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
848 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
849 }
850
851
852 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
853 // Of these, only the domains file exists solely at the global level; seedURLs and regex-urlfilters are
854 // also repeated on a per site/domain basis, stored in the sites folder
855 File seedURLsFile = new File(outFolder, "seedURLs.txt");
856 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
857 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
858 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
859
860 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
861
862 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
863
864 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");
865
866
867 } catch(Exception e) {
868 // can get an exception when instantiating CCWETProcessor instance
869 e.printStackTrace();
870 System.err.println(e.getMessage());
871 }
872
873 return;
874
875 }
876}