source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved the getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's being reused. 3. The MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 38.1 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 * keep, discard or greyListed folder, and its url is written into a keep, discard
23 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted, else blacklisted
26 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just the word count, with a MAXIMUM number of characters
29 * (not a minimum) determining whether a string counts as a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes; instead their
35 * format is as follows (see the example below):
36 * - precede a URL with ^ to blacklist urls that match the given prefix
37 * - follow a URL with $ to blacklist urls that match the given suffix
38 * - ^url$ will blacklist urls that match the given url completely
39 * - without either the ^ or $ symbol, urls containing the given url will get blacklisted
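 *
 * For example, a url-blacklist-filter.txt might contain hypothetical entries such as:
 *     ^https://example.com/ads            (blacklists urls starting with this prefix)
 *     .apk$                               (blacklists urls ending with this suffix)
 *     ^https://example.com/spam.html$     (blacklists exactly this url)
 *     casino                              (blacklists any url containing this string)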
40 *
41 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * which is also used by nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. It then uses a separate instance of the WETProcessor class to
49 * process each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, pass in the log4j and other properties files from the conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
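// A minimal sketch of how this class is driven programmatically (paths are hypothetical and the
// constructor throws Exception, so real code wraps these calls in try/catch, as main() below does):
//
//   CCWETProcessor processor = new CCWETProcessor(new File("../ccrawl-data"), new File("../to_crawl"));
//   processor.processAllWETFilesOfCrawl(new File("../ccrawl-data/CC-MAIN-2019-26-wet-files"));
//   processor.createSeedURLsFiles(...); // see main() below for the full set of output File arguments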
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
101 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
102 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
103 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
104
105 /**
106 * Store url patterns as keys, with values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map stores each top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 logger.error("Exception attempting to read properties from config.properties.", e);
153 }
154
155 if(configProperties.size() == 0) {
156 logger.warn("*** Warning: no values read into config properties. Using defaults.");
157 }
158
159 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
160 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
161 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
162
163
164 this.discardFolder = new File(outFolder, "discard");
165 if(!discardFolder.exists()) {
166 discardFolder.mkdir();
167 }
168 this.keepFolder = new File(outFolder, "keep");
169 if(!keepFolder.exists()) {
170 keepFolder.mkdir();
171 }
172
173 this.greyListedFolder = new File(outFolder, "greylisted");
174 if(!greyListedFolder.exists()) {
175 greyListedFolder.mkdir();
176 }
177
178 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
179 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
180 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
181 }
182 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
183 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
184 throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
185 }
186 this.greyListedFile = new File(outFolder, "greyListed.txt");
187 if(greyListedFile.exists() && !greyListedFile.delete()) {
188 throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
189 }
190
191 // prepare our blacklist, greylist (for inspection) and whitelist
192 logger.info("Loading blacklist.");
193 blackList = new HashMap<String, Integer>();
194 initURLFilterList(blackList, "url-blacklist-filter.txt");
195
196 logger.info("Loading greylist.");
197 greyList = new HashMap<String, Integer>();
198 initURLFilterList(greyList, "url-greylist-filter.txt");
199
200 logger.info("Loading whitelist.");
201 whiteList = new HashMap<String, Integer>();
202 initURLFilterList(whiteList, "url-whitelist-filter.txt");
203
204 // Create the map of topSites
205 logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
206 topSitesMap = new HashMap<String, String>();
207
208 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
209 // and put these in our topSitesMap
210 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
211 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
212 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
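	// A record in that file pairs a topsite domain with an optional value. Hypothetical examples:
	// an entry for blogspot.com with the value SUBDOMAIN-COPY, or an entry for a very large site
	// with an allowed url pattern (or SINGLEPAGE / FOLLOW-LINKS-WITHIN-TOPSITE) as its value.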
213 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
214 .withCommentMarker('#')
215 .withSkipHeaderRecord()
216 .withIgnoreSurroundingSpaces();
217
218 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
219 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
220 try (
221 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
222 ) {
223 for (CSVRecord csvRecord : parser) {
224 String topsite = csvRecord.get(0);
225 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
226 topSitesMap.put(topsite, allowed_url_pattern);
227
228 //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
229
230 }
231 } catch(Exception e) {
232 logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
233 }
234
235 //logger.debug("Prematurely terminating for testing purposes.");
236 //System.exit(-1);
237 }
238
239
240 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
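	// e.g. a hypothetical input of "nutch.apache.org" would come back as "nutch\.apache\.org"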
241 private String escapeStringForRegex(String str) {
242 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
243 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
244 str = str.replace(Character.toString(c), "\\"+c);
245 }
246 return str;
247 }
248
249 /**
250 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
251 * as output the URL seed list and regex-urlfilter text files required by nutch, see
252 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
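 * For instance, a kept domain would typically yield a regex-urlfilter line of the form
 * +^https?://([a-z0-9-]+\.)*example\.org/ (example.org being a hypothetical domain).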
253 */
254 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
255 File domainURLsFile, File topSiteMatchesFile,
256 File possibleProductSitesFile) {
257 // Maintain a Map of unique domains mapped to seed urls at that domain
258 // TreeSet: by default, "the elements are ordered using their natural ordering"
259 // (or by a Comparator provided at set creation time).
260 // Whereas HashSet doesn't guarantee ordering.
261 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
262 // Would be a similar distinction for Maps.
263 domainsToURLsMap = new TreeMap<String, Set<String>>();
264
265 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
266 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
267
268 // keep an eye out on URLs we need to inspect later
269 Set<String> possibleProductDomains = new TreeSet<String>();
270 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
271
272 try (
273 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
274 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
275 ) {
276
277 // read a URL at a time from urlsFile
278 String url = null;
279 String domainWithProtocol = null;
280 while((url = reader.readLine()) != null) { // readLine removes newline separator
281
282 // work out domain. This retains any www. or subdomain prefix
283 // passing true to also retain the http(s) protocol
284 domainWithProtocol = Utility.getDomainForURL(url, true);
285
286 Set<String> urlsSet;
287 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
288 urlsSet = new TreeSet<String>();
289 urlsSet.add(url);
290 domainsToURLsMap.put(domainWithProtocol, urlsSet);
291 } else {
292 urlsSet = domainsToURLsMap.get(domainWithProtocol);
293 urlsSet.add(url);
294 }
295
296 /*
297 // Dr Nichols said that a url that was located outside the country and
298 // which had /mi/ URLs was more likely to be an autotranslated (product) site.
299 // Following Dr Nichols' idea, let's keep a look out for more product sites:
300 // if any URL contains /mi AND the tld of its domain is outside of New Zealand
301 // then add that domain (if not already added) and that url into a file
302 // for later manual inspection
303 if(!domainWithProtocol.endsWith(".nz")
304 && (url.contains("/mi/") || url.endsWith("/mi"))) {
305
306 if(!possibleProductDomains.contains(domainWithProtocol)) {
307
308 String countryCode = "";
309 try {
310 // more expensive test, so do this only if above conditions are true:
311 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
312 System.err.println("@@@@ Got country code: " + countryCode);
313 } catch(Exception exceptObj) {
314 countryCode = ""; // forces domain to be included for inspection
315
316 logger.error("Could not check if domain " + domainWithProtocol
317 + " was in country: " + countryCode,
318 exceptObj);
319 }
320
321 boolean isInNZ = countryCode.toLowerCase().equals("nz");
322
323
324 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
325 if(!isInNZ) {
326 possibleProductDomains.add(domainWithProtocol);
327 // write both domain and a sample seedURL on that site out to file
328 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
329 possibleProductSitesWriter.write("\t" + url + "\n");
330 }
331 }
332 //else {
333 // already wrote out domain to file at some point, write just the URL out to file
334 //possibleProductSitesWriter.write("\t" + url + "\n");
335 //}
336 }
337 */
338 }
339 } catch (IOException ioe) {
340 logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
341 }
342
343 // We'd have pruned out duplicates by now and have a sorted list of domains,
344 // each of which maps to seed URLs in the commoncrawl for that domain
345
346 int domainCount = 0;
347 File sitesFolder = new File(outputFolder, "sites");
348 if(!sitesFolder.exists()) {
349 sitesFolder.mkdir();
350 }
351 final String FORMATSTR = "%05d";
352
353 // write out each domain followed in sequence by all urls we found in that domain
354 // (urls with tab up front)
355 try (
356 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
357 // Also a global file listing any urls that matched top sites that didn't specify
358 // allowed regex patterns
359 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
360 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
361 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
362 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
363 ) {
364
365 // initialise topSiteMatchesFile with some instructional text.
366 topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
367 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
368 topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
369
370 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
371 Set<String> domainsSet = domainsToURLsMap.keySet();
372 Iterator<String> domainIterator = domainsSet.iterator();
373
374 /*
375 // DEBUG
376 String value = topSitesMap.get("wikipedia.org");
377 if(value == null) {
378 logger.debug("### wikipedia.org had null value");
379 } else {
380 logger.debug("### wikipedia.org had value: " + value);
381 } // DEBUG
382 */
383
384 while(domainIterator.hasNext()) {
385 String domainWithProtocol = domainIterator.next();
386 // Also get domain without protocol prefix
387 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
388 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
389 String domain = domainWithProtocol.substring(startIndex);
390
391 /*if(domain.contains("docs.google.com")) {
392 logger.debug("domain with protocol: " + domainWithProtocol);
393 logger.debug("domain: " + domain);
394 }*/
395
396 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
397 // If the domain is of a topsite for which no allowed URL pattern has been provided
398 // in sites-too-big-to-exhaustively-crawl.txt,
399 // then we don't know how to crawl the site. Warn the user by writing the affected
400 // domain and seedURLs to the topSiteMatchesFile.
401 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
402
403 // topsite, but we don't (yet) know what portion can be crawled
404 // Append the top site and url to a global/toplevel file that
405 // the user needs to check later and we're done with this domain as it
406 // won't go into any other file hereafter
407
408 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
409 for(String url : urlsForDomainSet) {
410 topSiteMatchesWriter.write("\t" + url + "\n");
411 }
412
413 continue; // done with this domain
414 }
415
416 // start counting the domains we're actually going to process
417 domainCount++;
418
419 String siteID = String.format(FORMATSTR, domainCount);
420 File domainFolder = new File(sitesFolder, siteID);
421 domainFolder.mkdir();
422
423 // write out the domain
424 //seedURLsWriter.write(domainWithProtocol + "\n");
425
426
427 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
428 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
429 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
430 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
431 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
432 try (
433 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
434 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
435 ) {
436
437 // write all sorted unique domains into global domains file
438 // Using the domain without protocol since the global domains file is for
439 // informational purposes
440 domainURLsWriter.write(domain + "\n");
441
442 // Only write urls and no domain into single global seedurls file
443 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
444 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
445 // If we ever run nutch on a single seedURLs listing containing
446 // all seed pages to crawl sites from, the above two files will work for that.
447
448 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
449 // also write into the global seeds file (with a tab prefixed to each?)
450 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
451 for(String url : urlsForDomainSet) {
452 seedURLsWriter.write(url + "\n"); // global seedURLs file
453 siteURLsWriter.write(url + "\n");
454 }
455
456 if(allowedURLPatternRegex == null) { // entire site can be crawled
457 siteURLsWriter.write(domainWithProtocol + "\n");
458
459 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
460 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
461 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
462 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
463 urlFilterWriter.write(regexed_domain + "\n"); //global file
464 siteRegexWriter.write(regexed_domain + "\n"); // site file
465 }
466 else { // domain belongs to a top site where only portion of site can be crawled
467
468 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
469 siteURLsWriter.write(domainWithProtocol + "\n");
470 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
471 // and not for all of blogspot.com
472
473 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
474 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
475 urlFilterWriter.write(regexed_domain + "\n");
476 siteRegexWriter.write(regexed_domain + "\n");
477
478 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
479 // don't write out domain. We want individual pages
480 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
481
482 // don't write out domain as a regex expression url filter either,
483 // write out the individual seed urls for the domain instead
484 // since we will only be downloading the single page
485
486 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
487 for(String urlInDomain : urlsForDomainSet) {
488 // don't append slash to end this time
489 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
490 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
491 urlFilterWriter.write(regexed_url + "\n");
492 siteRegexWriter.write(regexed_url + "\n");
493 }
494 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
495
496 // DON'T write out domain into siteURLs file,
497 // BUT DO write it into urlFilter file
498 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
499
500 urlFilterWriter.write(regexed_domain + "\n");
501 siteRegexWriter.write(regexed_domain + "\n");
502 } else { // allowedURLPatternRegex is a url-form - convert to regex
503 if(!allowedURLPatternRegex.endsWith("/")) {
504 allowedURLPatternRegex += "/";
505 }
506 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
507 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
508 siteURLsWriter.write(domainWithProtocol + "\n");
509 urlFilterWriter.write(regexed_pattern + "\n");
510 siteRegexWriter.write(regexed_pattern + "\n");
511
512 }
513 }
514
515 } catch (IOException ioe) {
516 logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
517 }
518
519 }
520
521 } catch (IOException ioe) {
522 logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
523 + "\n\t" + urlFilterFile
524 + "\n\t" + domainURLsFile
525 + "\n\t" + topSiteMatchesFile, ioe);
526 }
527
528 /*
529 // BEGIN DEBUG
530 logger.debug("@@@@ TopSitesMap contains: ");
531 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
532 String topSite = entry.getKey();
533 String urlPattern = entry.getValue();
534 logger.debug(topSite + " - " + urlPattern);
535 } // END DEBUG
536 */
537 }
538
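	/** Strips one leading subdomain component, e.g. (hypothetically) "mi.wikipedia.org" becomes "wikipedia.org". */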
539 private String stripSubDomain(String url) {
540 int index = url.indexOf(".");
541 if(index != -1) {
542 url = url.substring(index+1);
543 }
544 return url;
545 }
546
547
548 /**
549 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
550 * with or without a www. prefix. This method tests for such a case, as it would be dangerous
551 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
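 * e.g. (hypothetical) isExactDomainMatch("www.blogspot.com", "blogspot.com") returns true.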
552 */
553 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
554 // check for an exact match as-is
555 if(seedURLDomain.equals(domain)) {
556 return true;
557 }
558
559 // else check if with or without a www. prefix we have an exact match with domain
560 if(seedURLDomain.startsWith("www.")) {
561 if(seedURLDomain.substring(4).equals(domain)) {
562 return true;
563 }
564 } else {
565 if(domain.equals("www."+seedURLDomain)) {
566 return true;
567 }
568 }
569
570 return false;
571 }
572
573
574 /**
575 * Check if the domain of the seedurl, either in its entirety or when stripped of
576 * www/subdomains, is in the list of top sites.
577 * If it is, and the given url matches the regex for that topsite, then add the url to the
578 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
579 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
580 * @return one of the following values:
581 * - This function returns null if the seedURL's domain does not match any of the topsites.
582 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
583 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
584 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
585 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
586 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
587 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
588 * URL pattern.)
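 * For example (hypothetical): if topSitesMap has an entry for wikipedia.org with no value, then a
 * fullSeedDomain of mi.wikipedia.org is stripped down to wikipedia.org, matches that entry, and
 * the empty String is returned.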
589 */
590 private String isURLinTopSitesMap(String fullSeedDomain) {
591 boolean keepLooping = true;
592
593 String domain = fullSeedDomain;
594
595 // the domain parameter will have retained www or subdomains, but is stripped of the protocol
596
597 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
598 // if it does, return the value for that topsite domain in the topSitesMap
599 // If no match at all, return null.
600 do {
601
602 String allowed_url_pattern = topSitesMap.get(domain);
603 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
604 // there's an entry for the URL in the topSitesMap
605 logger.debug("##### A top site matches URL domain " + domain);
606
607 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
608 // www prefix, should not exactly match the topSitesMap domain
609 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
610 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
611
612 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
613 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
614 }
615 return allowed_url_pattern;
616 }
617 // else, no entry for the URL in the topSitesMap
618 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
619
620 String newDomain = stripSubDomain(domain);
621 if(domain.equals(newDomain)) {
622 keepLooping = false;
623 } else {
624 domain = newDomain;
625 }
626 } while(keepLooping);
627
628 // url in entirety or stripped of subdomains did not match any of the topsites
629 return null;
630 }
631
632 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
633 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
634 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
635 //while(i.hasNext()) {
636 // Map.Entry<String, Integer> entry = i.next();
637 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
638 String urlPattern = entry.getKey();
639 Integer matchRule = entry.getValue();
640
641 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
642 return true;
643 }
644 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
645 return true;
646 }
647 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
648 return true;
649 }
650 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
651 return true;
652 }
653 // else check the rest of the filter list against this url
654 // before returning false to be certain it's not been listed in the filter list
655 }
656
657 return false;
658 }
659
660 /**
661 * Returns true if the url or pattern is found in the blacklist file.
662 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
663 * it won't get blacklisted after all. But that's not implemented here.
664 */
665 public boolean isBlacklisted(String url) {
666 boolean isBlackListed = isListedInFilterList(blackList, url);
667
668 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
669 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
670 String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol
671 if(!isBlackListed && url.contains("jasmin")) {
672 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
673 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
674 }
675 return isBlackListed;
676 }
677
678 /**
679 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
680 * Will eventually take precedence over a mention of the same URL pattern in the blacklist.
681 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
682 */
683 public boolean isGreylisted(String url) {
684 // auto-translated product sites
685 return isListedInFilterList(greyList, url);
686 }
687
688 /**
689 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
690 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
691 */
692 public boolean isWhitelisted(String url) {
693 return isListedInFilterList(whiteList, url);
694 }
695
696 /**
697 * Reads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the
698 * given map, recording whether a url should match the filter exactly, as a prefix, as a suffix or as a substring.
699 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
700 * By not having this method deal with actual regex for filters, this has the advantage that
701 * we don't have to remember to escape or double escape each filter to turn it into a regex.
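 * e.g. a hypothetical filter line "^https://example.org" would be stored with LIST_ENTRY_STARTSWITH,
 * while a bare line "example" would be stored with LIST_ENTRY_CONTAINS.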
702 */
703 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
704
705 // if filterListFilename does not exist in the conf folder, just return
706 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
707 logger.warn("Filter list filename: " + filterListFilename + " does not exist");
708 return;
709 }
710
711 try (
712 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
713 ) {
714 String filter = null;
715 while((filter = reader.readLine()) != null) {
716 // skip comments and empty lines
717 filter = filter.trim();
718 if(filter.equals("") || filter.startsWith("#")) {
719 continue;
720 }
721
722 if(filter.startsWith("^") && filter.endsWith("$")) {
723 filter = filter.substring(1, filter.length()-1);
724 list.put(filter, LIST_ENTRY_MATCHES);
725 }
726 else if(filter.startsWith("^")) {
727 filter = filter.substring(1);
728 list.put(filter, LIST_ENTRY_STARTSWITH);
729 //logger.debug("Match filter startswith: " + filter);
730 }
731 else if(filter.endsWith("$")) {
732 filter = filter.substring(0, filter.length()-1);
733 list.put(filter, LIST_ENTRY_ENDSWITH);
734 //logger.debug("@@@ Match filter endswith: " + filter);
735 }
736 else {
737 list.put(filter, LIST_ENTRY_CONTAINS);
738 }
739 //logger.debug("Got filter: " + filter);
740 }
741
742 } catch (IOException ioe) {
743 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
744 }
745
746 }
747
748 /** Maintain a count of all WET files processed. */
749 public void setWETFileCount(int count) { this.wetFileCount = count; }
750
751 /** Maintain a count of all WET records processed. */
752 //public int getRecordCount() { return this.totalRecordCount; }
753 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
754 public void setRecordCount(int count) { this.totalRecordCount = count; }
755
756 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
757
758 // Will list all the warc.wet files in the input directory or else their gzipped versions
759 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
760
761 int wetRecordCount = 0;
762 int wetFileCount = 0;
763
764 for(int i = 0; i < WETFiles.length; i++) {
765 File WETFile = WETFiles[i];
766 logger.debug("Processing WETfile: " + WETFile);
767
768 // Any .gz file listed means it hasn't been unzipped yet. So unzip.
769 String WETFilename = WETFile.toString();
770 if(WETFilename.endsWith(".gz")) {
771 File GZippedWETFile = WETFile;
772 String WETGZippedFilename = WETFilename;
773 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
774
775 WETFile = new File(WETFilename);
776 Utility.unzipFile(GZippedWETFile, WETFile);
777 }
778 // hereafter all WETFiles should refer to the unzipped version
779 // Check the unzipped WETFile exists
780
781 if(!WETFile.exists() || !WETFile.isFile()) {
782 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
783 return;
784 }
785
786 // Finally, we can process this WETFile's records into the keep and discard pile
787 wetFileCount++;
788 logger.debug("Off to process " + WETFile);
789 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
790 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
791 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
792 wetFileProcessor.processWETFile();
793 wetRecordCount += wetFileProcessor.getRecordCount();
794 }
795
796 // for information purposes
797 this.setWETFileCount(wetFileCount);
798 this.setRecordCount(wetRecordCount);
799 }
800
801
802 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
803 public static void printUsage() {
804 System.err.println("Run this program as:");
805 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");
806 }
807
808 /** Filename filter to only list warc.wet files or else warc.wet.gz files
809 * for which unzipped warc.wet equivalents don't yet exist.
810 */
811 private static class WETFilenameFilter implements FilenameFilter {
812
813 public boolean accept(File dir, String name) {
814 if(name.endsWith(".warc.wet")) {
815 logger.debug("Will include " + name + " for processing.");
816 return true;
817 }
818
819 if(name.endsWith(".warc.wet.gz")) {
820 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
821 File unzippedVersion = new File(dir, nameWithoutGZext);
822 if(unzippedVersion.exists()) {
823 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
824 logger.debug("Skipping " + name);
825 return false; // don't count gzipped version if unzipped version exists.
826 }
827 else {
828 logger.debug("Only zipped version " + name + " exists.");
829 return true; // No unzipped version, so have to work with gzipped version
830 }
831 }
832
833 // we're not even interested in any other file extensions
834 logger.debug("Not a WET file. Skipping " + name);
835 return false;
836 }
837 }
838
839
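	/** Filename filter that accepts only subdirectories whose names match the CC-MAIN-YYYY-##-wet-files naming pattern. */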
840 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
841
842 public boolean accept(File dir, String name) {
843 File f = new File (dir, name);
844 if(f.isDirectory()) {
845 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
846 return true;
847 }
848 }
849 else {
850 logger.info("File " + f + " is not a directory");
851 }
852 return false;
853 }
854 }
855
856 public static void main(String[] args) {
857 if(args.length != 2) {
858 printUsage();
859 return;
860 }
861
862 File commoncrawlDir = new File(args[0]);
863 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
864 logger.error("Error: " + args[0] + " does not exist or is not a directory");
865 return;
866 }
867
868 File outFolder = new File(args[1]);
869 if(!outFolder.exists() || !outFolder.isDirectory()) {
870 logger.error("Error: " + args[1] + " does not exist or is not a directory.");
871 return;
872 }
873
874 try {
875 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
876
877 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
878
879 for(int i = 0; i < ccrawlFolders.length; i++) {
880 File ccrawlFolder = ccrawlFolders[i];
881 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
882 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
883 }
884
885 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
886 // The domains file is the only purely global one: seedURLs and regex-urlfilters are
887 // repeated on a per site/domain basis too, stored in the sites folder
888 File seedURLsFile = new File(outFolder, "seedURLs.txt");
889 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
890 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
891 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
892 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
893
894 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
895
896 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
897
898 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
899
900
901 } catch(Exception e) {
902 // can get an exception when instantiating CCWETProcessor instance
903 logger.error(e.getMessage(), e);
904 }
905
906 return;
907
908 }
909}