source: other-projects/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33808

Last change on this file since 33808 was 33666, checked in by ak19, 4 years ago

Having finished sending all the crawl data to MongoDB:

1. Recrawled the 2 sites which I had earlier noted required recrawling, 00152 and 00332. 00152 required changes to how it needed to be crawled: MP3 files needed to be blocked, as there were HBase error messages about key values being too large.
2. Modified the regex-urlfilter.GS_TEMPLATE file to block mp3 files in general for future crawls too (in the location of the file where jpg etc. were already blocked by nutch's default regex url filters).
3. Further had to restrict the 00152 site to only be crawled under its /maori/ section. Since the seedURL maori.html was not off a /maori/ url, this revealed that the CCWETProcessor code didn't yet allow filters to okay seedURLs where the crawl was controlled to run over a subdomain (as expressed in the conf/sites-too-big-to-exhaustively-crawl file) but the seedURL didn't match that controlled regex filter. So now, in such cases, CCWETProcessor adds the non-matching seedURLs to the filters too (so we get just the single file of each such seedURL page), besides a filter on the requested subdomain, so we follow all pages linked by the seedURLs that do match the subdomain expression.
4. Added to_crawl.tar.gz to svn: the tarball of the to_crawl sites that I actually ran nutch over, i.e. all the sites folders with their seedURLs.txt and regex-urlfilter.txt files that batchcrawl.sh runs over. This didn't use the latest version of the sites folder and blacklist/whitelist files generated by CCWETProcessor, since the latest version was regenerated after the final modifications to CCWETProcessor, which came after crawling was finished. But to_crawl.tar.gz does have a manually modified 00152, with the correct regex-urlfilter file, and uses the newer regex-urlfilter.GS_TEMPLATE file that blocks mp3 files.
5. crawledNode6.tar.gz now contains the dump output for sites 00152 and 00332, which were crawled on node6 today (after which their processed dump.txt file results were added into MongoDB).
7. MoreReading/mongodb.txt now contains the results of some queries I ran against the total nutch-crawled data.
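For orientation, regex-urlfilter entries of the kind described above take roughly the following form. This is an illustrative sketch only: the domain is a placeholder and the extension list is abbreviated, so these are not the actual contents of regex-urlfilter.GS_TEMPLATE or of site 00152's regex-urlfilter.txt.

   # skip audio files alongside the image/archive extensions nutch already blocks by default
   -\.(gif|jpg|png|css|zip|gz|mp3)$
   # restrict the crawl to the /maori/ area of the site
   +^https?://([a-z0-9-]+\.)*example\.com/maori/
   # but still allow the seedURL page itself, which sits outside /maori/
   +^https://www\.example\.com/maori\.html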

File size: 40.3 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 * keep or discard or greyListed folder, and its url written into a keep, discard
23 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted else blacklisted
26 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just the word count, where a MAXIMUM number of
29 * characters (not a MINIMUM) determines whether a string counts as a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - precede URL by ^ to blacklist urls that match the given prefix
37 * - succeed URL by $ to blacklist urls that match the given suffix
38 * - ^url$ will blacklist urls that match the given url completely
39 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
40 *
41 * In WETProcessor.java's current implementation, explicit whitelisting takes precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. It then uses a separate instance of the WETProcessor class to process
49 * each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, pass in the log4j and other properties files in the conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
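// Illustrative sketch only (these lines are hypothetical, not the repository's actual conf/ files):
// a conf/url-blacklist-filter.txt could contain entries such as
//   ^http://ads.example.com          - blacklist urls starting with this prefix
//   .pdf$                            - blacklist urls ending with this suffix
//   ^http://example.com/page.html$   - blacklist exactly this url
//   casino                           - blacklist any url containing this substring
// The same ^ and $ conventions apply to url-greylist-filter.txt and url-whitelist-filter.txt.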
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
101 private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
102 private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
103 private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
104
105 /**
106 * Store url patterns as keys, with values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map stores the top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 logger.error("Exception attempting to read properties from config.properties.", e);
153 }
154
155 if(configProperties.size() == 0) {
156 logger.warn("*** Warning: no values read into config properties. Using defaults.");
157 }
158
159 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
160 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
161 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
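// For reference, a conf/config.properties along these lines would override the defaults above
// (the values shown are simply the built-in defaults, not the repository's actual conf file):
//   WETprocessor.max.word.length=15
//   WETprocessor.min.num.words=20
//   WETprocessor.max.words.camelcase=10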
162
163
164 this.discardFolder = new File(outFolder, "discard");
165 if(!discardFolder.exists()) {
166 discardFolder.mkdir();
167 }
168 this.keepFolder = new File(outFolder, "keep");
169 if(!keepFolder.exists()) {
170 keepFolder.mkdir();
171 }
172
173 this.greyListedFolder = new File(outFolder, "greylisted");
174 if(!greyListedFolder.exists()) {
175 greyListedFolder.mkdir();
176 }
177
178 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
179 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
180 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
181 }
182 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
183 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
184 throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
185 }
186 this.greyListedFile = new File(outFolder, "greyListed.txt");
187 if(greyListedFile.exists() && !greyListedFile.delete()) {
188 throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
189 }
190
191 // prepare our blacklist, greylist (for inspection) and whitelist
192 logger.info("Loading blacklist.");
193 blackList = new HashMap<String, Integer>();
194 initURLFilterList(blackList, "url-blacklist-filter.txt");
195
196 logger.info("Loading greylist.");
197 greyList = new HashMap<String, Integer>();
198 initURLFilterList(greyList, "url-greylist-filter.txt");
199
200 logger.info("Loading whitelist.");
201 whiteList = new HashMap<String, Integer>();
202 initURLFilterList(whiteList, "url-whitelist-filter.txt");
203
204 // Create the map of topSites
205 logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
206 topSitesMap = new HashMap<String, String>();
207
208 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
209 // and put these in our topSitesMap
210 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
211 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
212 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
213 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
214 .withCommentMarker('#')
215 .withSkipHeaderRecord()
216 .withIgnoreSurroundingSpaces();
217
218 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
219 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
220 try (
221 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
222 ) {
223 for (CSVRecord csvRecord : parser) {
224 String topsite = csvRecord.get(0);
225 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
226 topSitesMap.put(topsite, allowed_url_pattern);
227
228 //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
229
230 }
231 } catch(Exception e) {
232 logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
233 }
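// Purely illustrative (not the actual conf file contents): assuming the comma delimiter of
// CSVFormat.DEFAULT, records in sites-too-big-to-exhaustively-crawl.txt would look like a
// topsite domain followed by an optional allowed-url-pattern or keyword, e.g.
//   blogspot.com,SUBDOMAIN-COPY
//   wikipedia.org,mi.wikipedia.org
//   some-huge-site.com,SINGLEPAGE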
234
235 //logger.debug("Prematurely terminating for testing purposes.");
236 //System.exit(-1);
237 }
238
239
240 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
241 private String escapeStringForRegex(String str) {
242 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
243 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
244 str = str.replace(Character.toString(c), "\\"+c);
245 }
246 return str;
247 }
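// Example (illustrative): escapeStringForRegex("nutch.apache.org") returns the string
// nutch\.apache\.org, which prepareSitesForNutchCrawling() below combines with
// FILTER_REGEX_PREFIX to form filters like +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/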
248
249 /**
250 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
251 * as output the URL seed list and regex-urlfilter text files required by nutch, see
252 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
253 *
254 * This method creates seedURLs files and url-regexfilter files needed by nutch, instructing
255 * it what portion of each site to crawl.
256 *
257 * The topSiteMatches file also gets created, listing sites excluded from crawling as
258 * they're too large to exhaustively crawl. The user will be told to inspect this file
259 * after this program has finished running.
260 *
261 * If checkForPossibleProductSites is true, then any urls containing /mi(/) that are outside of NZ
262 * or whose geolocation isn't known will end up in the file denoted by possibleProductSitesFile.
263 *
264 */
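// Output sketch (file names as used in this class and in main(); the 00001 numbering comes from
// FORMATSTR below): the outputFolder ends up holding the global seedURLs.txt, regex-urlfilter.txt,
// all-domain-urls.txt and unprocessed-topsite-matches.txt files, plus per-site copies such as
// sites/00001/seedURLs.txt and sites/00001/regex-urlfilter.txt.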
265 public void prepareSitesForNutchCrawling(File seedURLsFile, File urlFilterFile,
266 File domainURLsFile, File topSiteMatchesFile,
267 boolean checkForPossibleProductSites, File possibleProductSitesFile) {
268 // Maintain a Map of unique domains mapped to seed urls at that domain
269 // TreeSet: by default, "the elements are ordered using their natural ordering"
270 // (or by a Comparator provided at set creation time).
271 // Whereas HashSet doesn't guarantee ordering.
272 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
273 // Would be a similar distinction for Maps.
274 domainsToURLsMap = new TreeMap<String, Set<String>>();
275
276 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
277 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
278
279 // keep an eye out for URLs we need to inspect later
280 Set<String> possibleProductDomains = new TreeSet<String>();
281 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
282
283 try (
284 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
285 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
286 ) {
287
288 // read a URL at a time from urlsFile
289 String url = null;
290 String domainWithProtocol = null;
291 while((url = reader.readLine()) != null) { // readLine removes newline separator
292
293 // Work out the domain. This retains any www. or subdomain prefix;
294 // passing true additionally retains the http(s) protocol.
295 domainWithProtocol = Utility.getDomainForURL(url, true);
296
297 Set<String> urlsSet;
298 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
299 urlsSet = new TreeSet<String>();
300 urlsSet.add(url);
301 domainsToURLsMap.put(domainWithProtocol, urlsSet);
302 } else {
303 urlsSet = domainsToURLsMap.get(domainWithProtocol);
304 urlsSet.add(url);
305 }
306
307 if(checkForPossibleProductSites) {
308 // Dr Nichols said that a url that was located outside the country and
309 // which had /mi/ URLs was more likely to be an autotranslated (product) site.
310 // Following Dr Nichols' idea, let's keep a look out for more product sites:
311 // if any URL contains /mi AND the tld of its domain is outside of New Zealand
312 // then add that domain (if not already added) and that url into a file
313 // for later manual inspection
314 if(!domainWithProtocol.endsWith(".nz")
315 && (url.contains("/mi/") || url.endsWith("/mi"))) {
316
317 if(!possibleProductDomains.contains(domainWithProtocol)) {
318
319 String countryCode = "";
320 try {
321 // more expensive test, so do this only if above conditions are true:
322 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
323 System.err.println("@@@@ Got country code: " + countryCode);
324 } catch(Exception exceptObj) {
325 countryCode = ""; // forces domain to be included for inspection
326
327 logger.error("Could not check if domain " + domainWithProtocol
328 + " was in country: " + countryCode,
329 exceptObj);
330 }
331
332 boolean isInNZ = countryCode.toLowerCase().equals("nz");
333
334
335 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
336 if(!isInNZ) {
337 possibleProductDomains.add(domainWithProtocol);
338 // write both domain and a sample seedURL on that site out to file
339 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
340 possibleProductSitesWriter.write("\t" + url + "\n");
341 }
342 }
343 //else {
344 // already wrote out domain to file at some point, write just the URL out to file
345 //possibleProductSitesWriter.write("\t" + url + "\n");
346 //}
347 }
348 }
349 }
350 } catch (IOException ioe) {
351 logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
352 }
353
354 // We'd have pruned out duplicates by now and have a sorted list of domains,
355 // each of which maps to seed URLs in the commoncrawl for that domain
356
357 int domainCount = 0;
358 File sitesFolder = new File(outputFolder, "sites");
359 if(!sitesFolder.exists()) {
360 sitesFolder.mkdir();
361 }
362 final String FORMATSTR = "%05d";
363
364 // write out each domain followed in sequence by all urls we found in that domain
365 // (urls with tab up front)
366 try (
367 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
368 // Also a global file listing any urls that matched top sites that didn't specify
369 // allowed regex patterns
370 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
371 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
372 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
373 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
374 ) {
375
376 // initialise topSiteMatchesFile with some instructional text.
377 topSiteMatchesWriter.write("The following domains with seedURLs are each on a major/top 500 site\n");
378 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
379 topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
380
381 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
382 Set<String> domainsSet = domainsToURLsMap.keySet();
383 Iterator<String> domainIterator = domainsSet.iterator();
384
385 /*
386 // DEBUG
387 String value = topSitesMap.get("wikipedia.org");
388 if(value == null) {
389 logger.debug("### wikipedia.org had null value");
390 } else {
391 logger.debug("### wikipedia.org had value: " + value);
392 } // DEBUG
393 */
394
395 while(domainIterator.hasNext()) {
396 String domainWithProtocol = domainIterator.next();
397 // Also get domain without protocol prefix
398 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
399 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
400 String domain = domainWithProtocol.substring(startIndex);
401
402 /*if(domain.contains("docs.google.com")) {
403 logger.debug("domain with protocol: " + domainWithProtocol);
404 logger.debug("domain: " + domain);
405 }*/
406
407 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
408 // If the domain is of a topsite for which no allowed URL pattern has been provided
409 // in sites-too-big-to-exhaustively-crawl.txt,
410 // then we don't know how to crawl the site. Warn the user by writing the affected
411 // domain and seedURLs to the topSiteMatchesFile.
412 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
413
414 // topsite, but we don't (yet) know what portion can be crawled
415 // Append the top site and url to a global/toplevel file that
416 // the user needs to check later and we're done with this domain as it
417 // won't go into any other file hereafter
418
419 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
420 for(String url : urlsForDomainSet) {
421 topSiteMatchesWriter.write("\t" + url + "\n");
422 }
423
424 continue; // done with this domain
425 }
426
427 // start counting the domains we're actually going to process
428 domainCount++;
429
430 String siteID = String.format(FORMATSTR, domainCount);
431 File domainFolder = new File(sitesFolder, siteID);
432 domainFolder.mkdir();
433
434 // write out the domain
435 //seedURLsWriter.write(domainWithProtocol + "\n");
436
437
438 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
439 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
440 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
441 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
442 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
443 try (
444 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
445 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
446 ) {
447
448 // write all sorted unique domains into global domains file
449 // Using the domain without protocol since the global domains file is for
450 // informational purposes
451 domainURLsWriter.write(domain + "\n");
452
453 // Only write urls and no domain into single global seedurls file
454 // But write domain and tab-spaced urls into individual sites/0000#/seedURLs.txt
455 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
456 // If we ever run nutch on a single seedURLs listing containing
457 // all seed pages to crawl sites from, the above two files will work for that.
458
459 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
460 // also write into the global seeds file (with a tab prefixed to each?)
461 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
462 for(String url : urlsForDomainSet) {
463 seedURLsWriter.write(url + "\n"); // global seedURLs file
464 siteURLsWriter.write(url + "\n");
465 }
466
467 if(allowedURLPatternRegex == null) { // entire site can be crawled
468 siteURLsWriter.write(domainWithProtocol + "\n");
469
470 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
471 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
472 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
473 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
474 urlFilterWriter.write(regexed_domain + "\n"); //global file
475 siteRegexWriter.write(regexed_domain + "\n"); // site file
476 }
477 else { // domain belongs to a top site where only portion of site can be crawled
478
479 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
480 siteURLsWriter.write(domainWithProtocol + "\n");
481 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
482 // and not for all of blogspot.com
483
484 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
485 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
486 urlFilterWriter.write(regexed_domain + "\n");
487 siteRegexWriter.write(regexed_domain + "\n");
488
489 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
490 // don't write out domain. We want individual pages
491 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
492
493 // don't write out domain as a regex expression url filter either,
494 // write out the individual seed urls for the domain instead
495 // since we will only be downloading the single page
496
497 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
498 for(String urlInDomain : urlsForDomainSet) {
499 // don't append slash to end this time
500 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
501 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
502 urlFilterWriter.write(regexed_url + "\n");
503 siteRegexWriter.write(regexed_url + "\n");
504 }
505 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
506
507 // DON'T write out domain into siteURLs file,
508 // BUT DO write it into urlFilter file
509 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
510
511 urlFilterWriter.write(regexed_domain + "\n");
512 siteRegexWriter.write(regexed_domain + "\n");
513 } else { // allowedURLPatternRegex is a url-form - convert to regex
514 if(!allowedURLPatternRegex.endsWith("/")) {
515 allowedURLPatternRegex += "/";
516 }
517 String regexed_pattern = FILTER_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
518 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
519
520 // In case any of the seedURLs themselves are not within the
521 // allowedURLPatternRegex part of the site, FIRST write out such
522 // seedURLs as allowed regex patterns, so they get downloaded
523 // as single pages.
524 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
525 for(String urlInDomain : urlsForDomainSet) {
526
527 String urlWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(urlInDomain);
528 String allowedURLPatternWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(allowedURLPatternRegex);
529 if(!urlWithoutProtocolAndWWW.startsWith(allowedURLPatternWithoutProtocolAndWWW)) {
530 // don't append slash to end this time
531 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
532 urlFilterWriter.write(regexed_url + "\n");
533 siteRegexWriter.write(regexed_url + "\n");
534 }
535 }
536
537 siteURLsWriter.write(domainWithProtocol + "\n");
538 // write out allowedURLPatternRegex instead of the domain
539 //siteURLsWriter.write(allowedURLPatternRegex + "\n");
540
541 // Now restrict any other URLs found to be within the allowedURLPattern
542 // part of the site
543 urlFilterWriter.write(regexed_pattern + "\n");
544 siteRegexWriter.write(regexed_pattern + "\n");
545 }
546 }
547
548 } catch (IOException ioe) {
549 logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
550 }
551
552 }
553
554 } catch (IOException ioe) {
555 logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
556 + "\n\t" + urlFilterFile
557 + "\n\t" + domainURLsFile
558 + "\n\t" + topSiteMatchesFile, ioe);
559 }
560
561 /*
562 // BEGIN DEBUG
563 logger.debug("@@@@ TopSitesMap contains: ");
564 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
565 String topSite = entry.getKey();
566 String urlPattern = entry.getValue();
567 logger.debug(topSite + " - " + urlPattern);
568 } // END DEBUG
569 */
570 }
571
572 private String stripSubDomain(String url) {
573 int index = url.indexOf(".");
574 if(index != -1) {
575 url = url.substring(index+1);
576 }
577 return url;
578 }
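// Example (illustrative): stripSubDomain("mi.wikipedia.org") returns "wikipedia.org", and a
// further call on "wikipedia.org" would return "org", so the caller decides when to stop
// stripping (as isURLinTopSitesMap() below does).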
579
580
581 /**
582 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
583 * with or without www. prefix. This method tests for such a case, as it would be dangerous
584 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
585 */
586 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
587 // check for an exact match as-is
588 if(seedURLDomain.equals(domain)) {
589 return true;
590 }
591
592 // else check if with or without a www. prefix we have an exact match with domain
593 if(seedURLDomain.startsWith("www.")) {
594 if(seedURLDomain.substring(4).equals(domain)) {
595 return true;
596 }
597 } else {
598 if(domain.equals("www."+seedURLDomain)) {
599 return true;
600 }
601 }
602
603 return false;
604 }
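// Example (illustrative): isExactDomainMatch("www.blogspot.com", "blogspot.com") and
// isExactDomainMatch("blogspot.com", "www.blogspot.com") both return true, whereas
// isExactDomainMatch("pinky.blogspot.com", "blogspot.com") returns false.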
605
606
607 /**
608 * Check if the domain of the seedurl, either in its entirety or when stripped of
609 * www/subdomains, is in the list of top sites.
610 * If it is, and the given url matches the regex for that topsite, then add the url to the
611 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
612 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
613 * @return one of the following values:
614 * - This function returns null if the seedURL's domain does not match any of the topsites.
615 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
616 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
617 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
618 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
619 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
620 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
621 * URL pattern.)
622 */
623 private String isURLinTopSitesMap(String fullSeedDomain) {
624 boolean keepLooping = true;
625
626 String domain = fullSeedDomain;
627
628 // The domain parameter will have retained www or subdomains, but is stripped of the protocol
629
630 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
631 // if it does, return the value for that topsite domain in the topSitesMap
632 // If no match at all, return null.
633 do {
634
635 String allowed_url_pattern = topSitesMap.get(domain);
636 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
637 // there's an entry for the URL in the topSitesMap
638 logger.debug("##### A top site matches URL domain " + domain);
639
640 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
641 // www prefix, should not exactly match the topSitesMap domain
642 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
643 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
644
645 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
646 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
647 }
648 return allowed_url_pattern;
649 }
650 // else, no entry for the URL in the topSitesMap
651 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
652
653 String newDomain = stripSubDomain(domain);
654 if(domain.equals(newDomain)) {
655 keepLooping = false;
656 } else {
657 domain = newDomain;
658 }
659 } while(keepLooping);
660
661 // url in entirety or stripped of subdomains did not match any of the topsites
662 return null;
663 }
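// Example (illustrative, assuming the topsites map contains blogspot.com -> SUBDOMAIN-COPY):
// isURLinTopSitesMap("pinky.blogspot.com") finds no entry for the full domain, strips the
// subdomain, matches blogspot.com and returns SUBDOMAIN-COPY; whereas
// isURLinTopSitesMap("www.blogspot.com") matches blogspot.com exactly (bar the www. prefix)
// and so returns "" to stop the whole of blogspot.com from being crawled.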
664
665 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
666 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
667 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
668 //while(i.hasNext()) {
669 // Map.Entry<String, Integer> entry = i.next();
670 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
671 String urlPattern = entry.getKey();
672 Integer matchRule = entry.getValue();
673
674 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
675 return true;
676 }
677 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
678 return true;
679 }
680 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
681 return true;
682 }
683 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
684 return true;
685 }
686 // else check the rest of the filter list against this url
687 // before returning false to be certain it's not been listed in the filter list
688 }
689
690 return false;
691 }
692
693 /**
694 * Returns true if the url or pattern is found in the blacklist file.
695 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
696 * it won't get blacklisted after all. But that's not implemented here.
697 */
698 public boolean isBlacklisted(String url) {
699 boolean isBlackListed = isListedInFilterList(blackList, url);
700
701 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
702 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
703 String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol
704 if(!isBlackListed && url.contains("jasmin")) {
705 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
706 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
707 }
708 return isBlackListed;
709 }
710
711 /**
712 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
713 * Eventually takes precedence if the same URL pattern was also mentioned in the blacklist.
714 * Is eventually pre-empted by the whitelist if the pattern is mentioned there too.
715 */
716 public boolean isGreylisted(String url) {
717 // auto-translated product sites
718 return isListedInFilterList(greyList, url);
719 }
720
721 /**
722 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
723 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
724 */
725 public boolean isWhitelisted(String url) {
726 return isListedInFilterList(whiteList, url);
727 }
728
729 /**
730 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
731 * whether it is in the mentioned black|grey|white list.
732 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
733 * By not having this method deal with actual regex for filters, this has the advantage that
734 * we don't have to remember to escape or double escape each filter to turn it into a regex.
735 */
736 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
737
738 // if filterListFilename does not exist in the conf folder, just return
739 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
740 logger.warn("Filter list filename: " + filterListFilename + " does not exist");
741 return;
742 }
743
744 try (
745 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
746 ) {
747 String filter = null;
748 while((filter = reader.readLine()) != null) {
749 // skip comments and empty lines
750 filter = filter.trim();
751 if(filter.equals("") || filter.startsWith("#")) {
752 continue;
753 }
754
755 if(filter.startsWith("^") && filter.endsWith("$")) {
756 filter = filter.substring(1, filter.length()-1);
757 list.put(filter, LIST_ENTRY_MATCHES);
758 }
759 else if(filter.startsWith("^")) {
760 filter = filter.substring(1);
761 list.put(filter, LIST_ENTRY_STARTSWITH);
762 //logger.debug("Match filter startswith: " + filter);
763 }
764 else if(filter.endsWith("$")) {
765 filter = filter.substring(0, filter.length()-1);
766 list.put(filter, LIST_ENTRY_ENDSWITH);
767 //logger.debug("@@@ Match filter endswith: " + filter);
768 }
769 else {
770 list.put(filter, LIST_ENTRY_CONTAINS);
771 }
772 //logger.debug("Got filter: " + filter);
773 }
774
775 } catch (IOException ioe) {
776 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
777 }
778
779 }
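// Example (illustrative; the example.com lines are placeholders, not real conf entries): filter
// lines ^http://example.com/page.html$ , ^http://ads.example.com , .pdf$ and casino would be
// stored with the values LIST_ENTRY_MATCHES, LIST_ENTRY_STARTSWITH, LIST_ENTRY_ENDSWITH and
// LIST_ENTRY_CONTAINS respectively.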
780
781 /** Maintain a count of all WET files processed. */
782 public void setWETFileCount(int count) { this.wetFileCount = count; }
783
784 /** Maintain a count of all WET records processed. */
785 //public int getRecordCount() { return this.totalRecordCount; }
786 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
787 public void setRecordCount(int count) { this.totalRecordCount = count; }
788
789 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
790
791 // Will list all the warc.wet files in the input directory or else their gzipped versions
792 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
793
794 int wetRecordCount = 0;
795 int wetFileCount = 0;
796
797 for(int i = 0; i < WETFiles.length; i++) {
798 File WETFile = WETFiles[i];
799 logger.debug("Processing WETfile: " + WETFile);
800
801 // Any .gz files listed mean they haven't been unzipped yet, so unzip them.
802 String WETFilename = WETFile.toString();
803 if(WETFilename.endsWith(".gz")) {
804 File GZippedWETFile = WETFile;
805 String WETGZippedFilename = WETFilename;
806 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
807
808 WETFile = new File(WETFilename);
809 Utility.unzipFile(GZippedWETFile, WETFile);
810 }
811 // hereafter all WETFiles should refer to the unzipped version
812 // Check the unzipped WETFile exists
813
814 if(!WETFile.exists() || !WETFile.isFile()) {
815 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
816 return;
817 }
818
819 // Finally, we can process this WETFile's records into the keep and discard pile
820 wetFileCount++;
821 logger.debug("Off to process " + WETFile);
822 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
823 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
824 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
825 wetFileProcessor.processWETFile();
826 wetRecordCount += wetFileProcessor.getRecordCount();
827 }
828
829 // for information purposes
830 this.setWETFileCount(wetFileCount);
831 this.setRecordCount(wetRecordCount);
832 }
833
834
835 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
836 public static void printUsage() {
837 System.err.println("Run this program as:");
838 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> [--check-for-product-sites]");
839 }
840
841 /** Filename filter to only list warc.wet files or else warc.wet.gz files
842 * for which unzipped warc.wet equivalents don't yet exist.
843 */
844 private static class WETFilenameFilter implements FilenameFilter {
845
846 public boolean accept(File dir, String name) {
847 if(name.endsWith(".warc.wet")) {
848 logger.debug("Will include " + name + " for processing.");
849 return true;
850 }
851
852 if(name.endsWith(".warc.wet.gz")) {
853 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
854 File unzippedVersion = new File(dir, nameWithoutGZext);
855 if(unzippedVersion.exists()) {
856 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
857 logger.debug("Skipping " + name);
858 return false; // don't count gzipped version if unzipped version exists.
859 }
860 else {
861 logger.debug("Only zipped version " + name + " exists.");
862 return true; // No unzipped version, so have to work with gzipped version
863 }
864 }
865
866 // we're not even interested in any other file extensions
867 logger.debug("Not a WET file. Skipping " + name);
868 return false;
869 }
870 }
871
872
873 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
874
875 public boolean accept(File dir, String name) {
876 File f = new File (dir, name);
877 if(f.isDirectory()) {
878 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
879 return true;
880 }
881 }
882 else {
883 logger.info("File " + f + " is not a directory");
884 }
885 return false;
886 }
887 }
888
889 public static void main(String[] args) {
890 if(args.length < 2 || args.length > 3) {
891 printUsage();
892 return;
893 }
894
895 boolean checkForPossibleProductSites = false;
896 if(args.length == 3) {
897 if(!args[2].equals("--check-for-product-sites")) {
898 printUsage();
899 return;
900 } else {
901 checkForPossibleProductSites = true;
902 }
903 }
904
905 File commoncrawlDir = new File(args[0]);
906 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
907 logger.error("Error: " + args[0] + " does not exist or is not a directory");
908 return;
909 }
910
911 File outFolder = new File(args[1]);
912 if(!outFolder.exists() || !outFolder.isDirectory()) {
913 logger.error("Error: " + args[1] + " does not exist or is not a directory.");
914 return;
915 }
916
917
918 try {
919 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
920
921 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
922
923 for(int i = 0; i < ccrawlFolders.length; i++) {
924 File ccrawlFolder = ccrawlFolders[i];
925 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
926 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
927 }
928
929 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
930 // The domains file is the only one that exists solely at the global level; seedURLs and regex-urlfilters are
931 // repeated on a per site/domain basis too, stored in the sites folder
932 File seedURLsFile = new File(outFolder, "seedURLs.txt");
933 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
934 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
935 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
936 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
937
938
939 ccWETFilesProcessor.prepareSitesForNutchCrawling(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, checkForPossibleProductSites, possibleProductSitesFile);
940
941 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
942
943 if(checkForPossibleProductSites) {
944 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
945 } else {
946 possibleProductSitesFile.delete();
947 }
948
949
950 } catch(Exception e) {
951 // can get an exception when instantiating CCWETProcessor instance
952 logger.error(e.getMessage(), e);
953 }
954
955 return;
956
957 }
958}