package org.greenstone.atea;

import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep or discard or greyListed folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, together with the MAXIMUM
 * (not minimum) number of characters allowed for a string to count as a word. These settings
 * can be adjusted in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included or excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is:
 * - precede a URL by ^ to blacklist urls that match the given prefix
 * - succeed a URL by $ to blacklist urls that match the given suffix
 * - ^url$ will blacklist urls that match the given url completely
 * - without either ^ or $ symbol, urls containing the given url will get blacklisted
 *
 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also for nutch.
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. It then uses a single instance of the WETProcessor class to
 * process each unzipped warc.wet file in turn.
 *
 * To compile, include the jars in lib/:
 *   maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 *   maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */
public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    // constants for the possible fixed values in the sites-too-big-to-exhaustively-crawl.txt file
    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
    public final String SINGLEPAGE = "SINGLEPAGE";

    /**
     * Characters that need escaping if used as a string literal in a regex
     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
     * https://www.regular-expressions.info/refcharacters.html
     */
    //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];
    // put the \\ at the start so we don't end up escaping the backslashes inserted when escaping earlier chars
    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File commoncrawlDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);

    /**
     * Store url patterns as keys; the values indicate whether a url should
     * match the pattern exactly, start/end with it, or contain it.
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of topsites with allowable regexes: sites too big to exhaustively crawl,
     * with an optional regex defining allowed exceptions, like subdomains or url suffixes
     * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
     * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
     * would be relevant.
     * The map stores the top site domain suffix and an optional regex string for allowable
     * url patterns.
     */
    private HashMap<String, String> topSitesMap;

    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
     * while a HashMap has no notion of ordering, because we just need to store urls with
     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html?
     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records that all WETProcessors instantiated
    // by our main method combined have processed
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources
        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));

        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }

        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }
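        // For reference, the conf/config.properties read above might contain something like
        // the following (the values shown here are just the defaults used when a property is
        // missing; adjust as needed):
        //
        //   WETprocessor.max.word.length=15
        //   WETprocessor.min.num.words=20
        //   WETprocessor.max.words.camelcase=10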
Unable to proceed."); } // prepare our blacklist, greylist (for inspection) and whitelist System.err.println("Loading blacklist."); blackList = new HashMap(); initURLFilterList(blackList, "url-blacklist-filter.txt"); System.err.println("Loading greylist."); greyList = new HashMap(); initURLFilterList(greyList, "url-greylist-filter.txt"); System.err.println("Loading whitelist."); whiteList = new HashMap(); initURLFilterList(whiteList, "url-whitelist-filter.txt"); // Create the map of topSites System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite."); topSitesMap = new HashMap(); //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt"); try ( BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8")); ) { String str = null; while((str = reader.readLine()) != null) { str = str.trim(); if(str.equals("") || str.startsWith("#")) { continue; } int tabindex = str.indexOf("\t"); if(tabindex == -1) { topSitesMap.put(str, ""); } else { String topsite = str.substring(0, tabindex).trim(); String allowed_url_pattern = str.substring(tabindex+1).trim(); topSitesMap.put(topsite, allowed_url_pattern); } } } catch (IOException ioe) { ioe.printStackTrace(); System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt"); } //System.err.println("Prematurely terminating for testing purposes."); //System.exit(-1); } /** Work out the 'domain' for a given url. * This retains any www. or subdomain prefix. */ private String getDomainForURL(String url, boolean withProtocol) { int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion // the keep the URL around in case param withProtocol=true String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); String domain = url.substring(startIndex); int endIndex = domain.indexOf("/"); if(endIndex == -1) endIndex = domain.length(); domain = domain.substring(0, endIndex); if(withProtocol) { // now that we have the domain (everything to the first / when there is no protocol) // can glue the protocol back on domain = protocol + domain; } return domain; } /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ private String escapeStringForRegex(String str) { for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) { char c = ESCAPE_CHARS_FOR_RE.charAt(i); str = str.replace(Character.toString(c), "\\"+c); } return str; } /** * Using the keepURLs.txt file generated by running WETProcessor instances, this produces * as output the URL seed list and regex-urlfilter text files required by nutch, see * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial */ public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile, File topSiteMatchesFile) { // Maintain a Map of unique domains mapped to seed urls at that domain // TreeSet: by default, "the elements are ordered using their natural ordering" // (or by a Comparator provided at set creation time). // Whereas HashSet doesn't guarantee ordering. // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. // Would be a similar distinction for Maps. 
    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                                    File domainURLsFile, File topSiteMatchesFile) {
        // Maintain a Map of unique domains mapped to seed urls at that domain
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
        // Would be a similar distinction for Maps.
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String PROTOCOL_REGEX_PREFIX = "+^https?://";
        final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
        ) {
            // read a URL at a time from urlsFile
            String url = null;
            String domainWithProtocol = null;
            while((url = reader.readLine()) != null) { // readLine removes the newline separator
                // work out the domain. This retains any www. or subdomain prefix;
                // passing true further also retains the http(s) protocol
                domainWithProtocol = getDomainForURL(url, true);

                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domainWithProtocol, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domainWithProtocol);
                    urlsSet.add(url);
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain
        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";

        // write out each domain followed in sequence by all urls we found in that domain
        // (urls with a tab up front)
        try (
            // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
            // Also a global file listing any urls that matched top sites that didn't specify
            // allowed regex patterns
            BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
            BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
            BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
            BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
        ) {
            // initialise topSiteMatchesFile with some instructional text.
            topSiteMatchesWriter.write("The following domains with seedURLs are on a major/top 500 site\n");
            topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
            topSiteMatchesWriter.write("Specify one for each such domain in the tab-separated sites-too-big-to-exhaustively-crawl.txt file\n");

            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            /*
            // DEBUG
            String value = topSitesMap.get("wikipedia.org");
            if(value == null) {
                System.err.println("### wikipedia.org had null value");
            } else {
                System.err.println("### wikipedia.org had value: " + value);
            } // DEBUG
            */

            while(domainIterator.hasNext()) {
                String domainWithProtocol = domainIterator.next();
                int startIndex = domainWithProtocol.indexOf("//"); // for the http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                String domain = domainWithProtocol.substring(startIndex);

                System.err.println("domain with protocol: " + domainWithProtocol);
                System.err.println("domain: " + domain);

                String allowedURLPatternRegex = isURLinTopSitesMap(domain);
                // If the domain is of a topsite for which no allowed URL pattern has been provided
                // in sites-too-big-to-exhaustively-crawl.txt,
                // then we don't know how to crawl the site. Warn the user by writing the affected
                // domain and seedURLs to the topSiteMatchesFile.
                if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
                    // topsite, but we don't (yet) know what portion can be crawled
                    // Append the top site and url to a global/toplevel file that
                    // the user needs to check later, and we're done with this domain as it
                    // won't go into any other file hereafter

                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        topSiteMatchesWriter.write("\t" + url + "\n");
                    }

                    continue; // done with this domain
                }

                // start counting the domains we're actually going to process
                domainCount++;

                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                //seedURLsWriter.write(domainWithProtocol + "\n");

                // for every domain, we need a sites/0000x/ folder, where x is the domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                    BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                    BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                ) {
                    // write all sorted unique domains into the global domains file
                    // Using the domain without protocol since the global domains file is for
                    // informational purposes
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into the single global seedurls file
                    // But write the domain and tabbed urls into the individual sites/0000#/seedURLs.txt
                    // files (and write the regexed domain into each sites/0000#/regex-urlfilter.txt)
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.
                    if(allowedURLPatternRegex == null) { // entire site can be crawled
                        siteURLsWriter.write(domainWithProtocol + "\n");

                        // Write out the filter in the following form for a site, e.g. for nutch.apache.org:
                        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                        String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
                        //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                        urlFilterWriter.write(regexed_domain + "\n"); // global file
                        siteRegexWriter.write(regexed_domain + "\n"); // site file
                    }
                    else { // domain belongs to a top site where only a portion of the site can be crawled

                        if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
                            siteURLsWriter.write(domainWithProtocol + "\n");

                            // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                            // and not for all of blogspot.com
                            String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
                            //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");
                        }
                        else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                            // don't write out the domain. We want individual pages
                            //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

                            // don't write out the domain as a regex expression url filter either,
                            // write out the individual seed urls for the domain instead
                            // since we will only be downloading the single page

                            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                            for(String urlInDomain : urlsForDomainSet) {
                                // don't append a slash to the end this time
                                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
                                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                                urlFilterWriter.write(regexed_url + "\n");
                                siteRegexWriter.write(regexed_url + "\n");
                            }
                        }
                        else { // allowedURLPatternRegex is a url-form - convert to regex
                            if(!allowedURLPatternRegex.endsWith("/")) {
                                allowedURLPatternRegex += "/";
                            }
                            String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
                            //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            urlFilterWriter.write(regexed_pattern + "\n");
                            siteRegexWriter.write(regexed_pattern + "\n");
                        }
                    }

                    // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
                    // also write them into the global seeds file (with a tab prefixed to each?)
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write(url + "\n");
                    }

                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
                }

            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to one of: ");
            System.err.println("\t" + seedURLsFile);
            System.err.println("\t" + urlFilterFile);
            System.err.println("\t" + domainURLsFile);
            System.err.println("\t" + topSiteMatchesFile);
        }

        /*
        // BEGIN DEBUG
        System.err.println("@@@@ TopSitesMap contains: ");
        for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
            String topSite = entry.getKey();
            String urlPattern = entry.getValue();
            System.err.println(topSite + " - " + urlPattern);
        } // END DEBUG
        */
    }

    private String stripSubDomain(String url) {
        int index = url.indexOf(".");
        if(index != -1) {
            url = url.substring(index+1);
        }
        return url;
    }

    /**
     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
     * with or without a www. prefix. This method tests for such a case as it would be dangerous
     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
     */
    private boolean isExactDomainMatch(String seedURLDomain, String domain) {
        // check for an exact match as-is
        if(seedURLDomain.equals(domain)) {
            return true;
        }

        // else check if with or without a www. prefix we have an exact match with domain
        if(seedURLDomain.startsWith("www.")) {
            if(seedURLDomain.substring(4).equals(domain)) {
                return true;
            }
        } else {
            if(domain.equals("www."+seedURLDomain)) {
                return true;
            }
        }

        return false;
    }
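    /*
     * Illustrative examples of the two helpers above (hypothetical domains, shown only
     * to clarify the intended behaviour):
     *
     *   stripSubDomain("mi.wikipedia.org")                        ->  "wikipedia.org"
     *   isExactDomainMatch("www.blogspot.com", "blogspot.com")    ->  true
     *   isExactDomainMatch("pinky.blogspot.com", "blogspot.com")  ->  false
     */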
    /**
     * Check if the domain of the seedurl, either in its entirety or when stripped of
     * www/subdomains, is in the list of top sites.
     * If it is, and the given url matches the regex for that topsite, then add the url to the
     * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
     * @param fullSeedDomain: domain of the seedURL without the protocol. May include the www. prefix.
     * @return one of the following values:
     * - This function returns null if the seedURL's domain does not match any of the topsites.
     * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
     * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
     * URL pattern.)
     */
    private String isURLinTopSitesMap(String fullSeedDomain) {
        boolean keepLooping = true;

        String domain = fullSeedDomain;
        // the domain parameter will have retained www or subdomains, but is stripped of the protocol

        // keep looping, stripping subdomains from the url and checking if it matches a topsite domain
        // if it does, return the value for that topsite domain in the topSitesMap
        // If no match at all, return null.
        do {

            String allowed_url_pattern = topSitesMap.get(domain);
            if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
                // there's an entry for the URL in the topSitesMap
                System.err.println("##### A top site matches URL domain " + domain);

                // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
                // www prefix, should not exactly match the topSitesMap domain
                // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
                // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.

                if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
                    return ""; // means don't crawl the site, write the url into the unprocessed-topsite-matches file
                }
                return allowed_url_pattern;
            }
            // else, no entry for the URL in the topSitesMap
            // We're not done yet: strip the subdomain from the URL and check it against the topSitesMap again

            String newDomain = stripSubDomain(domain);
            if(domain.equals(newDomain)) {
                keepLooping = false;
            } else {
                domain = newDomain;
            }
        } while(keepLooping);

        // the url in its entirety or stripped of subdomains did not match any of the topsites
        return null;
    }

    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        //Set<Map.Entry<String, Integer>> entries = filterListMap.entrySet();
        //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        //while(i.hasNext()) {
        //    Map.Entry<String, Integer> entry = i.next();
        for(Map.Entry<String, Integer> entry : filterListMap.entrySet()) {
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }
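    /*
     * For example, a hypothetical filter file (example.com used purely for illustration)
     * containing the four kinds of entries
     *
     *   ^http://example.com/ads        -> stored as LIST_ENTRY_STARTSWITH
     *   .pdf$                          -> stored as LIST_ENTRY_ENDSWITH
     *   ^http://example.com/exact$     -> stored as LIST_ENTRY_MATCHES
     *   tracker                        -> stored as LIST_ENTRY_CONTAINS
     *
     * would make isListedInFilterList() above return true for urls starting with
     * "http://example.com/ads", ending in ".pdf", equal to "http://example.com/exact",
     * or containing "tracker" anywhere, respectively.
     */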
    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if eventually the same url pattern is found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Will eventually take precedence over the same URL pattern being mentioned in the blacklist.
     * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
     */
    public boolean isGreylisted(String url) {
        // auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Checks the URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt
     * to decide whether it is in the mentioned black|grey|white list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
            BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
        ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }

                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }
    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {

        // Will list all the warc.wet files in the input directory or else their gzipped versions
        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());

        int wetRecordCount = 0;
        int wetFileCount = 0;

        for(int i = 0; i < WETFiles.length; i++) {
            File WETFile = WETFiles[i];
            logger.debug("Processing WETfile: " + WETFile);

            // Any .gz files listed means they haven't been unzipped yet. So unzip.
            String WETFilename = WETFile.toString();
            if(WETFilename.endsWith(".gz")) {
                File GZippedWETFile = WETFile;
                String WETGZippedFilename = WETFilename;
                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
                WETFile = new File(WETFilename);
                Utility.unzipFile(GZippedWETFile, WETFile);
            }
            // hereafter all WETFiles should refer to the unzipped version
            // Check the unzipped WETFile exists

            if(!WETFile.exists() || !WETFile.isFile()) {
                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                return;
            }

            // Finally, we can process this WETFile's records into the keep and discard pile
            wetFileCount++;
            logger.debug("Off to process " + WETFile);
            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
            wetFileProcessor.processWETFile();
            wetRecordCount += wetFileProcessor.getRecordCount();
        }

        // for information purposes
        this.setWETFileCount(wetFileCount);
        this.setRecordCount(wetRecordCount);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tWetProcessor <folder containing warc.wet(.gz) files> <outputFolder>");
    }

    /** Filename filter to only list warc.wet files or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count the gzipped version if the unzipped version exists.
                } else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with the gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }
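    /*
     * For example (hypothetical filenames), given a crawl folder named CC-MAIN-2019-26-wet-files
     * containing:
     *
     *   example-00000.warc.wet      -> accepted by WETFilenameFilter above
     *   example-00001.warc.wet.gz   -> accepted only if example-00001.warc.wet does not
     *                                  already exist alongside it
     *   notes.txt                   -> skipped ("Not a WET file")
     *
     * while CCrawlWETFolderFilenameFilter below accepts only directories whose names match
     * CC-MAIN-\d{4}-\d{2}-wet-files, e.g. CC-MAIN-2019-26-wet-files.
     */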
    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            File f = new File(dir, name);
            if(f.isDirectory()) {
                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
                    return true;
                }
            }
            else {
                System.err.println("File " + f + " is not a directory");
            }
            return false;
        }
    }

    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }

        File commoncrawlDir = new File(args[0]);
        if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);

            File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

            for(int i = 0; i < ccrawlFolders.length; i++) {
                File ccrawlFolder = ccrawlFolders[i];
                System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
                ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
            }

            // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
            // The domains file is the only unique one. seedURLs and regex-urlfilters are
            // repeated on a per site/domain basis too, stored in the sites folder
            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
            File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");

            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
            System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");

        } catch(Exception e) {
            // can get an exception when instantiating the CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;
    }
}
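/*
 * For reference, after a successful run the <outputFolder> passed to main() should roughly
 * contain the following, based on the files and folders created by the code above:
 *
 *   keep/  discard/  greylisted/                     per-record WET output folders
 *   keepURLs.txt  discardURLs.txt  greyListed.txt    url listings per category
 *   seedURLs.txt  regex-urlfilter.txt                global files for nutch
 *   all-domain-urls.txt  unprocessed-topsite-matches.txt
 *   sites/00001/seedURLs.txt  sites/00001/regex-urlfilter.txt   (one numbered folder per domain)
 */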