Timestamp: 2019-10-09T23:10:06+13:00
File: 1 edited
Legend:
  (unmarked)  Unmodified
  +           Added
  -           Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33552 → r33557

      private HashMap<String, Integer> whiteList;
 
+     /** map of topsites with allowable regexes: sites too big to exhaustively crawl
+      * with optional regex defining allowed exceptions, like subdomains or url suffixes
+      * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
+      * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
+      * would be relevant.
+      * The map would store top site domain suffix and an optional regex string for allowable
+      * url patterns.
+      */
+     private HashMap<String, String> topSitesMap;
+
      /** Map of domains we keep and the full urls we're keeping that are of that domain.
       * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
…
          initURLFilterList(whiteList, "url-whitelist-filter.txt");
 
+         // Create the map of topSites
+         System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
+         topSitesMap = new HashMap<String, String>();
+         //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
+
+         try (
+             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
+         ) {
+
+             String str = null;
+             while((str = reader.readLine()) != null) {
+                 str = str.trim();
+                 if(str.equals("") || str.startsWith("#")) {
+                     continue;
+                 }
+
+                 int tabindex = str.indexOf("\t");
+                 if(tabindex == -1) {
+                     topSitesMap.put(str, "");
+                 } else {
+                     String topsite = str.substring(0, tabindex).trim();
+                     String allowed_url_pattern = str.substring(tabindex+1).trim();
+                     topSitesMap.put(topsite, allowed_url_pattern);
+                 }
+             }
+         } catch (IOException ioe) {
+             ioe.printStackTrace();
+             System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
+         }
+
          //System.err.println("Prematurely terminating for testing purposes.");
          //System.exit(-1);
+     }
+
+     /** Work out the 'domain' for a given url.
+      * This retains any www. or subdomain prefix.
+      */
+     private String getDomainForURL(String url) {
+         int startIndex = url.indexOf("//"); // http:// or https:// prefix
+         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+         String domain = url.substring(startIndex);
+         int endIndex = domain.indexOf("/");
+         if(endIndex == -1) endIndex = domain.length();
+         domain = domain.substring(0, endIndex);
+
+         return domain;
      }
 
…
       * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
       */
-     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
+     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
+                                     File domainURLsFile, File topSiteMatchesFile) {
          // Maintain Sets of unique domains and urls
          // TreeSet: by default, "the elements are ordered using their natural ordering"
…
              while((url = reader.readLine()) != null) { // readLine removes newline separator
 
-                 // work out domain. This retains any www. or subdomain prefix:
-                 int startIndex = url.indexOf("//"); // http:// or https:// prefix
-                 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
-                 domain = url.substring(startIndex);
-                 int endIndex = domain.indexOf("/");
-                 if(endIndex == -1) endIndex = domain.length();
-                 domain = domain.substring(0, endIndex);
+                 // work out domain. This retains any www. or subdomain prefix
+                 domain = getDomainForURL(url);
 
                  //urlsSet.add(url);
…
          // We'd have pruned out duplicates by now and have a sorted list of domains,
          // each of which maps to seed URLs in the commoncrawl for that domain
-
-         /*
-         try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
-             Iterator<String> i = urlsSet.iterator();
-             while(i.hasNext()) {
-                 String url = i.next();
-                 seedURLsWriter.write(url + "\n");
-             }
-
-         } catch (IOException ioe) {
-             ioe.printStackTrace();
-             System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
-         }
-         */
-
+
          int domainCount = 0;
          File sitesFolder = new File(outputFolder, "sites");
…
          try (
              // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+             // Also a global file listing any urls that matched top sites that didn't specify
+             // allowed regex patterns
              BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
              BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
-             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
+             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
+             BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
          ) {
+
+             // initialise topSiteMatchesFile with some instructional text.
+             topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
+             topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
+             topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
+
              //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
              Set<String> domainsSet = domainsToURLsMap.keySet();
              Iterator<String> domainIterator = domainsSet.iterator();
+
+             /*
+             // DEBUG
+             String value = topSitesMap.get("wikipedia.org");
+             if(value == null) {
+                 System.err.println("### wikipedia.org had null value");
+             } else {
+                 System.err.println("### wikipedia.org had value: " + value);
+             } // DEBUG
+             */
 
              while(domainIterator.hasNext()) {
-                 domainCount++;
+                 String domain = domainIterator.next();
+
+                 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
+                 // If the domain is of a topsite for which no allowed URL pattern has been provided
+                 // in sites-too-big-to-exhaustively-crawl.txt,
+                 // then we don't know how to crawl the site. Warn the user by writing the affected
+                 // domain and seedURLs to the topSiteMatchesFile.
+                 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
+                     // topsite, but we don't (yet) know what portion can be crawled
+                     // Append the top site and url to a global/toplevel file that
+                     // the user needs to check later and we're done with this domain as it
+                     // won't go into any other file hereafter
+
+                     Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+                     Iterator<String> urlIterator = urlsForDomainSet.iterator();
+                     while(urlIterator.hasNext()) {
+                         String url = urlIterator.next();
+                         topSiteMatchesWriter.write("\t" + url + "\n");
+                     }
+
+                     continue; // done with this domain
+                 }
+
+                 // start counting the domains we're actually going to process
+                 domainCount++;
+
                  String siteID = String.format(FORMATSTR, domainCount);
                  File domainFolder = new File(sitesFolder, siteID);
                  domainFolder.mkdir();
 
-                 // write out the domain
-                 String domain = domainIterator.next();
+                 // write out the domain
                  //seedURLsWriter.write(domain + "\n");
-                 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-                 String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
-                 urlFilterWriter.write(regexed_domain + "\n");
+
 
                  // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
…
                  // If we ever run nutch on a single seedURLs listing containing
                  // all seed pages to crawl sites from, the above two files will work for that.
-                 siteURLsWriter.write(domain + "\n");
-                 siteRegexWriter.write(regexed_domain + "\n");
 
-                 // next write out the urls for the domain with a tab prefixed to each
-                 // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
+
+                 if(allowedURLPatternRegex == null) { // entire site can be crawled
+                     siteURLsWriter.write(domain + "\n");
+
+                     // Write out filter in the following form for a site, e.g. for nutch.apache.org:
+                     // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
+                     String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
+                     urlFilterWriter.write(regexed_domain + "\n"); //global file
+                     siteRegexWriter.write(regexed_domain + "\n"); // site file
+                 }
+                 else { // domain belongs to a top site where only portion of site can be crawled
+
+                     if(allowedURLPatternRegex.equals("COPY")) { // COPY existing domain as url-filter
+                         siteURLsWriter.write(domain + "\n");
+                         // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
+                         // and not for all of blogspot.com
+
+                         String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/";
+                         urlFilterWriter.write(regexed_domain + "\n");
+                         siteRegexWriter.write(regexed_domain + "\n");
+
+                     } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) {
+                         // don't write out domain. We want individual pages
+                         //DON'T DO: siteURLsWriter.write(domain + "\n");
+
+                         // don't write out domain as a regex expression url filter
+                         // write out the individual seed urls for the domain instead
+                         // since we will only be downloading the single page
+
+                         Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+                         for(String urlInDomain : urlsForDomainSet) {
+                             String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
+                             urlFilterWriter.write(regexed_url + "\n");
+                             siteRegexWriter.write(regexed_url + "\n");
+                         }
+                     } else { // allowedURLPatternRegex is a url-form - convert to regex
+                         String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\.");
+                         siteURLsWriter.write(domain + "\n");
+                         urlFilterWriter.write(regexed_pattern + "\n");
+                         siteRegexWriter.write(regexed_pattern + "\n");
+
+                     }
+                 }
+
+                 // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
+                 // also write into the global seeds file
+                 // (with a tab prefixed to each url?)
                  Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                  Iterator<String> urlIterator = urlsForDomainSet.iterator();
                  while(urlIterator.hasNext()) {
                      String url = urlIterator.next();
-                     seedURLsWriter.write( url + "\n"); // global seedURLs file
-                     siteURLsWriter.write("\t" + url + "\n");
+                     seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file
+                     siteURLsWriter.write("\t" + url + "\n");
                  }
              } catch (IOException ioe) {
…
                  System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
              }
-         }
-
+
+         }
+
          } catch (IOException ioe) {
              ioe.printStackTrace();
…
              System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
          }
+
+         /*
+         // BEGIN DEBUG
+         System.err.println("@@@@ TopSitesMap contains: ");
+         for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
+             String topSite = entry.getKey();
+             String urlPattern = entry.getValue();
+             System.err.println(topSite + " - " + urlPattern);
+         } // END DEBUG
+         */
+     }
+
+     private String stripSubDomain(String url) {
+         int index = url.indexOf(".");
+         if(index != -1) {
+             url = url.substring(index+1);
+         }
+         return url;
+     }
+
+     /**
+      * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,
+      * is in the list of top sites.
+      * If it is, and the given url matches the regex for that topsite, then add the url to the
+      * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
+
+      */
+     private String isURLinTopSitesMap(String domain) {
+         boolean keepLooping = true;
+
+         // domain aprameter will have retained www or subdomains, but is stripped of protocol
+
+         // keep looping, stripping subdomains from url and checking if it matches a topsite domain
+         // if it does, return the value for that topsite domain in the topSitesMap
+         // If no match at all, return null.
+         do {
+             if(domain.contains("pinterest.com")) {
+                 System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");
+             }
+
+             String allowed_url_pattern = topSitesMap.get(domain);
+             if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
+                 // there's an entry for the URL in the topSitesMap
+                 System.err.println("##### A top site matches URL domain " + domain);
+                 return allowed_url_pattern;
+             }
+             // else, no entry for the URL in the topSitesMap
+             // Not done: strip subDomain from URL and check it against topSitesMap
+
+             String newURL = stripSubDomain(domain);
+             if(domain.equals(newURL)) keepLooping = false;
+             else domain = newURL;
+         } while(keepLooping);
+
+         // url in entirety or stripped of subdomains did not match any of the topsites
+         return null;
      }
 
      private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
-         Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
-         Iterator<Map.Entry<String, Integer>> i = entries.iterator();
-         while(i.hasNext()) {
-             Map.Entry<String, Integer> entry = i.next();
+         //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
+         //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
+         //while(i.hasNext()) {
+         //    Map.Entry<String, Integer> entry = i.next();
+         for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
              String urlPattern = entry.getKey();
              Integer matchRule = entry.getValue();
…
       */
      public boolean isGreylisted(String url) {
-         // TODO: alexa top sites and auto-translated product sites
+         // auto-translated product sites
          return isListedInFilterList(greyList, url);
      }
…
      }
 
-     // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+
+     // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
      // The former is the only unique one. seedURLs and regex-urlfilters are
      // repeated on a per site/domain basis too, stored in the sites folder
…
      File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
      File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
-     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);
+     File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
+
+     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
 
      System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+
+     System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");
+
 
      } catch(Exception e) {
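For reference, the new constructor code in this changeset parses conf/sites-too-big-to-exhaustively-crawl.txt as a tab-separated file: each non-comment line holds a top-site domain, optionally followed by a single tab and an allowed URL pattern. An empty pattern means the domain's seed URLs are only reported in unprocessed-topsite-matches.txt, COPY means only the seed URL's own (sub)domain is crawled, SINGLEPAGE means only the individual seed pages are fetched, and any other value is treated as a URL prefix that createSeedURLsFiles() turns into a +^https?:// regex filter. A minimal sketch of what such a file could look like (these entries are illustrative only, not taken from the real conf file; the two columns are separated by a single tab):

    # topsite-domain <TAB> allowed-url-pattern  (pattern column optional)
    pinterest.com
    blogspot.com	COPY
    docs.google.com	SINGLEPAGE
    wikipedia.org	mi.wikipedia.org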
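To see how the pieces fit together, here is a minimal, standalone Java sketch (not part of the changeset; the class name, map entries and URL are made up) of the domain/topsite lookup that getDomainForURL(), stripSubDomain() and isURLinTopSitesMap() perform before createSeedURLsFiles() decides which filter to write:

    import java.util.HashMap;

    // Hypothetical standalone example mirroring the changeset's lookup logic.
    public class TopSiteLookupSketch {

        // Same idea as CCWETProcessor.getDomainForURL(): strip the protocol and any path,
        // but keep www. or other subdomain prefixes.
        static String getDomainForURL(String url) {
            int startIndex = url.indexOf("//"); // http:// or https:// prefix
            startIndex = (startIndex == -1) ? 0 : (startIndex + 2);
            String domain = url.substring(startIndex);
            int endIndex = domain.indexOf("/");
            if (endIndex == -1) endIndex = domain.length();
            return domain.substring(0, endIndex);
        }

        // Same idea as CCWETProcessor.stripSubDomain(): drop the leftmost dot-separated label.
        static String stripSubDomain(String domain) {
            int index = domain.indexOf(".");
            return (index == -1) ? domain : domain.substring(index + 1);
        }

        public static void main(String[] args) {
            // Hypothetical topSitesMap contents; in the real code these are read from
            // conf/sites-too-big-to-exhaustively-crawl.txt.
            HashMap<String, String> topSitesMap = new HashMap<String, String>();
            topSitesMap.put("wikipedia.org", "mi.wikipedia.org");
            topSitesMap.put("pinterest.com", "");

            String domain = getDomainForURL("https://mi.wikipedia.org/wiki/SomePage"); // mi.wikipedia.org

            // Keep stripping subdomains until a topSitesMap entry matches or nothing is left,
            // mirroring isURLinTopSitesMap(); null means the domain is not a top site at all.
            String allowedURLPatternRegex = null;
            boolean keepLooping = true;
            do {
                if (topSitesMap.containsKey(domain)) {
                    allowedURLPatternRegex = topSitesMap.get(domain);
                    keepLooping = false;
                } else {
                    String stripped = stripSubDomain(domain);
                    if (stripped.equals(domain)) keepLooping = false;
                    else domain = stripped;
                }
            } while (keepLooping);

            // Prints: allowed URL pattern: mi.wikipedia.org
            System.out.println("allowed URL pattern: " + allowedURLPatternRegex);
        }
    }

With these hypothetical entries, mi.wikipedia.org resolves (after one subdomain strip) to the wikipedia.org entry and so only mi.wikipedia.org would be crawled, while a pinterest.com seed URL would end up listed in unprocessed-topsite-matches.txt.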