Timestamp:
2019-10-10T23:49:58+13:00 (5 years ago)
Author:
ak19
Message:
  1. Incorporated Dr Bainbridge's suggested improvements: SUBDOMAIN-COPY should only be active when the seed URL's domain is a true subdomain of the matched topsite's domain; otherwise it should be deactivated on a topsites match. For example, if the seed URL's domain is pinky.blogspot.com, then SUBDOMAIN-COPY may crawl that site, since it is not all of blogspot.com. But if the seed URL's domain were blogspot.com itself, it would still match the topsite entry blogspot.com, whose value is SUBDOMAIN-COPY; in that case the value must be overridden so that the whole site is not crawled.
  2. More complete regex escaping for the regex-urlfilter.txt file.
  3. The domainsToURLsMap now contains the domain WITH the protocol prefix, which required adjustments in the rest of the code.
  4. Together with the changes to the blacklist, whitelist and topsites file (the sites-too-big-to-exhaustively-crawl.txt file), I think the code now deals with all the known wanted urls among the topsites and generates the correct output for the seedURLs and regex-urlfilter files.
File:
1 edited

Legend:

  (no prefix)  Unmodified
  +            Added
  -            Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33557 r33560  
     public final int MIN_NUM_WORDS;
     public final int MAX_WORDS_CAMELCASE;
+
+    // constants for the possible fixed values in the sites-too-big-to-exhaustively-crawl.txt file
+    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
+    public final String SINGLEPAGE = "SINGLEPAGE";
+
+    /**
+     * Characters that need escaping if used as a string literal in a regex
+     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
+     * https://www.regular-expressions.info/refcharacters.html
+     */
+    //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];
+    // put the \\ at the start so we don't end up escaping the backslashes added when escaping the other chars
+    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";

     private Properties configProperties = new Properties();
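
For orientation: the two constants above name values that can appear in the sites-too-big-to-exhaustively-crawl.txt file. A hypothetical excerpt follows; only the blogspot.com/SUBDOMAIN-COPY pairing is confirmed by the commit message, and the other entries and the exact file layout are illustrative assumptions:

        blogspot.com     SUBDOMAIN-COPY
        example.com      SINGLEPAGE
        example.org      example.org/some-allowed-path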
     
      * This retains any www. or subdomain prefix.
      */
-    private String getDomainForURL(String url) {
-    int startIndex = url.indexOf("//"); // http:// or https:// prefix
+    private String getDomainForURL(String url, boolean withProtocol) {
+    int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
     startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+    // keep the protocol portion of the URL around in case param withProtocol=true
+    String protocol = (startIndex == 0) ? "" : url.substring(0, startIndex);
+
     String domain = url.substring(startIndex);
     int endIndex = domain.indexOf("/");
     if(endIndex == -1) endIndex = domain.length();
     domain = domain.substring(0, endIndex);
+
+    if(withProtocol) {
+        // now that we have the domain (everything up to the first / when there is no protocol),
+        // we can glue the protocol back on
+        domain = protocol + domain;
+    }

     return domain;
+    }
+
+    /** Utility function to help escape regex characters in a URL headed for regex-urlfilter.txt */
+    private String escapeStringForRegex(String str) {
+    for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
+        char c = ESCAPE_CHARS_FOR_RE.charAt(i);
+        str = str.replace(Character.toString(c), "\\"+c);
+    }
+    return str;
     }

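A minimal usage sketch of the two helpers above, following the code as committed (the URLs are illustrative, not taken from the repository):

        // getDomainForURL("https://pinky.blogspot.com/some/page", true)  returns "https://pinky.blogspot.com"
        // getDomainForURL("https://pinky.blogspot.com/some/page", false) returns "pinky.blogspot.com"
        // escapeStringForRegex("nutch.apache.org") returns the text nutch\.apache\.org,
        // which is the form written into regex-urlfilter.txt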
     
     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                     File domainURLsFile, File topSiteMatchesFile) {
-    // Maintain Sets of unique domains and urls
+    // Maintain a Map of unique domains mapped to the seed urls at each domain
     // TreeSet: by default, "the elements are ordered using their natural ordering"
     // (or by a Comparator provided at set creation time).
     // Whereas HashSet doesn't guarantee ordering.
     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
-
-    //Set<String> domainsSet = new TreeSet<String>();
-    //Set<String> urlsSet = new TreeSet<String>();
+    // A similar distinction holds for Maps.
     domainsToURLsMap = new TreeMap<String, Set<String>>();
-
-    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
+
+    final String PROTOCOL_REGEX_PREFIX = "+^https?://";
+    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

     try (

         // read a URL at a time from urlsFile
         String url = null;
-        String domain = null;
+        String domainWithProtocol = null;
         while((url = reader.readLine()) != null) { // readLine removes the newline separator

-        // work out the domain. This retains any www. or subdomain prefix
-        domain = getDomainForURL(url);
-
-        //urlsSet.add(url);
-        //domainsSet.add(domain);
+        // work out the domain. This retains any www. or subdomain prefix;
+        // passing true also retains the http(s) protocol
+        domainWithProtocol = getDomainForURL(url, true);
+
         Set<String> urlsSet;
-        if(!domainsToURLsMap.containsKey(domain)) {
+        if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
             urlsSet = new TreeSet<String>();
             urlsSet.add(url);
-            domainsToURLsMap.put(domain, urlsSet);
+            domainsToURLsMap.put(domainWithProtocol, urlsSet);
         } else {
-            urlsSet = domainsToURLsMap.get(domain);
+            urlsSet = domainsToURLsMap.get(domainWithProtocol);
             urlsSet.add(url);
         }
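After this read loop, domainsToURLsMap groups each seed URL under its protocol-prefixed domain. A sketch with hypothetical seed URLs:

        // "https://pinky.blogspot.com" -> { "https://pinky.blogspot.com/2019/01/post.html" }
        // "https://www.example.org"    -> { "https://www.example.org/page1", "https://www.example.org/page2" }
        // TreeMap/TreeSet keep the domains, and each domain's urls, in sorted order.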
     

         while(domainIterator.hasNext()) {
-        String domain = domainIterator.next();
+        String domainWithProtocol = domainIterator.next();
+        int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
+        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+        String domain = domainWithProtocol.substring(startIndex);
+
+        System.err.println("domain with protocol: " + domainWithProtocol);
+        System.err.println("domain: " + domain);

         String allowedURLPatternRegex = isURLinTopSitesMap(domain);

         // domain and seedURLs to the topSiteMatchesFile.
         if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
+
             // topsite, but we don't (yet) know what portion can be crawled
             // Append the top site and url to a global/toplevel file that

             // won't go into any other file hereafter

-            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
             Iterator<String> urlIterator = urlsForDomainSet.iterator();
             while(urlIterator.hasNext()) {

         // write out the domain
-        //seedURLsWriter.write(domain + "\n");
+        //seedURLsWriter.write(domainWithProtocol + "\n");


             // write all sorted unique domains into the global domains file
+            // Using the domain without the protocol, since the global domains file is for
+            // informational purposes
             domainURLsWriter.write(domain + "\n");

             // files (and write the regexed domain into each sites/0000#/regex-urlfilter.txt)
             // If we ever run nutch on a single seedURLs listing containing
             // all seed pages to crawl sites from, the above two files will work for that.

             if(allowedURLPatternRegex == null) { // the entire site can be crawled
-            siteURLsWriter.write(domain + "\n");
+            siteURLsWriter.write(domainWithProtocol + "\n");

             // Write out a filter in the following form for a site, e.g. for nutch.apache.org:
             // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-            String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
+            String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
+            //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
             urlFilterWriter.write(regexed_domain + "\n"); // global file
             siteRegexWriter.write(regexed_domain + "\n"); // site file
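For the entire-site branch just above, the emitted filter follows the pattern shown in the code comment. A worked example with a hypothetical domain:

        // domain "pinky.blogspot.com" produces the filter line:
        //   +^https?://([a-z0-9-]+\.)*pinky\.blogspot\.com/
        // which admits pinky.blogspot.com and any of its subdomains.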
     
         else { // domain belongs to a top site where only a portion of the site can be crawled

-            if(allowedURLPatternRegex.equals("COPY")) { // COPY the existing domain as the url-filter
-                siteURLsWriter.write(domain + "\n");
+            if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY the existing domain as the url-filter
+                siteURLsWriter.write(domainWithProtocol + "\n");
                 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                 // and not for all of blogspot.com

-                String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/";
-                urlFilterWriter.write(regexed_domain + "\n");
-                siteRegexWriter.write(regexed_domain + "\n");
-
-            } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) {
+                String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
+                //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
+                urlFilterWriter.write(regexed_domain + "\n");
+                siteRegexWriter.write(regexed_domain + "\n");
+
+            } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                 // don't write out the domain. We want individual pages
-                //DON'T DO: siteURLsWriter.write(domain + "\n");
+                //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

-                // don't write out the domain as a regex expression url filter
+                // don't write out the domain as a regex expression url filter either,
                 // write out the individual seed urls for the domain instead
                 // since we will only be downloading the single page

-                Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
-                for(String urlInDomain : urlsForDomainSet) {
-                String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
+                Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
+                for(String urlInDomain : urlsForDomainSet) {
+                // don't append a slash to the end this time
+                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
+                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                 urlFilterWriter.write(regexed_url + "\n");
                 siteRegexWriter.write(regexed_url + "\n");
                 }
             } else { // allowedURLPatternRegex is in url-form - convert it to a regex
-                String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\.");
-                siteURLsWriter.write(domain + "\n");
+                if(!allowedURLPatternRegex.endsWith("/")) {
+                allowedURLPatternRegex += "/";
+                }
+                String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
+                //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
+                siteURLsWriter.write(domainWithProtocol + "\n");
                 urlFilterWriter.write(regexed_pattern + "\n");
                 siteRegexWriter.write(regexed_pattern + "\n");
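A sketch of the filter line each topsite branch above would emit, on hypothetical inputs:

        // SUBDOMAIN-COPY with seed domain pinky.blogspot.com:
        //   +^https?://pinky\.blogspot\.com/             (no subdomain wildcard this time)
        // SINGLEPAGE with seed url https://example.com/page.html:
        //   +^https://example\.com/page\.html            (the seed url's own protocol; no trailing slash appended)
        // an allowed-url-pattern value of example.org/some-allowed-path:
        //   +^https?://example\.org/some-allowed-path/   (a trailing slash is appended if missing)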
     

             // next, write out the urls for the domain into the sites/0000x/seedURLs.txt file
-            // also write into the global seeds file
-            // (with a tab prefixed to each url?)
-            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+            // also write them into the global seeds file
+            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
             Iterator<String> urlIterator = urlsForDomainSet.iterator();
             while(urlIterator.hasNext()) {
             String url = urlIterator.next();
-            seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file
-            siteURLsWriter.write("\t" + url + "\n");
+            seedURLsWriter.write(url + "\n"); // global seedURLs file
+            siteURLsWriter.write(url + "\n");
             }
         } catch (IOException ioe) {
             ioe.printStackTrace();
-            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
+            System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
         }

     
     } catch (IOException ioe) {
         ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
-    }
-
-    // write out domains as regular expressions into "regex-urlfilter.txt" file
-    try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
-        Set<String> domainsSet = domainsToURLsMap.keySet();
-        Iterator<String> i = domainsSet.iterator();
-        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-        while(i.hasNext()) {
-        String domain = i.next();
-        domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
-        urlFilterWriter.write(domain + "\n");
-        }
-
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
-    }
+        System.err.println("\n@@@@@@@@@ Error writing to one of: ");
+        System.err.println("\t" + seedURLsFile);
+        System.err.println("\t" + urlFilterFile);
+        System.err.println("\t" + domainURLsFile);
+        System.err.println("\t" + topSiteMatchesFile);
+    }

     /*

     return url;
     }
-
+
+
+    /**
+     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
+     * with or without the www. prefix. This method tests for such a case, as it would be dangerous
+     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
+     */
+    private boolean isExactDomainMatch(String seedURLDomain, String domain) {
+    // check for an exact match as-is
+    if(seedURLDomain.equals(domain)) {
+        return true;
+    }
+
+    // else check whether, with or without a www. prefix, we have an exact match with domain
+    if(seedURLDomain.startsWith("www.")) {
+        if(seedURLDomain.substring(4).equals(domain)) {
+        return true;
+        }
+    } else {
+        if(domain.equals("www."+seedURLDomain)) {
+        return true;
+        }
+    }
+
+    return false;
+    }
+
+
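The new method's behaviour on hypothetical inputs:

        // isExactDomainMatch("blogspot.com", "blogspot.com")        returns true
        // isExactDomainMatch("www.blogspot.com", "blogspot.com")    returns true  (www. prefix stripped)
        // isExactDomainMatch("blogspot.com", "www.blogspot.com")    returns true  (www. prefix supplied)
        // isExactDomainMatch("pinky.blogspot.com", "blogspot.com")  returns false (a genuine subdomain)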
     /**
-     * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,
-     * is in the list of top sites.
+     * Check if the domain of the seedurl, either in its entirety or when stripped of
+     * www/subdomains, is in the list of top sites.
      * If it is, and the given url matches the regex for that topsite, then add the url to the
      * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
-
-     */
-    private String isURLinTopSitesMap(String domain) {
+     * @param fullSeedDomain: domain of the seedURL without the protocol. May include the www. prefix.
+     * @return one of the following values:
+     *  - This function returns null if the seedURL's domain does not match any of the topsites.
+     *  - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
+     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
+     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
+     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
+     *  - A non-empty String is returned if the seedURL's domain matched a topsite and a value
+     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
+     * URL pattern.)
+     */
+    private String isURLinTopSitesMap(String fullSeedDomain) {
     boolean keepLooping = true;

+    String domain = fullSeedDomain;
+
     // the domain parameter will have retained www or subdomains, but is stripped of the protocol


     // If no match at all, return null.
     do {
-        if(domain.contains("pinterest.com")) {
-        System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");
-        }

         String allowed_url_pattern = topSitesMap.get(domain);
         if(allowed_url_pattern != null) { // i.e. topSitesMap.containsKey(domain)
         // there's an entry for the URL in the topSitesMap
-        System.err.println("##### A top site matches URL domain " + domain);
-        return allowed_url_pattern;
+        System.err.println("##### A top site matches URL domain " + domain);
+
+        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
+        // the www prefix, should not exactly match the topSitesMap domain
+        // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
+        // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
+
+        if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
+            return ""; // means don't crawl the site; write the url into the unprocessed-topsite-matches file
+        }
+        return allowed_url_pattern;
         }
         // else, no entry for the URL in the topSitesMap
-        // Not done: strip subDomain from URL and check it against topSitesMap
+        // We're not done yet: strip the subDomain from the URL and check it against topSitesMap again

-        String newURL = stripSubDomain(domain);
-        if(domain.equals(newURL)) keepLooping = false;
-        else domain = newURL;
+        String newDomain = stripSubDomain(domain);
+        if(domain.equals(newDomain)) {
+        keepLooping = false;
+        } else {
+        domain = newDomain;
+        }
     } while(keepLooping);

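A walkthrough of the loop for two hypothetical seed domains, assuming stripSubDomain() removes one leading subdomain component per call (its implementation is outside this changeset):

        // fullSeedDomain = "pinky.blogspot.com"; topSitesMap maps "blogspot.com" to "SUBDOMAIN-COPY"
        //   pass 1: no entry for "pinky.blogspot.com" -> strip to "blogspot.com"
        //   pass 2: entry found; isExactDomainMatch("pinky.blogspot.com", "blogspot.com") is false
        //           -> returns "SUBDOMAIN-COPY" (crawl just that subdomain)
        // fullSeedDomain = "www.blogspot.com"
        //   pass 1: no entry for "www.blogspot.com" -> strip to "blogspot.com"
        //   pass 2: entry found; isExactDomainMatch("www.blogspot.com", "blogspot.com") is true
        //           -> returns "" (don't crawl; the url goes into the unprocessed topsite-matches file)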
     
     this.setRecordCount(wetRecordCount);
     }
-
+
+
+    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
     public static void printUsage() {
     System.err.println("Run this program as:");
     
     System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

-    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");
+    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");
