Timestamp:
2019-10-10T23:49:58+13:00 (5 years ago)
Author:
ak19
Message:
  1. Incorporated Dr Bainbridge's suggested improvements: SUBDOMAIN-COPY should only be active when the seed URL's domain is a true subdomain of the matched topsite's domain; otherwise it should be deactivated on a topsites match. For example, if the seed URL's domain is pinky.blogspot.com, then SUBDOMAIN-COPY may crawl that site, since it is not all of blogspot.com. But if the seed URL's domain were blogspot.com itself, it would still match the topsite entry blogspot.com, whose value is SUBDOMAIN-COPY; in that case the value must be overridden so that the whole site is not crawled.
  2. More complete regex escaping for the regex-urlfilter.txt file.
  3. The domainsToURLsMap now contains the domain WITH the protocol prefix, which required adjustments in the rest of the code.
  4. Together with the changes to the blacklist, whitelist and topsites file (the sites-too-big-to-exhaustively-crawl.txt file), I think the code now deals with all the known wanted urls among the topsites and generates the correct output for the seedURLs and regex-urlfilter files.
File:
1 edited

Legend:

  (no prefix)  Unmodified
  +            Added
  -            Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33557 r33560  
     public final int MIN_NUM_WORDS;
     public final int MAX_WORDS_CAMELCASE;
+
+    // constants for the possible fixed values in the sites-too-big-to-exhaustively-crawl.txt file
+    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
+    public final String SINGLEPAGE = "SINGLEPAGE";
+
+    /**
+     * Characters that need escaping if used as a string literal in a regex
+     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
+     * https://www.regular-expressions.info/refcharacters.html
+     */
+    //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];
+    // put the \\ at the start so we don't end up escaping the backslashes added when escaping the other chars
+    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";

     private Properties configProperties = new Properties();
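
For orientation: the two constants above name values that can appear in the sites-too-big-to-exhaustively-crawl.txt file. A hypothetical excerpt follows; only the blogspot.com/SUBDOMAIN-COPY pairing is confirmed by the commit message, and the other entries and the exact file layout are illustrative assumptions:

        blogspot.com     SUBDOMAIN-COPY
        example.com      SINGLEPAGE
        example.org      example.org/some-allowed-path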
     
      * This retains any www. or subdomain prefix.
      */
-    private String getDomainForURL(String url) {
-    int startIndex = url.indexOf("//"); // http:// or https:// prefix
+    private String getDomainForURL(String url, boolean withProtocol) {
+    int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
     startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+    // keep the protocol portion of the URL around in case param withProtocol=true
+    String protocol = (startIndex == 0) ? "" : url.substring(0, startIndex);
+
     String domain = url.substring(startIndex);
     int endIndex = domain.indexOf("/");
     if(endIndex == -1) endIndex = domain.length();
     domain = domain.substring(0, endIndex);
+
+    if(withProtocol) {
+        // now that we have the domain (everything up to the first / when there is no protocol),
+        // we can glue the protocol back on
+        domain = protocol + domain;
+    }

     return domain;
+    }
+
+    /** Utility function to help escape regex characters in a URL headed for regex-urlfilter.txt */
+    private String escapeStringForRegex(String str) {
+    for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
+        char c = ESCAPE_CHARS_FOR_RE.charAt(i);
+        str = str.replace(Character.toString(c), "\\"+c);
+    }
+    return str;
     }

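A minimal usage sketch of the two helpers above, following the code as committed (the URLs are illustrative, not taken from the repository):

        // getDomainForURL("https://pinky.blogspot.com/some/page", true)  returns "https://pinky.blogspot.com"
        // getDomainForURL("https://pinky.blogspot.com/some/page", false) returns "pinky.blogspot.com"
        // escapeStringForRegex("nutch.apache.org") returns the text nutch\.apache\.org,
        // which is the form written into regex-urlfilter.txt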
     
     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                     File domainURLsFile, File topSiteMatchesFile) {
-    // Maintain Sets of unique domains and urls
+    // Maintain a Map of unique domains mapped to the seed urls at each domain
     // TreeSet: by default, "the elements are ordered using their natural ordering"
     // (or by a Comparator provided at set creation time).
     // Whereas HashSet doesn't guarantee ordering.
     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
-
-    //Set<String> domainsSet = new TreeSet<String>();
-    //Set<String> urlsSet = new TreeSet<String>();
+    // A similar distinction holds for Maps.
     domainsToURLsMap = new TreeMap<String, Set<String>>();
-
-    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
+
+    final String PROTOCOL_REGEX_PREFIX = "+^https?://";
+    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

     try (

         // read a URL at a time from urlsFile
         String url = null;
-        String domain = null;
+        String domainWithProtocol = null;
         while((url = reader.readLine()) != null) { // readLine removes the newline separator

-        // work out the domain. This retains any www. or subdomain prefix
-        domain = getDomainForURL(url);
-
-        //urlsSet.add(url);
-        //domainsSet.add(domain);
+        // work out the domain. This retains any www. or subdomain prefix;
+        // passing true also retains the http(s) protocol
+        domainWithProtocol = getDomainForURL(url, true);
+
         Set<String> urlsSet;
-        if(!domainsToURLsMap.containsKey(domain)) {
+        if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
             urlsSet = new TreeSet<String>();
             urlsSet.add(url);
-            domainsToURLsMap.put(domain, urlsSet);
+            domainsToURLsMap.put(domainWithProtocol, urlsSet);
         } else {
-            urlsSet = domainsToURLsMap.get(domain);
+            urlsSet = domainsToURLsMap.get(domainWithProtocol);
             urlsSet.add(url);
         }
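After this read loop, domainsToURLsMap groups each seed URL under its protocol-prefixed domain. A sketch with hypothetical seed URLs:

        // "https://pinky.blogspot.com" -> { "https://pinky.blogspot.com/2019/01/post.html" }
        // "https://www.example.org"    -> { "https://www.example.org/page1", "https://www.example.org/page2" }
        // TreeMap/TreeSet keep the domains, and each domain's urls, in sorted order.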
     

         while(domainIterator.hasNext()) {
-        String domain = domainIterator.next();
+        String domainWithProtocol = domainIterator.next();
+        int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
+        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+        String domain = domainWithProtocol.substring(startIndex);
+
+        System.err.println("domain with protocol: " + domainWithProtocol);
+        System.err.println("domain: " + domain);

         String allowedURLPatternRegex = isURLinTopSitesMap(domain);

         // domain and seedURLs to the topSiteMatchesFile.
         if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
+
             // topsite, but we don't (yet) know what portion can be crawled
             // Append the top site and url to a global/toplevel file that

             // won't go into any other file hereafter

-            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
             Iterator<String> urlIterator = urlsForDomainSet.iterator();
             while(urlIterator.hasNext()) {

         // write out the domain
-        //seedURLsWriter.write(domain + "\n");
+        //seedURLsWriter.write(domainWithProtocol + "\n");


             // write all sorted unique domains into the global domains file
+            // Using the domain without the protocol, since the global domains file is for
+            // informational purposes
             domainURLsWriter.write(domain + "\n");

             // files (and write the regexed domain into each sites/0000#/regex-urlfilter.txt)
             // If we ever run nutch on a single seedURLs listing containing
             // all seed pages to crawl sites from, the above two files will work for that.

             if(allowedURLPatternRegex == null) { // the entire site can be crawled
-            siteURLsWriter.write(domain + "\n");
+            siteURLsWriter.write(domainWithProtocol + "\n");

             // Write out a filter in the following form for a site, e.g. for nutch.apache.org:
             // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-            String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
+            String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
+            //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
             urlFilterWriter.write(regexed_domain + "\n"); // global file
             siteRegexWriter.write(regexed_domain + "\n"); // site file
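For the entire-site branch just above, the emitted filter follows the pattern shown in the code comment. A worked example with a hypothetical domain:

        // domain "pinky.blogspot.com" produces the filter line:
        //   +^https?://([a-z0-9-]+\.)*pinky\.blogspot\.com/
        // which admits pinky.blogspot.com and any of its subdomains.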
     
         else { // domain belongs to a top site where only a portion of the site can be crawled

-            if(allowedURLPatternRegex.equals("COPY")) { // COPY the existing domain as the url-filter
-                siteURLsWriter.write(domain + "\n");
+            if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY the existing domain as the url-filter
+                siteURLsWriter.write(domainWithProtocol + "\n");
                 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                 // and not for all of blogspot.com

-                String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/";
-                urlFilterWriter.write(regexed_domain + "\n");
-                siteRegexWriter.write(regexed_domain + "\n");
-
-            } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) {
+                String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
+                //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
+                urlFilterWriter.write(regexed_domain + "\n");
+                siteRegexWriter.write(regexed_domain + "\n");
+
+            } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                 // don't write out the domain. We want individual pages
-                //DON'T DO: siteURLsWriter.write(domain + "\n");
+                //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

-                // don't write out the domain as a regex expression url filter
+                // don't write out the domain as a regex expression url filter either,
                 // write out the individual seed urls for the domain instead
                 // since we will only be downloading the single page

-                Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
-                for(String urlInDomain : urlsForDomainSet) {
-                String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
+                Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
+                for(String urlInDomain : urlsForDomainSet) {
+                // don't append a slash to the end this time
+                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
+                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                 urlFilterWriter.write(regexed_url + "\n");
                 siteRegexWriter.write(regexed_url + "\n");
                 }
             } else { // allowedURLPatternRegex is in url-form - convert it to a regex
-                String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\.");
-                siteURLsWriter.write(domain + "\n");
+                if(!allowedURLPatternRegex.endsWith("/")) {
+                allowedURLPatternRegex += "/";
+                }
+                String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
+                //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
+                siteURLsWriter.write(domainWithProtocol + "\n");
                 urlFilterWriter.write(regexed_pattern + "\n");
                 siteRegexWriter.write(regexed_pattern + "\n");
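A sketch of the filter line each topsite branch above would emit, on hypothetical inputs:

        // SUBDOMAIN-COPY with seed domain pinky.blogspot.com:
        //   +^https?://pinky\.blogspot\.com/             (no subdomain wildcard this time)
        // SINGLEPAGE with seed url https://example.com/page.html:
        //   +^https://example\.com/page\.html            (the seed url's own protocol; no trailing slash appended)
        // an allowed-url-pattern value of example.org/some-allowed-path:
        //   +^https?://example\.org/some-allowed-path/   (a trailing slash is appended if missing)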
     

             // next, write out the urls for the domain into the sites/0000x/seedURLs.txt file
-            // also write into the global seeds file
-            // (with a tab prefixed to each url?)
-            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+            // also write them into the global seeds file
+            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
             Iterator<String> urlIterator = urlsForDomainSet.iterator();
             while(urlIterator.hasNext()) {
             String url = urlIterator.next();
-            seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file
-            siteURLsWriter.write("\t" + url + "\n");
+            seedURLsWriter.write(url + "\n"); // global seedURLs file
+            siteURLsWriter.write(url + "\n");
             }
         } catch (IOException ioe) {
             ioe.printStackTrace();
-            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
+            System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
         }

     
     } catch (IOException ioe) {
         ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
-    }
-
-    // write out domains as regular expressions into "regex-urlfilter.txt" file
-    try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
-        Set<String> domainsSet = domainsToURLsMap.keySet();
-        Iterator<String> i = domainsSet.iterator();
-        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-        while(i.hasNext()) {
-        String domain = i.next();
-        domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
-        urlFilterWriter.write(domain + "\n");
-        }
-
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
-    }
+        System.err.println("\n@@@@@@@@@ Error writing to one of: ");
+        System.err.println("\t" + seedURLsFile);
+        System.err.println("\t" + urlFilterFile);
+        System.err.println("\t" + domainURLsFile);
+        System.err.println("\t" + topSiteMatchesFile);
+    }

     /*

     return url;
     }
-
+
+
+    /**
+     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
+     * with or without the www. prefix. This method tests for such a case, as it would be dangerous
+     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
+     */
+    private boolean isExactDomainMatch(String seedURLDomain, String domain) {
+    // check for an exact match as-is
+    if(seedURLDomain.equals(domain)) {
+        return true;
+    }
+
+    // else check whether, with or without a www. prefix, we have an exact match with domain
+    if(seedURLDomain.startsWith("www.")) {
+        if(seedURLDomain.substring(4).equals(domain)) {
+        return true;
+        }
+    } else {
+        if(domain.equals("www."+seedURLDomain)) {
+        return true;
+        }
+    }
+
+    return false;
+    }
+
+
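The new method's behaviour on hypothetical inputs:

        // isExactDomainMatch("blogspot.com", "blogspot.com")        returns true
        // isExactDomainMatch("www.blogspot.com", "blogspot.com")    returns true  (www. prefix stripped)
        // isExactDomainMatch("blogspot.com", "www.blogspot.com")    returns true  (www. prefix supplied)
        // isExactDomainMatch("pinky.blogspot.com", "blogspot.com")  returns false (a genuine subdomain)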
     /**
-     * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,
-     * is in the list of top sites.
+     * Check if the domain of the seedurl, either in its entirety or when stripped of
+     * www/subdomains, is in the list of top sites.
      * If it is, and the given url matches the regex for that topsite, then add the url to the
      * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
-
-     */
-    private String isURLinTopSitesMap(String domain) {
+     * @param fullSeedDomain: domain of the seedURL without the protocol. May include the www. prefix.
+     * @return one of the following values:
+     *  - This function returns null if the seedURL's domain does not match any of the topsites.
+     *  - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
+     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
+     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
+     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
+     *  - A non-empty String is returned if the seedURL's domain matched a topsite and a value
+     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
+     * URL pattern.)
+     */
+    private String isURLinTopSitesMap(String fullSeedDomain) {
     boolean keepLooping = true;

+    String domain = fullSeedDomain;
+
     // the domain parameter will have retained www or subdomains, but is stripped of the protocol


     // If no match at all, return null.
     do {
-        if(domain.contains("pinterest.com")) {
-        System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");
-        }

         String allowed_url_pattern = topSitesMap.get(domain);
         if(allowed_url_pattern != null) { // i.e. topSitesMap.containsKey(domain)
         // there's an entry for the URL in the topSitesMap
-        System.err.println("##### A top site matches URL domain " + domain);
-        return allowed_url_pattern;
+        System.err.println("##### A top site matches URL domain " + domain);
+
+        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
+        // the www prefix, should not exactly match the topSitesMap domain
+        // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
+        // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
+
+        if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
+            return ""; // means don't crawl the site; write the url into the unprocessed-topsite-matches file
+        }
+        return allowed_url_pattern;
         }
         // else, no entry for the URL in the topSitesMap
-        // Not done: strip subDomain from URL and check it against topSitesMap
+        // We're not done yet: strip the subDomain from the URL and check it against topSitesMap again

-        String newURL = stripSubDomain(domain);
-        if(domain.equals(newURL)) keepLooping = false;
-        else domain = newURL;
+        String newDomain = stripSubDomain(domain);
+        if(domain.equals(newDomain)) {
+        keepLooping = false;
+        } else {
+        domain = newDomain;
+        }
     } while(keepLooping);

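A walkthrough of the loop for two hypothetical seed domains, assuming stripSubDomain() removes one leading subdomain component per call (its implementation is outside this changeset):

        // fullSeedDomain = "pinky.blogspot.com"; topSitesMap maps "blogspot.com" to "SUBDOMAIN-COPY"
        //   pass 1: no entry for "pinky.blogspot.com" -> strip to "blogspot.com"
        //   pass 2: entry found; isExactDomainMatch("pinky.blogspot.com", "blogspot.com") is false
        //           -> returns "SUBDOMAIN-COPY" (crawl just that subdomain)
        // fullSeedDomain = "www.blogspot.com"
        //   pass 1: no entry for "www.blogspot.com" -> strip to "blogspot.com"
        //   pass 2: entry found; isExactDomainMatch("www.blogspot.com", "blogspot.com") is true
        //           -> returns "" (don't crawl; the url goes into the unprocessed topsite-matches file)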
     
     this.setRecordCount(wetRecordCount);
     }
-
+
+
+    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
     public static void printUsage() {
     System.err.println("Run this program as:");
     
     System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

-    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");
+    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");
