Changeset 33560

Show
Ignore:
Timestamp:
10.10.2019 23:49:58 (7 days ago)
Author:
ak19
Message:

1. Incorporated Dr Bainbridge's suggested improvements: only when there is a subdomain to a seed URL's domain should SUBDOMAIN-COPY be active, otherwise it should be deactivated on topsites match. For example if seedURL's domain is pinky.blogspot.com, then SUBDOMAIN-COPY can crawl that site as it's not all of blogspot. But if the seedURL domain was blogspot.com it would still match the topsite blogspot.com for which SUBDOMAIN-COPY is the value, but the value should be overridden so as not to crawl the site. 2. More complete regex escaping for the regex-urlfilter.txt file. 3. domainToURLs map now contains the domain WITH protocol prefix, which required adjustments to be made in the rest of the code. 4. Together with the changes to the blacklist, whitelist and topsites file (sites-too-big-to-exhaustively crawl file), I think the code is dealing with all the known wanted urls among the topsites now and generating the correct output for the seedURLs and regex-urlfilter file.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33557 r33560  
    6565    public final int MIN_NUM_WORDS; 
    6666    public final int MAX_WORDS_CAMELCASE; 
     67 
     68    // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file 
     69    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY"; 
     70    public final String SINGLEPAGE = "SINGLEPAGE"; 
     71     
     72    /** 
     73     * Characters that need escaping if used as a string literal in a regex 
     74     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions 
     75     * https://www.regular-expressions.info/refcharacters.html 
     76    */ 
     77    //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"]; 
     78    // put the \\ at start so we don't re-escape the escape character added for chars escaped earlier 
     79    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|"; 
    6780     
    6881    private Properties configProperties = new Properties(); 
     
    220233     * This retains any www. or subdomain prefix. 
    221234     */ 
    222     private String getDomainForURL(String url) { 
    223     int startIndex = url.indexOf("//"); // http:// or https:// prefix 
     235    private String getDomainForURL(String url, boolean withProtocol) { 
     236    int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 
    224237    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
      238    // keep the protocol prefix around in case param withProtocol=true 
     239    String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); 
     240     
    225241    String domain = url.substring(startIndex); 
    226242    int endIndex = domain.indexOf("/"); 
    227243    if(endIndex == -1) endIndex = domain.length(); 
    228244    domain = domain.substring(0, endIndex); 
     245 
     246    if(withProtocol) { 
      247        // now that we have the domain (everything up to the first / after the protocol) 
     248        // can glue the protocol back on 
     249        domain = protocol + domain; 
     250    } 
    229251     
    230252    return domain; 
     253    } 
     254 
     255    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ 
     256    private String escapeStringForRegex(String str) {    
     257    for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) { 
     258        char c = ESCAPE_CHARS_FOR_RE.charAt(i); 
     259        str = str.replace(Character.toString(c), "\\"+c); 
     260    } 
     261    return str;  
    231262    } 
    232263     
     
    238269    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 
    239270                    File domainURLsFile, File topSiteMatchesFile) { 
    240     // Maintain Sets of unique domains and urls 
     271    // Maintain a Map of unique domains mapped to seed urls at that domain 
    241272    // TreeSet: by default, "the elements are ordered using their natural ordering" 
    242273    // (or by a Comparator provided at set creation time). 
    243274    // Whereas HashSet doesn't guarantee ordering. 
    244275    // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. 
    245  
    246     //Set<String> domainsSet = new TreeSet<String>(); 
    247     //Set<String> urlsSet = new TreeSet<String>(); 
     276    // Would be a similar distinction for Maps. 
    248277    domainsToURLsMap = new TreeMap<String, Set<String>>(); 
    249      
    250     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 
     278 
     279    final String PROTOCOL_REGEX_PREFIX = "+^https?://"; 
     280    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 
    251281     
    252282    try ( 
     
    256286        // read a URL at a time from urlsFile 
    257287        String url = null; 
    258         String domain = null; 
     288        String domainWithProtocol = null; 
    259289        while((url = reader.readLine()) != null) { // readLine removes newline separator 
    260290         
    261         // work out domain. This retains any www. or subdomain prefix        
    262         domain = getDomainForURL(url); 
    263  
    264         //urlsSet.add(url); 
    265         //domainsSet.add(domain); 
     291        // work out domain. This retains any www. or subdomain prefix 
     292        // passing true to further also retain the http(s) protocol 
     293        domainWithProtocol = getDomainForURL(url, true); 
     294 
    266295        Set<String> urlsSet; 
    267         if(!domainsToURLsMap.containsKey(domain)) { 
     296        if(!domainsToURLsMap.containsKey(domainWithProtocol)) { 
    268297            urlsSet = new TreeSet<String>(); 
    269298            urlsSet.add(url); 
    270             domainsToURLsMap.put(domain, urlsSet); 
     299            domainsToURLsMap.put(domainWithProtocol, urlsSet); 
    271300        } else { 
    272             urlsSet = domainsToURLsMap.get(domain); 
     301            urlsSet = domainsToURLsMap.get(domainWithProtocol); 
    273302            urlsSet.add(url); 
    274303        } 
     
    322351         
    323352        while(domainIterator.hasNext()) { 
    324         String domain = domainIterator.next(); 
     353        String domainWithProtocol = domainIterator.next(); 
     354        int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 
     355        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     356        String domain = domainWithProtocol.substring(startIndex); 
     357         
     358        System.err.println("domain with protocol: " + domainWithProtocol); 
     359        System.err.println("domain: " + domain); 
    325360         
    326361        String allowedURLPatternRegex = isURLinTopSitesMap(domain);      
     
    330365        // domain and seedURLs to the topSiteMatchesFile.        
    331366        if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) { 
     367             
    332368            // topsite, but we don't (yet) know what portion can be crawled 
    333369            // Append the top site and url to a global/toplevel file that 
     
    335371            // won't go into any other file hereafter 
    336372 
    337             Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
     373            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 
    338374            Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
    339375            while(urlIterator.hasNext()) { 
     
    353389         
    354390        // write out the domain      
    355         //seedURLsWriter.write(domain + "\n"); 
     391        //seedURLsWriter.write(domainWithProtocol + "\n"); 
    356392         
    357393 
     
    367403 
    368404            // write all sorted unique domains into global domains file 
      405            // Using the domain without protocol since the global domains file is for 
     406            // informational purposes 
    369407            domainURLsWriter.write(domain + "\n"); 
    370408 
     
    373411            // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 
    374412            // If we ever run nutch on a single seedURLs listing containing 
    375             // all seed pages to crawl sites from, the above two files will work for that. 
    376              
     413            // all seed pages to crawl sites from, the above two files will work for that.   
    377414             
    378415            if(allowedURLPatternRegex == null) { // entire site can be crawled 
    379             siteURLsWriter.write(domain + "\n"); 
     416            siteURLsWriter.write(domainWithProtocol + "\n"); 
    380417             
    381418            // Write out filter in the following form for a site, e.g. for nutch.apache.org: 
    382419            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
    383             String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";      
     420            String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";        
     421            //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";        
    384422            urlFilterWriter.write(regexed_domain + "\n"); //global file 
    385423            siteRegexWriter.write(regexed_domain + "\n"); // site file 
     
    387425            else { // domain belongs to a top site where only portion of site can be crawled 
    388426             
    389             if(allowedURLPatternRegex.equals("COPY")) { // COPY existing domain as url-filter 
    390                 siteURLsWriter.write(domain + "\n"); 
     427            if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter 
     428                siteURLsWriter.write(domainWithProtocol + "\n"); 
    391429                // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com 
    392430                // and not for all of blogspot.com 
    393431                 
    394                String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/"; 
    395                urlFilterWriter.write(regexed_domain + "\n"); 
    396                siteRegexWriter.write(regexed_domain + "\n"); 
    397                 
    398             } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) { 
     432                String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/"; 
     433                //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/"; 
     434                urlFilterWriter.write(regexed_domain + "\n"); 
     435                siteRegexWriter.write(regexed_domain + "\n"); 
     436                 
     437            } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) { 
    399438                // don't write out domain. We want individual pages 
    400                 //DON'T DO: siteURLsWriter.write(domain + "\n"); 
     439                //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n"); 
    401440                 
    402                 // don't write out domain as a regex expression url filter 
     441                // don't write out domain as a regex expression url filter either, 
    403442                // write out the individual seed urls for the domain instead 
    404443                // since we will only be downloading the single page 
    405444                 
    406                 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
    407                 for(String urlInDomain : urlsForDomainSet) {                 
    408                 String regexed_url = "+^"+urlInDomain.replace(".", "\\."); 
     445                Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 
     446                for(String urlInDomain : urlsForDomainSet) { 
     447                // don't append slash to end this time 
     448                String regexed_url = "+^"+escapeStringForRegex(urlInDomain); 
     449                //String regexed_url = "+^"+urlInDomain.replace(".", "\\."); 
    409450                urlFilterWriter.write(regexed_url + "\n"); 
    410451                siteRegexWriter.write(regexed_url + "\n"); 
    411452                } 
    412453            } else { // allowedURLPatternRegex is a url-form - convert to regex 
    413                 String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\."); 
    414                 siteURLsWriter.write(domain + "\n"); 
     454                if(!allowedURLPatternRegex.endsWith("/")) { 
     455                allowedURLPatternRegex += "/"; 
     456                } 
     457                String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex); 
     458                //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\."); 
     459                siteURLsWriter.write(domainWithProtocol + "\n"); 
    415460                urlFilterWriter.write(regexed_pattern + "\n"); 
    416461                siteRegexWriter.write(regexed_pattern + "\n"); 
     
    420465             
    421466            // next write out the urls for the domain into the sites/0000x/seedURLs.txt file 
    422             // also write into the global seeds file 
    423             // (with a tab prefixed to each url?) 
    424             Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
     467            // also write into the global seeds file (with a tab prefixed to each?) 
     468            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 
    425469            Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
    426470            while(urlIterator.hasNext()) { 
    427471            String url = urlIterator.next(); 
    428             seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file 
    429             siteURLsWriter.write("\t" + url + "\n"); 
     472            seedURLsWriter.write(url + "\n"); // global seedURLs file 
     473            siteURLsWriter.write(url + "\n"); 
    430474            } 
    431475        } catch (IOException ioe) { 
    432476            ioe.printStackTrace(); 
    433             System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile); 
     477            System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile); 
    434478        } 
    435479         
     
    438482    } catch (IOException ioe) { 
    439483        ioe.printStackTrace(); 
    440         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile); 
    441     } 
    442      
    443     // write out domains as regular expressions into "regex-urlfilter.txt" file 
    444     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) { 
    445         Set<String> domainsSet = domainsToURLsMap.keySet();      
    446         Iterator<String> i = domainsSet.iterator(); 
    447         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
    448         while(i.hasNext()) { 
    449         String domain = i.next(); 
    450         domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";         
    451         urlFilterWriter.write(domain + "\n"); 
    452         } 
    453          
    454     } catch (IOException ioe) { 
    455         ioe.printStackTrace(); 
    456         System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 
    457     } 
     484        System.err.println("\n@@@@@@@@@ Error writing to one of: "); 
     485        System.err.println("\t" + seedURLsFile); 
     486        System.err.println("\t" + urlFilterFile); 
     487        System.err.println("\t" + domainURLsFile);   
     488        System.err.println("\t" + topSiteMatchesFile);     
     489    }    
    458490 
    459491    /* 
     
    475507    return url; 
    476508    } 
    477      
     509 
     510 
     511    /**  
     512     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com, 
      513     * with or without www. prefix. This method tests for such a case as it would be dangerous 
     514     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain. 
     515     */ 
     516    private boolean isExactDomainMatch(String seedURLDomain, String domain) { 
     517    // check for an exact match as-is 
     518    if(seedURLDomain.equals(domain)) { 
     519        return true; 
     520    } 
     521 
     522    // else check if with or without a www. prefix we have an exact match with domain 
     523    if(seedURLDomain.startsWith("www.")) { 
     524        if(seedURLDomain.substring(4).equals(domain)) { 
     525        return true; 
     526        } 
     527    } else { 
     528        if(domain.equals("www."+seedURLDomain)) { 
     529        return true; 
     530        } 
     531    } 
     532 
     533    return false; 
     534     } 
     535     
     536 
    478537    /** 
    479      * Check if the domain of the url, either in its entirety or when stripped of www/subdomains, 
    480      * is in the list of top sites. 
     538     * Check if the domain of the seedurl, either in its entirety or when stripped of  
     539     * www/subdomains, is in the list of top sites. 
    481540     * If it is, and the given url matches the regex for that topsite, then add the url to the 
    482541     * whitelist and a regex disallowing the rest of the topsite to the url regex filter file. 
    483       
    484      */ 
    485     private String isURLinTopSitesMap(String domain) { 
     542     * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix. 
     543     * @return one of the following values: 
     544     *  - This function returns null if the seedURL's domain does not match any of the topsites. 
     545     *  - The empty String is returned if the seedURL's domain matched a topsite but no (allowed- 
     546     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's 
     547     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't 
     548     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY). 
      549     *  - A non-empty String is returned if the seedURL's domain matched a topsite and a value 
     550     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed 
      551     * URL pattern.) 
     552     */ 
     553    private String isURLinTopSitesMap(String fullSeedDomain) { 
    486554    boolean keepLooping = true; 
    487555 
     556    String domain = fullSeedDomain; 
     557     
     488558    // domain parameter will have retained www or subdomains, but is stripped of protocol 
    489559     
     
    492562    // If no match at all, return null. 
    493563    do { 
    494         if(domain.contains("pinterest.com")) { 
    495         System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map"); 
    496         } 
    497564         
    498565        String allowed_url_pattern = topSitesMap.get(domain); 
    499566        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain); 
    500567        // there's an entry for the URL in the topSitesMap 
    501         System.err.println("##### A top site matches URL domain " + domain);         
    502         return allowed_url_pattern;      
     568        System.err.println("##### A top site matches URL domain " + domain); 
     569 
     570        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without 
     571        // www prefix, should not exactly match the topSitesMap domain 
     572        // e.g. we don't want to crawl a seed URL with domain www.blogspot.com 
     573        // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY. 
     574         
     575        if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) { 
     576            return ""; // means don't crawl site, write url into unprocessed-topsite-matches file 
     577        } 
     578        return allowed_url_pattern; 
    503579        } 
    504580        // else, no entry for the URL in the topSitesMap 
    505         // Not done: strip subDomain from URL and check it against topSitesMap 
     581        // We're not done yet: strip subDomain from URL and check it against topSitesMap again 
    506582         
    507         String newURL = stripSubDomain(domain); 
    508         if(domain.equals(newURL)) keepLooping = false; 
    509         else domain = newURL; 
     583        String newDomain = stripSubDomain(domain); 
     584        if(domain.equals(newDomain)) { 
     585        keepLooping = false; 
     586        } else { 
     587        domain = newDomain; 
     588        } 
    510589    } while(keepLooping); 
    511590 
     
    674753    this.setRecordCount(wetRecordCount); 
    675754    } 
    676      
     755 
     756 
     757    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 
    677758    public static void printUsage() { 
    678759    System.err.println("Run this program as:"); 
     
    770851    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
    771852 
    772     System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n"); 
     853    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 
    773854     
    774855