Ignore:
Timestamp:
2019-10-11T20:49:05+13:00 (5 years ago)
Author:
ak19
Message:
  1. sites-too-big-to-exhaustively-crawl.txt is now a comma-separated list. 2. After the discussion with Dr Bainbridge that SINGLEPAGE is not what we want for docs.google.com, I found that the tentative switch to SUBDOMAIN-COPY for docs.google.com will not work, precisely because of the important change we had to make yesterday: if SUBDOMAIN-COPY, then only copy SUBdomains, and not root domains. If a root domain is marked SUBDOMAIN-COPY, then the seedURL gets written out to unprocessed-topsite-matches.txt and its site doesn't get crawled. 3. This revealed a lacuna in the list of possible values for sites-too-big-to-exhaustively-crawl.txt, and I had to invent a new value, which I introduce and have tested with this commit: FOLLOW_LINKS_WITHIN_TOPSITE. This value so far applies only to docs.google.com and will keep following any links originating in a seedURL on docs.google.com, but only as long as they stay within that topsite domain (docs.google.com). 4. Tidied some old-fashioned use of Iterator, replaced with the newer style of for loops that work with Types. Committing before updating the code to use the Apache CSV API.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33560 r33561  
    6969    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
    7070    public final String SINGLEPAGE = "SINGLEPAGE";
     71    public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
    7172   
    7273    /**
     
    7475     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
    7576     * https://www.regular-expressions.info/refcharacters.html
    76     */
    77     //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];
    78     // put the \\ at start so we don't the escape character for chars escaped earlier
     77     * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
     78     * as would happen for any chars appearing earlier in this list than \\
     79    */   
    7980    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
     81    //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
    8082   
    8183    private Properties configProperties = new Properties();
     
    212214        }
    213215
    214         int tabindex = str.indexOf("\t");
    215         if(tabindex == -1) {
     216        // comma separated list of values
     217        int splitindex = str.indexOf(",");
     218        if(splitindex == -1) {
    216219            topSitesMap.put(str, "");
    217220        } else {
    218             String topsite = str.substring(0, tabindex).trim();
    219             String allowed_url_pattern = str.substring(tabindex+1).trim();
     221            String topsite = str.substring(0, splitindex).trim();
     222            String allowed_url_pattern = str.substring(splitindex+1).trim();
    220223            topSitesMap.put(topsite, allowed_url_pattern);
    221224        }
     
    352355        while(domainIterator.hasNext()) {
    353356        String domainWithProtocol = domainIterator.next();
     357        // Also get domain without protocol prefix
    354358        int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
    355359        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
    356360        String domain = domainWithProtocol.substring(startIndex);
    357        
    358         System.err.println("domain with protocol: " + domainWithProtocol);
    359         System.err.println("domain: " + domain);
     361
     362        /*if(domain.contains("docs.google.com")) {
     363            System.err.println("domain with protocol: " + domainWithProtocol);
     364            System.err.println("domain: " + domain);
     365            }*/
    360366       
    361367        String allowedURLPatternRegex = isURLinTopSitesMap(domain);     
     
    372378
    373379            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
    374             Iterator<String> urlIterator = urlsForDomainSet.iterator();
    375             while(urlIterator.hasNext()) {
    376             String url = urlIterator.next();
     380            for(String url : urlsForDomainSet) {
    377381            topSiteMatchesWriter.write("\t" + url + "\n");
    378382            }
    379             
     383           
    380384            continue; // done with this domain
    381385        }
     
    451455                siteRegexWriter.write(regexed_url + "\n");
    452456                }
     457            } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
     458               
     459                // DON'T write out domain into siteURLs file,
     460                // BUT DO write it into urlFilter file
     461                String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
     462
     463                urlFilterWriter.write(regexed_domain + "\n");
     464                siteRegexWriter.write(regexed_domain + "\n");
    453465            } else { // allowedURLPatternRegex is a url-form - convert to regex
    454466                if(!allowedURLPatternRegex.endsWith("/")) {
     
    467479            // also write into the global seeds file (with a tab prefixed to each?)
    468480            Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
    469             Iterator<String> urlIterator = urlsForDomainSet.iterator();
    470             while(urlIterator.hasNext()) {
    471             String url = urlIterator.next();
     481            for(String url : urlsForDomainSet) {
    472482            seedURLsWriter.write(url + "\n"); // global seedURLs file
    473483            siteURLsWriter.write(url + "\n");
    474484            }
     485           
    475486        } catch (IOException ioe) {
    476487            ioe.printStackTrace();
Note: See TracChangeset for help on using the changeset viewer.