Changeset 33557

Timestamp:
09.10.2019 23:10:06 (8 days ago)
Author:
ak19
Message:

Implemented the topSitesMap of topsite domain to url pattern in the only way I could make sense of it. Instead of the regex values Dr Bainbridge suggested, which I couldn't see how to make work in this case, I invented a set of special values.

Files:
1 modified

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33552 r33557  
    9292    private HashMap<String, Integer> whiteList; 
    9393 
     94    /** Map of topsites with allowable regexes: sites too big to exhaustively crawl, 
     95     * each with an optional regex defining allowed exceptions, like subdomains or url suffixes 
     96     * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org 
     97     * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot 
     98     * would be relevant. 
     99     * The map stores the top site domain suffix and an optional regex string for allowable 
     100     * url patterns. 
     101    */ 
     102    private HashMap<String, String> topSitesMap; 
     103     
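As a concrete illustration of what this map ends up holding, a minimal sketch follows; the entries are hypothetical examples in the spirit of the javadoc above, not the actual contents of the configuration file:

        // Illustrative entries only (assumed, not taken from the real config file):
        topSitesMap.put("wikipedia.org", "mi.wikipedia.org");  // topsite, but this url pattern is allowed
        topSitesMap.put("pinterest.com", "");                   // topsite with no allowed pattern specified
        // A lookup then answers "is this a topsite, and if so what may be crawled?":
        String pattern = topSitesMap.get("wikipedia.org");      // "mi.wikipedia.org"
        String none = topSitesMap.get("greenstone.org");        // null: not a topsite, crawl freely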
    94104    /** Map of domains we keep and the full urls we're keeping that are of that domain.  
    95105     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys, 
     
    173183    initURLFilterList(whiteList, "url-whitelist-filter.txt"); 
    174184 
     185    // Create the map of topSites 
     186    System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite."); 
     187    topSitesMap = new HashMap<String, String>(); 
     188    //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt"); 
     189 
     190    try ( 
     191         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8")); 
     192         ) { 
     193 
     194        String str = null; 
     195        while((str = reader.readLine()) != null) { 
     196        str = str.trim(); 
     197        if(str.equals("") || str.startsWith("#")) { 
     198            continue; 
     199        } 
     200 
     201        int tabindex = str.indexOf("\t"); 
     202        if(tabindex == -1) { 
     203            topSitesMap.put(str, ""); 
     204        } else { 
     205            String topsite = str.substring(0, tabindex).trim(); 
     206            String allowed_url_pattern = str.substring(tabindex+1).trim(); 
     207            topSitesMap.put(topsite, allowed_url_pattern); 
     208        } 
     209        } 
     210    } catch (IOException ioe) { 
     211        ioe.printStackTrace(); 
     212        System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt"); 
     213    } 
     214     
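To make the expected file format explicit, here is a small standalone sketch that parses a few hypothetical lines the same way as the loop above: a tab separates a topsite domain from its optional allowed-url value, and '#' lines and blank lines are skipped. The example lines are assumptions for illustration, not the real contents of conf/sites-too-big-to-exhaustively-crawl.txt.

    import java.util.HashMap;

    public class TopSitesFormatSketch {
        public static void main(String[] args) {
            String[] lines = {
                "# hypothetical entries for illustration only",
                "pinterest.com",                       // no tab: topsite with no allowed pattern -> ""
                "wikipedia.org\tmi.wikipedia.org",     // tab: topsite with an allowed url pattern
                "blogspot.com\tCOPY"                   // tab: topsite with a special value
            };
            HashMap<String, String> topSitesMap = new HashMap<String, String>();
            for (String str : lines) {
                str = str.trim();
                if (str.equals("") || str.startsWith("#")) {
                    continue;
                }
                int tabindex = str.indexOf("\t");
                if (tabindex == -1) {
                    topSitesMap.put(str, "");
                } else {
                    topSitesMap.put(str.substring(0, tabindex).trim(), str.substring(tabindex + 1).trim());
                }
            }
            // Prints the three parsed entries (map order is not guaranteed), e.g.
            // {blogspot.com=COPY, wikipedia.org=mi.wikipedia.org, pinterest.com=}
            System.out.println(topSitesMap);
        }
    }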
    175215    //System.err.println("Prematurely terminating for testing purposes."); 
    176216    //System.exit(-1); 
     217    } 
     218 
     219    /** Work out the 'domain' for a given url. 
     220     * This retains any www. or subdomain prefix. 
     221     */ 
     222    private String getDomainForURL(String url) { 
     223    int startIndex = url.indexOf("//"); // http:// or https:// prefix 
     224    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     225    String domain = url.substring(startIndex); 
     226    int endIndex = domain.indexOf("/"); 
     227    if(endIndex == -1) endIndex = domain.length(); 
     228    domain = domain.substring(0, endIndex); 
     229     
     230    return domain; 
    177231    } 
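A few input/output pairs for this helper, sketched as hypothetical assertions (the method is private, so these would need to live inside CCWETProcessor or a test with access to it):

        // Illustrative expectations, following the logic above:
        assert getDomainForURL("https://mi.wikipedia.org/wiki/Tere").equals("mi.wikipedia.org");
        assert getDomainForURL("http://www.example.com").equals("www.example.com"); // www. prefix retained
        assert getDomainForURL("example.com/page.html").equals("example.com");      // works without a protocol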
    178232     
     
    182236     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 
    183237     */ 
    184     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) { 
     238    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 
     239                    File domainURLsFile, File topSiteMatchesFile) { 
    185240    // Maintain Sets of unique domains and urls 
    186241    // TreeSet: by default, "the elements are ordered using their natural ordering" 
     
    204259        while((url = reader.readLine()) != null) { // readLine removes newline separator 
    205260         
    206         // work out domain. This retains any www. or subdomain prefix: 
    207         int startIndex = url.indexOf("//"); // http:// or https:// prefix 
    208         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
    209         domain = url.substring(startIndex); 
    210         int endIndex = domain.indexOf("/"); 
    211         if(endIndex == -1) endIndex = domain.length(); 
    212         domain = domain.substring(0, endIndex); 
     261        // work out domain. This retains any www. or subdomain prefix        
     262        domain = getDomainForURL(url); 
    213263 
    214264        //urlsSet.add(url); 
     
    232282    // We'd have pruned out duplicates by now and have a sorted list of domains, 
    233283    // each of which maps to seed URLs in the commoncrawl for that domain 
    234      
    235     /* 
    236     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 
    237         Iterator<String> i = urlsSet.iterator(); 
    238         while(i.hasNext()) { 
    239         String url = i.next(); 
    240         seedURLsWriter.write(url + "\n"); 
    241         } 
    242          
    243     } catch (IOException ioe) { 
    244         ioe.printStackTrace(); 
    245         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile); 
    246     } 
    247     */ 
    248      
     284 
    249285    int domainCount = 0; 
    250286    File sitesFolder = new File(outputFolder, "sites"); 
     
    258294    try ( 
    259295         // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls 
     296         // Also a global file listing any urls that matched top sites that didn't specify 
     297         // allowed regex patterns 
    260298         BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));  
    261299         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 
    262          BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)) 
     300         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)); 
     301         BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)) 
    263302         ) { 
     303 
     304        // initialise topSiteMatchesFile with some instructional text. 
      305        topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n"); 
      306        topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n"); 
      307        topSiteMatchesWriter.write("Specify one for each such domain in the tab-separated sites-too-big-to-exhaustively-crawl.txt file\n"); 
     308         
    264309        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet(); 
    265310        Set<String> domainsSet = domainsToURLsMap.keySet(); 
    266311        Iterator<String> domainIterator = domainsSet.iterator(); 
     312 
     313        /* 
     314        // DEBUG 
     315    String value = topSitesMap.get("wikipedia.org"); 
     316    if(value == null) { 
     317        System.err.println("### wikipedia.org had null value"); 
     318    } else { 
     319        System.err.println("### wikipedia.org had value: " + value); 
     320    } // DEBUG 
     321        */ 
    267322         
    268323        while(domainIterator.hasNext()) { 
    269         domainCount++; 
     324        String domain = domainIterator.next(); 
     325         
     326        String allowedURLPatternRegex = isURLinTopSitesMap(domain);      
     327        // If the domain is of a topsite for which no allowed URL pattern has been provided 
     328        // in sites-too-big-to-exhaustively-crawl.txt, 
     329        // then we don't know how to crawl the site. Warn the user by writing the affected 
     330        // domain and seedURLs to the topSiteMatchesFile.        
     331        if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) { 
     332            // topsite, but we don't (yet) know what portion can be crawled 
     333            // Append the top site and url to a global/toplevel file that 
     334            // the user needs to check later and we're done with this domain as it 
     335            // won't go into any other file hereafter 
     336 
     337            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
     338            Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
     339            while(urlIterator.hasNext()) { 
     340            String url = urlIterator.next(); 
     341            topSiteMatchesWriter.write("\t" + url + "\n");  
     342            } 
     343             
     344            continue; // done with this domain 
     345        } 
     346 
     347        // start counting the domains we're actually going to process 
     348        domainCount++;       
     349         
    270350        String siteID = String.format(FORMATSTR, domainCount); 
    271351        File domainFolder = new File(sitesFolder, siteID); 
    272352        domainFolder.mkdir(); 
    273353         
    274         // write out the domain 
    275         String domain = domainIterator.next(); 
     354        // write out the domain      
    276355        //seedURLsWriter.write(domain + "\n"); 
    277         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
    278         String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 
    279         urlFilterWriter.write(regexed_domain + "\n"); 
     356         
    280357 
    281358        // for every domain, we need a sites/0000x/ folder, where x is domain#, containing 
     
    297374            // If we ever run nutch on a single seedURLs listing containing 
    298375            // all seed pages to crawl sites from, the above two files will work for that. 
    299             siteURLsWriter.write(domain + "\n");         
    300             siteRegexWriter.write(regexed_domain + "\n"); 
    301376             
    302             // next write out the urls for the domain with a tab prefixed to each 
    303             // into the sites/0000x/seedURLs.txt file - also write into the global seeds file 
     377             
     378            if(allowedURLPatternRegex == null) { // entire site can be crawled 
     379            siteURLsWriter.write(domain + "\n"); 
     380             
     381            // Write out filter in the following form for a site, e.g. for nutch.apache.org: 
     382            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
     383            String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";      
     384            urlFilterWriter.write(regexed_domain + "\n"); //global file 
     385            siteRegexWriter.write(regexed_domain + "\n"); // site file 
     386            } 
     387            else { // domain belongs to a top site where only portion of site can be crawled 
     388             
     389            if(allowedURLPatternRegex.equals("COPY")) { // COPY existing domain as url-filter 
     390                siteURLsWriter.write(domain + "\n"); 
     391                // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com 
     392                // and not for all of blogspot.com 
     393                 
     394               String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/"; 
     395               urlFilterWriter.write(regexed_domain + "\n"); 
     396               siteRegexWriter.write(regexed_domain + "\n"); 
     397                
     398            } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) { 
     399                // don't write out domain. We want individual pages 
     400                //DON'T DO: siteURLsWriter.write(domain + "\n"); 
     401                 
     402                // don't write out domain as a regex expression url filter 
     403                // write out the individual seed urls for the domain instead 
     404                // since we will only be downloading the single page 
     405                 
     406                Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
     407                for(String urlInDomain : urlsForDomainSet) {                 
     408                String regexed_url = "+^"+urlInDomain.replace(".", "\\."); 
     409                urlFilterWriter.write(regexed_url + "\n"); 
     410                siteRegexWriter.write(regexed_url + "\n"); 
     411                } 
     412            } else { // allowedURLPatternRegex is a url-form - convert to regex 
     413                String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\."); 
     414                siteURLsWriter.write(domain + "\n"); 
     415                urlFilterWriter.write(regexed_pattern + "\n"); 
     416                siteRegexWriter.write(regexed_pattern + "\n"); 
     417 
     418            } 
     419            } 
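To summarise the branches above, the sketch below prints the url-filter line that each kind of allowedURLPatternRegex value produces. The domains are hypothetical, and FILTER_REGEX_PREFIX is assumed to be the "+^https?://([a-z0-9-]+\.)*" prefix implied by the nutch.apache.org comment; the empty-string case is handled earlier, where the domain's seedURLs are diverted to unprocessed-topsite-matches.txt instead.

    public class UrlFilterSketch {
        // Assumed value, reconstructed from the "nutch.apache.org => ..." comment above.
        static final String FILTER_REGEX_PREFIX = "+^https?://([a-z0-9-]+\\.)*";

        public static void main(String[] args) {
            // allowedURLPatternRegex == null: whole domain crawlable
            System.out.println(FILTER_REGEX_PREFIX + "nutch.apache.org".replace(".", "\\.") + "/");
            // -> +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/

            // "COPY": filter only this exact (sub)domain, not the whole topsite
            System.out.println("+https?://" + "pinky.blogspot.com".replace(".", "\\.") + "/");
            // -> +https?://pinky\.blogspot\.com/

            // "SINGLEPAGE": one filter per individual seed url, no domain-wide filter
            System.out.println("+^" + "https://docs.google.com/doc1".replace(".", "\\."));
            // -> +^https://docs\.google\.com/doc1

            // any other value: treated as a url-form pattern and anchored
            System.out.println("+^https?://" + "mi.wikipedia.org".replace(".", "\\."));
            // -> +^https?://mi\.wikipedia\.org
        }
    }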
     420             
     421            // next write out the urls for the domain into the sites/0000x/seedURLs.txt file 
     422            // also write into the global seeds file 
     423            // (with a tab prefixed to each url?) 
    304424            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
    305425            Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
    306426            while(urlIterator.hasNext()) { 
    307427            String url = urlIterator.next(); 
    308             seedURLsWriter.write(url + "\n"); // global seedURLs file 
    309             siteURLsWriter.write("\t" + url + "\n");  
     428            seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file 
     429            siteURLsWriter.write("\t" + url + "\n"); 
    310430            } 
    311431        } catch (IOException ioe) { 
     
    313433            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile); 
    314434        } 
    315         } 
    316          
     435         
     436        } 
     437 
    317438    } catch (IOException ioe) { 
    318439        ioe.printStackTrace(); 
     
    335456        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 
    336457    } 
     458 
     459    /* 
     460    // BEGIN DEBUG 
     461    System.err.println("@@@@ TopSitesMap contains: "); 
     462    for(Map.Entry<String, String> entry : topSitesMap.entrySet()) { 
     463        String topSite = entry.getKey(); 
     464        String urlPattern = entry.getValue();        
     465        System.err.println(topSite + " - " + urlPattern); 
     466    } // END DEBUG 
     467    */ 
     468    } 
     469 
     470    private String stripSubDomain(String url) { 
     471    int index = url.indexOf("."); 
     472    if(index != -1) { 
     473        url = url.substring(index+1); 
     474    } 
     475    return url; 
     476    } 
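Sketched behaviour of this helper, as hypothetical assertions:

        assert stripSubDomain("mi.wikipedia.org").equals("wikipedia.org"); // drops one leading label
        assert stripSubDomain("wikipedia.org").equals("org");
        assert stripSubDomain("org").equals("org");                        // unchanged once no '.' is left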
     477     
     478    /** 
     479     * Check if the domain of the url, either in its entirety or when stripped of www/subdomains, 
     480     * is in the list of top sites. 
      481     * If it is, return the allowed URL pattern value stored for that topsite in topSitesMap 
      482     * (the empty string if the topsite was listed without a pattern). 
      483     * Return null if neither the domain nor any of its parent domains is a topsite. 
     484     */ 
     485    private String isURLinTopSitesMap(String domain) { 
     486    boolean keepLooping = true; 
     487 
      488    // domain parameter will have retained www or subdomains, but is stripped of protocol 
     489     
     490    // keep looping, stripping subdomains from url and checking if it matches a topsite domain 
     491    // if it does, return the value for that topsite domain in the topSitesMap 
     492    // If no match at all, return null. 
     493    do { 
     494        if(domain.contains("pinterest.com")) { 
     495        System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map"); 
     496        } 
     497         
     498        String allowed_url_pattern = topSitesMap.get(domain); 
     499        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain); 
     500        // there's an entry for the URL in the topSitesMap 
     501        System.err.println("##### A top site matches URL domain " + domain);         
     502        return allowed_url_pattern;      
     503        } 
     504        // else, no entry for the URL in the topSitesMap 
      505        // Not done yet: strip the subdomain from the URL and check that against topSitesMap 
     506         
     507        String newURL = stripSubDomain(domain); 
     508        if(domain.equals(newURL)) keepLooping = false; 
     509        else domain = newURL; 
     510    } while(keepLooping); 
     511 
     512    // url in entirety or stripped of subdomains did not match any of the topsites 
     513    return null; 
    337514    } 
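To make the loop concrete, the standalone sketch below walks a domain up through its parent domains in the same way, against a hypothetical map containing only wikipedia.org (an assumption for illustration):

    import java.util.HashMap;

    public class TopSiteLookupSketch {
        public static void main(String[] args) {
            HashMap<String, String> topSitesMap = new HashMap<String, String>();
            topSitesMap.put("wikipedia.org", "mi.wikipedia.org"); // illustrative entry only

            System.out.println(lookup(topSitesMap, "mi.wikipedia.org")); // mi.wikipedia.org (matched after one strip)
            System.out.println(lookup(topSitesMap, "nutch.apache.org")); // null (no match at any level)
        }

        // Same walk as isURLinTopSitesMap: try the domain, then keep stripping subdomains.
        static String lookup(HashMap<String, String> topSitesMap, String domain) {
            while (true) {
                String allowed_url_pattern = topSitesMap.get(domain);
                if (allowed_url_pattern != null) {
                    return allowed_url_pattern;
                }
                int index = domain.indexOf(".");
                if (index == -1) {
                    return null; // stripped down to a single label with no match
                }
                domain = domain.substring(index + 1);
            }
        }
    }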
    338515 
    339516    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) { 
    340     Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet(); 
    341     Iterator<Map.Entry<String, Integer>> i = entries.iterator(); 
    342     while(i.hasNext()) { 
    343         Map.Entry<String, Integer> entry = i.next(); 
     517    //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet(); 
     518    //Iterator<Map.Entry<String, Integer>> i = entries.iterator(); 
     519    //while(i.hasNext()) { 
     520    // Map.Entry<String, Integer> entry = i.next(); 
     521    for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) { 
    344522        String urlPattern = entry.getKey(); 
    345523        Integer matchRule = entry.getValue(); 
     
    379557     */ 
    380558    public boolean isGreylisted(String url) { 
    381     // TODO: alexa top sites and auto-translated product sites 
     559    // auto-translated product sites 
    382560    return isListedInFilterList(greyList, url); 
    383561    } 
     
    579757    } 
    580758 
    581     // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls 
     759     
     760    // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls 
    582761    // The former is the only unique one. seedURLs and regex-urlfilters are 
    583762    // repeated on a per site/domain basis too, stored in the sites folder 
     
    585764    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 
    586765    File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 
    587     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile); 
     766    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 
     767     
     768    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile); 
    588769 
    589770    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
     771 
     772    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n"); 
     773     
    590774     
    591775    } catch(Exception e) {