Changeset 33557 for gs3-extensions


Timestamp:
2019-10-09T23:10:06+13:00
Author:
ak19
Message:

Implemented the topSitesMap, mapping each top-site domain to an allowed url pattern, in the only way I could make sense of it. Instead of the regex values Dr Bainbridge suggested, which I could not see how to apply in this case, I introduced a set of fixed values.

File:
1 edited

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33552 r33557  
    9292    private HashMap<String, Integer> whiteList;
    9393
     94    /** Map of top sites to allowable regexes: sites too big to crawl exhaustively,
     95     * each with an optional regex defining allowed exceptions, such as subdomains or url suffixes
     96     * off that top site. For example, wikipedia.org is a top site, but mi.wikipedia.org
     97     * is relevant. Or blogspot.com is a top site, but someone's Maori-language pages on blogspot
     98     * would be relevant.
     99     * The map stores the top site domain suffix and an optional regex string for allowable
     100     * url patterns.
     101    */
     102    private HashMap<String, String> topSitesMap;
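As a rough sketch (not part of the changeset; the specific domains and pairings are invented for illustration), the loaded map associates each top-site domain suffix with one of the value kinds consulted later in createSeedURLsFiles():

    // Hypothetical examples of what topSitesMap may hold after loading the config file:
    HashMap<String, String> example = new HashMap<String, String>();
    example.put("pinterest.com", "");                 // no pattern given: its urls get flagged for manual review
    example.put("wikipedia.org", "mi.wikipedia.org"); // url-form value: only this portion is crawled
    example.put("blogspot.com", "COPY");              // COPY: keep just the specific subdomain encountered
    example.put("docs.google.com", "SINGLEPAGE");     // SINGLEPAGE: crawl only the individual seed urls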
     103   
    94104    /** Map of domains we keep and the full urls we're keeping that are of that domain.
    95105     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
     
    173183    initURLFilterList(whiteList, "url-whitelist-filter.txt");
    174184
     185    // Create the map of topSites
     186    System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
     187    topSitesMap = new HashMap<String, String>();
     188    //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
     189
     190    try (
     191         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
     192         ) {
     193
     194        String str = null;
     195        while((str = reader.readLine()) != null) {
     196        str = str.trim();
     197        if(str.equals("") || str.startsWith("#")) {
     198            continue;
     199        }
     200
     201        int tabindex = str.indexOf("\t");
     202        if(tabindex == -1) {
     203            topSitesMap.put(str, "");
     204        } else {
     205            String topsite = str.substring(0, tabindex).trim();
     206            String allowed_url_pattern = str.substring(tabindex+1).trim();
     207            topSitesMap.put(topsite, allowed_url_pattern);
     208        }
     209        }
     210    } catch (IOException ioe) {
     211        ioe.printStackTrace();
     212        System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
     213    }
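For reference, the loop above expects each non-comment line of sites-too-big-to-exhaustively-crawl.txt to contain a top-site domain, optionally followed by a tab and an allowed url pattern or one of the set values (COPY, SINGLEPAGE). The entries below are hypothetical examples of the format, not the file's actual contents (<TAB> stands for a literal tab character):

    # topsite-domain[<TAB>allowed-url-pattern-or-set-value]
    pinterest.com
    wikipedia.org<TAB>mi.wikipedia.org
    blogspot.com<TAB>COPY
    docs.google.com<TAB>SINGLEPAGE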
     214   
    175215    //System.err.println("Prematurely terminating for testing purposes.");
    176216    //System.exit(-1);
     217    }
     218
     219    /** Work out the 'domain' for a given url.
     220     * This retains any www. or subdomain prefix.
     221     */
     222    private String getDomainForURL(String url) {
     223    int startIndex = url.indexOf("//"); // http:// or https:// prefix
     224    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
     225    String domain = url.substring(startIndex);
     226    int endIndex = domain.indexOf("/");
     227    if(endIndex == -1) endIndex = domain.length();
     228    domain = domain.substring(0, endIndex);
     229   
     230    return domain;
    177231    }
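A minimal usage sketch (assumed calls, not in the commit) showing what getDomainForURL() returns: the protocol and path are dropped while any www. or subdomain prefix is kept.

    String d1 = getDomainForURL("https://mi.wikipedia.org/wiki/Main_Page"); // -> "mi.wikipedia.org"
    String d2 = getDomainForURL("http://www.example.com/page?x=1");         // -> "www.example.com"
    String d3 = getDomainForURL("example.com/page");                        // -> "example.com" (no protocol, startIndex stays 0)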
    178232   
     
    182236     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
    183237     */
    184     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
     238    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
     239                    File domainURLsFile, File topSiteMatchesFile) {
    185240    // Maintain Sets of unique domains and urls
    186241    // TreeSet: by default, "the elements are ordered using their natural ordering"
     
    204259        while((url = reader.readLine()) != null) { // readLine removes newline separator
    205260       
    206         // work out domain. This retains any www. or subdomain prefix:
    207         int startIndex = url.indexOf("//"); // http:// or https:// prefix
    208         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
    209         domain = url.substring(startIndex);
    210         int endIndex = domain.indexOf("/");
    211         if(endIndex == -1) endIndex = domain.length();
    212         domain = domain.substring(0, endIndex);
     261        // work out domain. This retains any www. or subdomain prefix       
     262        domain = getDomainForURL(url);
    213263
    214264        //urlsSet.add(url);
     
    232282    // We'd have pruned out duplicates by now and have a sorted list of domains,
    233283    // each of which maps to seed URLs in the commoncrawl for that domain
    234    
    235     /*
    236     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
    237         Iterator<String> i = urlsSet.iterator();
    238         while(i.hasNext()) {
    239         String url = i.next();
    240         seedURLsWriter.write(url + "\n");
    241         }
    242        
    243     } catch (IOException ioe) {
    244         ioe.printStackTrace();
    245         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
    246     }
    247     */
    248    
     284
    249285    int domainCount = 0;
    250286    File sitesFolder = new File(outputFolder, "sites");
     
    258294    try (
    259295         // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
     296         // Also a global file listing any urls that matched top sites that didn't specify
     297         // allowed regex patterns
    260298         BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
    261299         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
    262          BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
     300         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
     301         BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
    263302         ) {
     303
     304        // initialise topSiteMatchesFile with some instructional text.
     305        topSiteMatchesWriter.write("The following domains with their seedURLs are on major/top 500 sites\n");
     306        topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
     307        topSiteMatchesWriter.write("Specify one for each such domain in the tab-separated sites-too-big-to-exhaustively-crawl.txt file\n");
     308       
    264309        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
    265310        Set<String> domainsSet = domainsToURLsMap.keySet();
    266311        Iterator<String> domainIterator = domainsSet.iterator();
     312
     313        /*
     314        // DEBUG
     315    String value = topSitesMap.get("wikipedia.org");
     316    if(value == null) {
     317        System.err.println("### wikipedia.org had null value");
     318    } else {
     319        System.err.println("### wikipedia.org had value: " + value);
     320    } // DEBUG
     321        */
    267322       
    268323        while(domainIterator.hasNext()) {
    269         domainCount++;
     324        String domain = domainIterator.next();
     325       
     326        String allowedURLPatternRegex = isURLinTopSitesMap(domain);     
     327        // If the domain is of a topsite for which no allowed URL pattern has been provided
     328        // in sites-too-big-to-exhaustively-crawl.txt,
     329        // then we don't know how to crawl the site. Warn the user by writing the affected
     330        // domain and seedURLs to the topSiteMatchesFile.       
     331        if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
     332            // topsite, but we don't (yet) know what portion can be crawled
     333            // Append the top site and url to a global/toplevel file that
     334            // the user needs to check later and we're done with this domain as it
     335            // won't go into any other file hereafter
     336
     337            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
     338            Iterator<String> urlIterator = urlsForDomainSet.iterator();
     339            while(urlIterator.hasNext()) {
     340            String url = urlIterator.next();
     341            topSiteMatchesWriter.write("\t" + url + "\n");
     342            }
     343           
     344            continue; // done with this domain
     345        }
     346
     347        // start counting the domains we're actually going to process
     348        domainCount++;     
     349       
    270350        String siteID = String.format(FORMATSTR, domainCount);
    271351        File domainFolder = new File(sitesFolder, siteID);
    272352        domainFolder.mkdir();
    273353       
    274         // write out the domain
    275         String domain = domainIterator.next();
     354        // write out the domain     
    276355        //seedURLsWriter.write(domain + "\n");
    277         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
    278         String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
    279         urlFilterWriter.write(regexed_domain + "\n");
     356       
    280357
    281358        // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
     
    297374            // If we ever run nutch on a single seedURLs listing containing
    298375            // all seed pages to crawl sites from, the above two files will work for that.
    299             siteURLsWriter.write(domain + "\n");       
    300             siteRegexWriter.write(regexed_domain + "\n");
    301376           
    302             // next write out the urls for the domain with a tab prefixed to each
    303             // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
     377           
     378            if(allowedURLPatternRegex == null) { // entire site can be crawled
     379            siteURLsWriter.write(domain + "\n");
     380           
     381            // Write out filter in the following form for a site, e.g. for nutch.apache.org:
     382            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
     383            String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";     
     384            urlFilterWriter.write(regexed_domain + "\n"); //global file
     385            siteRegexWriter.write(regexed_domain + "\n"); // site file
     386            }
     387            else { // domain belongs to a top site where only a portion of the site can be crawled
     388           
     389            if(allowedURLPatternRegex.equals("COPY")) { // COPY existing domain as url-filter
     390                siteURLsWriter.write(domain + "\n");
     391                // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
     392                // and not for all of blogspot.com
     393               
     394               String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/";
     395               urlFilterWriter.write(regexed_domain + "\n");
     396               siteRegexWriter.write(regexed_domain + "\n");
     397               
     398            } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) {
     399                // don't write out domain. We want individual pages
     400                //DON'T DO: siteURLsWriter.write(domain + "\n");
     401               
     402                // don't write out domain as a regex expression url filter
     403                // write out the individual seed urls for the domain instead
     404                // since we will only be downloading the single page
     405               
     406                Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
     407                for(String urlInDomain : urlsForDomainSet) {               
     408                String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
     409                urlFilterWriter.write(regexed_url + "\n");
     410                siteRegexWriter.write(regexed_url + "\n");
     411                }
     412            } else { // allowedURLPatternRegex is a url-form - convert to regex
     413                String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\.");
     414                siteURLsWriter.write(domain + "\n");
     415                urlFilterWriter.write(regexed_pattern + "\n");
     416                siteRegexWriter.write(regexed_pattern + "\n");
     417
     418            }
     419            }
     420           
     421            // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
     422            // also write into the global seeds file
     423            // (with a tab prefixed to each url?)
    304424            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
    305425            Iterator<String> urlIterator = urlsForDomainSet.iterator();
    306426            while(urlIterator.hasNext()) {
    307427            String url = urlIterator.next();
    308             seedURLsWriter.write(url + "\n"); // global seedURLs file
    309             siteURLsWriter.write("\t" + url + "\n"); 
     428            seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file
     429            siteURLsWriter.write("\t" + url + "\n");
    310430            }
    311431        } catch (IOException ioe) {
     
    313433            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
    314434        }
    315         }
    316        
     435       
     436        }
     437
    317438    } catch (IOException ioe) {
    318439        ioe.printStackTrace();
     
    335456        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
    336457    }
     458
     459    /*
     460    // BEGIN DEBUG
     461    System.err.println("@@@@ TopSitesMap contains: ");
     462    for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
     463        String topSite = entry.getKey();
     464        String urlPattern = entry.getValue();       
     465        System.err.println(topSite + " - " + urlPattern);
     466    } // END DEBUG
     467    */
     468    }
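To summarise the branching inside createSeedURLsFiles() above, here is a sketch of the url-filter line each kind of topSitesMap value produces (the domains and urls are invented examples; the null case assumes FILTER_REGEX_PREFIX is the "+^https?://([a-z0-9-]+\.)*" prefix shown in the in-code comment):

    // null (not a top site): the whole domain is crawlable
    //   nutch.apache.org                      -> +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
    // "COPY": only the specific subdomain that was encountered
    //   pinky.blogspot.com                    -> +https?://pinky\.blogspot\.com/
    // "SINGLEPAGE": each seed url becomes its own filter
    //   https://shop.example.com/mi/item.html -> +^https://shop\.example\.com/mi/item\.html
    // url-form value (e.g. mi.wikipedia.org from the config file)
    //   mi.wikipedia.org                      -> +^https?://mi\.wikipedia\.org
    // "" (empty value): nothing is written to the filter files; the domain and its
    //   seed urls go to unprocessed-topsite-matches.txt for the user to resolve.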
     469
     470    private String stripSubDomain(String url) {
     471    int index = url.indexOf(".");
     472    if(index != -1) {
     473        url = url.substring(index+1);
     474    }
     475    return url;
     476    }
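A quick behavioural sketch (assumed inputs): stripSubDomain() drops everything up to and including the first dot, and returns the input unchanged when no dot remains.

    // "pinky.blogspot.com" -> "blogspot.com"
    // "blogspot.com"       -> "com"
    // "com"                -> "com" (no dot, unchanged; this is the stopping condition of the caller's loop)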
     477   
     478    /**
     479     * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,
     480     * is in the list of top sites.
     481     * If it is, return the allowed url pattern string stored for that top site (which may be
     482     * the empty string if no pattern was specified in sites-too-big-to-exhaustively-crawl.txt).
     483     * Returns null if the domain does not match any top site.
     484     */
     485    private String isURLinTopSitesMap(String domain) {
     486    boolean keepLooping = true;
     487
     488    // the domain parameter will have retained any www or subdomain prefix, but is stripped of the protocol
     489   
     490    // keep looping, stripping subdomains from url and checking if it matches a topsite domain
     491    // if it does, return the value for that topsite domain in the topSitesMap
     492    // If no match at all, return null.
     493    do {
     494        if(domain.contains("pinterest.com")) {
     495        System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");
     496        }
     497       
     498        String allowed_url_pattern = topSitesMap.get(domain);
     499        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
     500        // there's an entry for the URL in the topSitesMap
     501        System.err.println("##### A top site matches URL domain " + domain);       
     502        return allowed_url_pattern;     
     503        }
     504        // else, no entry for the URL in the topSitesMap
     505        // Not done yet: strip the subdomain from the URL and check that against topSitesMap
     506       
     507        String newURL = stripSubDomain(domain);
     508        if(domain.equals(newURL)) keepLooping = false;
     509        else domain = newURL;
     510    } while(keepLooping);
     511
     512    // url in entirety or stripped of subdomains did not match any of the topsites
     513    return null;
    337514    }
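As an illustration (hypothetical domain), the do-while loop above walks up the domain one label at a time until it either finds a top-site entry or runs out of labels to strip:

    // domain = "pinky.blogspot.com", with "blogspot.com" in topSitesMap:
    //   lookup "pinky.blogspot.com" -> no entry
    //   stripSubDomain -> "blogspot.com" -> entry found, its value (e.g. "COPY") is returned
    // If nothing matches even after stripping down to the last label, null is returned,
    // i.e. the domain is not a top site and can be crawled in full.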
    338515
    339516    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
    340     Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
    341     Iterator<Map.Entry<String, Integer>> i = entries.iterator();
    342     while(i.hasNext()) {
    343         Map.Entry<String, Integer> entry = i.next();
     517    //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
     518    //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
     519    //while(i.hasNext()) {
     520    // Map.Entry<String, Integer> entry = i.next();
     521    for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
    344522        String urlPattern = entry.getKey();
    345523        Integer matchRule = entry.getValue();
     
    379557     */
    380558    public boolean isGreylisted(String url) {
    381     // TODO: alexa top sites and auto-translated product sites
     559    // auto-translated product sites
    382560    return isListedInFilterList(greyList, url);
    383561    }
     
    579757    }
    580758
    581     // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
     759   
     760    // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
    582761    // The former is the only unique one. seedURLs and regex-urlfilters are
    583762    // repeated on a per site/domain basis too, stored in the sites folder
     
    585764    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    586765    File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
    587     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);
     766    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
     767   
     768    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
    588769
    589770    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
     771
     772    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");
     773   
    590774   
    591775    } catch(Exception e) {