Changeset 33518


Timestamp: 2019-09-24T21:13:47+12:00
Author: ak19
Message:

Intermediate commit: got the seed URLs file temporarily written out as each domain followed by CommonCrawl's URLs within that domain. For the next commit, I will try splitting these out into individual files per domain, each with its own regex-urlfilter txt file restricted to just that site/domain, while returning the seed URLs file's output to all URLs, sorted.
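Per the writer code in the diff below, the intermediate layout is each domain on its own line, followed by that domain's URLs, one per line with a leading tab. A hypothetical illustration (the domain is borrowed from a comment in the diff; the URLs are invented):

    nutch.apache.org
    	https://nutch.apache.org/index.html
    	https://nutch.apache.org/downloads.html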

File: 1 edited

Legend:

    ' '  Unmodified
    '+'  Added
    '-'  Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33517)
+++ gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33518)
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 import java.util.TreeSet;
 
…
 
     /** Map of domains we keep and the full urls we're keeping that are of that domain.
-     * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys,
-     * since a HashMap has no notion of ordering.
-     */
-    private TreeMap<String, TreeSet<String>> domainsToURLsMap;
+     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
+     * while a HashMap has no notion of ordering, because we just need to store urls with
+     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
+     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
+     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
+     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
+     */
+    private Map<String, Set<String>> domainsToURLsMap;
 
     // Keep a count of all the records that all WETProcessors instantiated
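As an aside on the open question in the comment above: a minimal standalone sketch (not part of the changeset) showing that a TreeMap of TreeSets keeps both the domains and each domain's URLs in natural order. The pinky.com URLs are the ones the comment itself uses; apache.org is illustrative.

    import java.util.Arrays;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeMap;
    import java.util.TreeSet;

    public class MapOrderDemo {
        public static void main(String[] args) {
            // TreeMap iterates keys in natural (alphabetical) order;
            // HashMap makes no ordering guarantee at all.
            Map<String, Set<String>> domainsToURLsMap = new TreeMap<String, Set<String>>();

            domainsToURLsMap.put("pinky.com", new TreeSet<String>(Arrays.asList(
                "http://pinky.com/toto/nono/file.html",
                "http://pinky.com/toto/index.html")));
            domainsToURLsMap.put("apache.org", new TreeSet<String>(
                Arrays.asList("http://apache.org/")));

            // Prints apache.org before pinky.com, and each domain's URLs
            // sorted, regardless of insertion order.
            for (Map.Entry<String, Set<String>> entry : domainsToURLsMap.entrySet()) {
                System.out.println(entry.getKey());
                for (String url : entry.getValue()) {
                    System.out.println("\t" + url);
                }
            }
        }
    }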
     
…
     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
 
-    Set<String> domainsSet = new TreeSet<String>();
-    Set<String> urlsSet = new TreeSet<String>();
-
+    //Set<String> domainsSet = new TreeSet<String>();
+    //Set<String> urlsSet = new TreeSet<String>();
+    domainsToURLsMap = new TreeMap<String, Set<String>>();
+
     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
 
…
         domain = domain.substring(0, endIndex);
 
-        //if(!domainsMap.containsKey(domain)) {
-        urlsSet.add(url);
-        domainsSet.add(domain);
-        //}
+        //urlsSet.add(url);
+        //domainsSet.add(domain);
+        Set<String> urlsSet;
+        if(!domainsToURLsMap.containsKey(domain)) {
+            urlsSet = new TreeSet<String>();
+            urlsSet.add(url);
+            domainsToURLsMap.put(domain, urlsSet);
+        } else {
+            urlsSet = domainsToURLsMap.get(domain);
+            urlsSet.add(url);
+        }
+
         }
     } catch (IOException ioe) {
…
         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
     }
-
+
+    /*
     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
         Iterator<String> i = urlsSet.iterator();
…
         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
     }
-
+    */
+
+    // write out each domain followed in sequence by all urls we found in that domain
+    // (urls with tab up front)
+    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
+        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
+        Set<String> domainsSet = domainsToURLsMap.keySet();
+        Iterator<String> domainIterator = domainsSet.iterator();
+
+        while(domainIterator.hasNext()) {
+        // write out the domain
+        String domain = domainIterator.next();
+        seedURLsWriter.write(domain + "\n");
+
+        // next write out the urls for the domain with a tab prefixed to each
+        Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+        Iterator<String> urlIterator = urlsForDomainSet.iterator();
+        while(urlIterator.hasNext()) {
+            String url = urlIterator.next();
+            seedURLsWriter.write("\t" + url + "\n");
+        }
+        }
+
+    } catch (IOException ioe) {
+        ioe.printStackTrace();
+        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
+    }
+
+    // write out domains as regular expressions into "regex-urlfilter.txt" file
     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
+        Set<String> domainsSet = domainsToURLsMap.keySet();
         Iterator<String> i = domainsSet.iterator();
         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
…
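For reference, the domain-to-regex mapping that the final comment documents can be sketched as follows. This is a standalone illustration: the dot-escaping helper is an assumption, since the visible hunk ends before showing how CCWETProcessor builds each regex-urlfilter.txt line.

    public class UrlFilterDemo {
        // Prefix copied verbatim from the changeset (note the comment's
        // example shows a "^" after "+" that this prefix does not include).
        static final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*";

        // Hypothetical helper: escape the literal dots so the domain matches exactly.
        static String filterLineForDomain(String domain) {
            return FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
        }

        public static void main(String[] args) {
            // nutch.apache.org => +https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            System.out.println(filterLineForDomain("nutch.apache.org"));
        }
    }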