Changeset 33518

Timestamp: 24.09.2019 21:13:47
Author: ak19
Message:

Intermediate commit: the seed URLs file is temporarily written out as each domain followed by CommonCrawl's URLs within that domain. In the next commit, I will try splitting these into individual files per domain, each with its own regex-urlfilter txt file restricted to just that site/domain, while reverting the seed URLs file's output to all URLs, sorted.
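For illustration, the seed URLs file as temporarily written out by this commit takes the shape below: each domain on its own line, followed by that domain's URLs with a tab up front. The sample domains and URLs here are hypothetical (borrowed from the pinky.com example in the code comments in the diff), not actual CommonCrawl output:

    pinky.com
    	http://pinky.com/toto/index.html
    	http://pinky.com/toto/nono/file.html
    nutch.apache.org
    	http://nutch.apache.org/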

Files: 1 modified

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33517)
+++ gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33518)
@@ -9,4 +9,5 @@
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -92,8 +93,12 @@
 
     /** Map of domains we keep and the full urls we're keeping that are of that domain.
-     * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys,
-     * since a HashMap has no notion of ordering.
-     */
-    private TreeMap<String, TreeSet<String>> domainsToURLsMap;
+     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
+     * while a HashMap has no notion of ordering, because we just need to store urls with
+     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
+     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
+     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
+     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
+     */
+    private Map<String, Set<String>> domainsToURLsMap;
 
     // Keep a count of all the records that all WETProcessors instantiated
@@ -184,7 +189,8 @@
     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
 
-    Set<String> domainsSet = new TreeSet<String>();
-    Set<String> urlsSet = new TreeSet<String>();
-
+    //Set<String> domainsSet = new TreeSet<String>();
+    //Set<String> urlsSet = new TreeSet<String>();
+    domainsToURLsMap = new TreeMap<String, Set<String>>();
+
     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
 
@@ -206,8 +212,16 @@
         domain = domain.substring(0, endIndex);
 
-        //if(!domainsMap.containsKey(domain)) {
-        urlsSet.add(url);
-        domainsSet.add(domain);
-        //}
+        //urlsSet.add(url);
+        //domainsSet.add(domain);
+        Set<String> urlsSet;
+        if(!domainsToURLsMap.containsKey(domain)) {
+            urlsSet = new TreeSet<String>();
+            urlsSet.add(url);
+            domainsToURLsMap.put(domain, urlsSet);
+        } else {
+            urlsSet = domainsToURLsMap.get(domain);
+            urlsSet.add(url);
+        }
+
         }
     } catch (IOException ioe) {
@@ -215,5 +229,6 @@
         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
     }
-
+
+    /*
    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
        Iterator<String> i = urlsSet.iterator();
@@ -227,6 +242,35 @@
        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
    }
-
+    */
+
+    // write out each domain followed in sequence by all urls we found in that domain
+    // (urls with tab up front)
+    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
+        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
+        Set<String> domainsSet = domainsToURLsMap.keySet();
+        Iterator<String> domainIterator = domainsSet.iterator();
+
+        while(domainIterator.hasNext()) {
+        // write out the domain
+        String domain = domainIterator.next();
+        seedURLsWriter.write(domain + "\n");
+
+        // next write out the urls for the domain with a tab prefixed to each
+        Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+        Iterator<String> urlIterator = urlsForDomainSet.iterator();
+        while(urlIterator.hasNext()) {
+            String url = urlIterator.next();
+            seedURLsWriter.write("\t" + url + "\n");
+        }
+        }
+
+    } catch (IOException ioe) {
+        ioe.printStackTrace();
+        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
+    }
+
+    // write out domains as regular expressions into "regex-urlfilter.txt" file
     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
+        Set<String> domainsSet = domainsToURLsMap.keySet();
         Iterator<String> i = domainsSet.iterator();
         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
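For readers following the diff, below is a minimal standalone sketch of the new domain-grouping and write-out logic, assuming hypothetical input URLs and a hypothetical output file name (seedURLs.txt); it is not the actual CCWETProcessor code, which reads its urls from keepURLsFile and extracts domains differently.

    // Minimal standalone sketch (not the actual CCWETProcessor): groups hypothetical
    // URLs by domain in a TreeMap so domains and per-domain URLs stay sorted, then
    // writes each domain followed by its tab-prefixed URLs, as in the diff above.
    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.Map;
    import java.util.Set;
    import java.util.TreeMap;
    import java.util.TreeSet;

    public class SeedURLsSketch {
        public static void main(String[] args) {
            // Hypothetical sample input standing in for the urls read from keepURLsFile
            String[] urls = {
                "http://pinky.com/toto/nono/file.html",
                "http://pinky.com/toto/index.html",
                "http://nutch.apache.org/"
            };

            // TreeMap keeps domains in natural (alphabetical) order; TreeSet does
            // the same for the urls within each domain.
            Map<String, Set<String>> domainsToURLsMap = new TreeMap<String, Set<String>>();

            for (String url : urls) {
                // Crude domain extraction for the sketch: strip the protocol prefix
                // and keep everything up to the first slash.
                String domain = url.replaceFirst("^https?://", "");
                int endIndex = domain.indexOf("/");
                if (endIndex != -1) {
                    domain = domain.substring(0, endIndex);
                }

                Set<String> urlsSet = domainsToURLsMap.get(domain);
                if (urlsSet == null) {
                    urlsSet = new TreeSet<String>();
                    domainsToURLsMap.put(domain, urlsSet);
                }
                urlsSet.add(url);
            }

            // Write out each domain followed by its urls, one per line with a tab up front.
            try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter("seedURLs.txt"))) {
                for (Map.Entry<String, Set<String>> entry : domainsToURLsMap.entrySet()) {
                    seedURLsWriter.write(entry.getKey() + "\n");
                    for (String url : entry.getValue()) {
                        seedURLsWriter.write("\t" + url + "\n");
                    }
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
                System.err.println("Error writing to seedURLs.txt");
            }
        }
    }

Iterating entrySet() avoids the repeated get() per domain that the keySet() version in the diff performs; the behaviour is otherwise the same. Note also that in the committed version, the catch block for the seedURLsFile writer reports "Error writing to " + urlFilterFile, an apparent copy-paste slip; the sketch reports the file it actually writes.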