Changeset 33519

Show
Ignore:
Timestamp:
24.09.2019 21:40:16 (3 weeks ago)
Author:
ak19
Message:

The code still writes out the global seedURLs.txt and regex-urlfilter.txt (in case these remain meaningful), but now also creates an individual directory for each site, containing that site's own seedURLs.txt and regex-urlfilter.txt.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33518 r33519  
    244244    */ 
    245245 
     246    int domainCount = 0; 
     247    File sitesFolder = new File(outputFolder, "sites"); 
     248    if(!sitesFolder.exists()) { 
     249        sitesFolder.mkdir(); 
     250    } 
     251    final String FORMATSTR = "%05d"; 
     252     
    246253    // write out each domain followed in sequence by all urls we found in that domain 
    247254    // (urls with tab up front) 
    248     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 
     255    try ( 
     256         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 
     257         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)) 
     258         ) { 
    249259        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet(); 
    250260        Set<String> domainsSet = domainsToURLsMap.keySet(); 
    251261        Iterator<String> domainIterator = domainsSet.iterator(); 
    252  
     262         
    253263        while(domainIterator.hasNext()) { 
     264        domainCount++; 
     265        String siteID = String.format(FORMATSTR, domainCount); 
     266        File domainFolder = new File(sitesFolder, siteID); 
     267        domainFolder.mkdir(); 
     268         
    254269        // write out the domain 
    255270        String domain = domainIterator.next(); 
    256         seedURLsWriter.write(domain + "\n"); 
    257          
    258         // next write out the urls for the domain with a tab prefixed to each 
    259         Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
    260         Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
    261         while(urlIterator.hasNext()) { 
    262             String url = urlIterator.next(); 
    263             seedURLsWriter.write("\t" + url + "\n"); 
    264         }        
     271        //seedURLsWriter.write(domain + "\n"); 
     272        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
     273        String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 
     274        urlFilterWriter.write(regexed_domain + "\n"); 
     275 
     276        // for every domain, we need sites/0000x/ folder containing its own 
     277        // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt 
     278        // We still have a global seedURLs.txt and regex-urlfilter.txt too. 
     279        File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt 
     280        File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt 
     281        try ( 
     282             BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile)); 
     283             BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile)); 
     284             ) { 
     285            // only write urls and no domain into single global seedurls file 
     286            // But write domain and tabbed urls into individual sites/0000x.txt files 
     287            // and write regexed domain into it too 
     288            siteURLsWriter.write(domain + "\n");         
     289            siteRegexWriter.write(regexed_domain + "\n"); 
     290             
     291            // next write out the urls for the domain with a tab prefixed to each 
     292            // into the sites/0000x/seedURLs.txt file - also write into the global seeds file 
     293            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
     294            Iterator<String> urlIterator = urlsForDomainSet.iterator(); 
     295            while(urlIterator.hasNext()) { 
     296            String url = urlIterator.next(); 
     297            seedURLsWriter.write(url + "\n"); // global seedURLs file 
     298            siteURLsWriter.write("\t" + url + "\n");  
     299            } 
     300        } catch (IOException ioe) { 
     301            ioe.printStackTrace(); 
     302            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile); 
     303        } 
    265304        } 
    266305         
    267306    } catch (IOException ioe) { 
    268307        ioe.printStackTrace(); 
    269         System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 
     308        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile); 
    270309    } 
    271310