Ignore:
Timestamp:
2019-09-24T21:40:16+12:00 (5 years ago)
Author:
ak19
Message:

The code still writes out the global seedURLs.txt and regex-urlfilter.txt (in case this remains meaningful), but now also creates an individual directory for each site, each containing that site's own seedURLs.txt and regex-urlfilter.txt.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33518 r33519  
    244244    */
    245245
     246    int domainCount = 0;
     247    File sitesFolder = new File(outputFolder, "sites");
     248    if(!sitesFolder.exists()) {
     249        sitesFolder.mkdir();
     250    }
     251    final String FORMATSTR = "%05d";
     252   
    246253    // write out each domain followed in sequence by all urls we found in that domain
    247254    // (urls with tab up front)
    248     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
     255    try (
     256         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
     257         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
     258         ) {
    249259        //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
    250260        Set<String> domainsSet = domainsToURLsMap.keySet();
    251261        Iterator<String> domainIterator = domainsSet.iterator();
    252 
     262       
    253263        while(domainIterator.hasNext()) {
     264        domainCount++;
     265        String siteID = String.format(FORMATSTR, domainCount);
     266        File domainFolder = new File(sitesFolder, siteID);
     267        domainFolder.mkdir();
     268       
    254269        // write out the domain
    255270        String domain = domainIterator.next();
    256         seedURLsWriter.write(domain + "\n");
    257        
    258         // next write out the urls for the domain with a tab prefixed to each
    259         Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
    260         Iterator<String> urlIterator = urlsForDomainSet.iterator();
    261         while(urlIterator.hasNext()) {
    262             String url = urlIterator.next();
    263             seedURLsWriter.write("\t" + url + "\n");
    264         }       
     271        //seedURLsWriter.write(domain + "\n");
     272        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
     273        String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
     274        urlFilterWriter.write(regexed_domain + "\n");
     275
     276        // for every domain, we need sites/0000x/ folder containing its own
     277        // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
     278        // We still have a global seedURLs.txt and regex-urlfilter.txt too.
     279        File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
     280        File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
     281        try (
     282             BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
     283             BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
     284             ) {
     285            // only write urls and no domain into single global seedurls file
     286            // But write domain and tabbed urls into individual sites/0000x.txt files
     287            // and write regexed domain into it too
     288            siteURLsWriter.write(domain + "\n");       
     289            siteRegexWriter.write(regexed_domain + "\n");
     290           
     291            // next write out the urls for the domain with a tab prefixed to each
     292            // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
     293            Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
     294            Iterator<String> urlIterator = urlsForDomainSet.iterator();
     295            while(urlIterator.hasNext()) {
     296            String url = urlIterator.next();
     297            seedURLsWriter.write(url + "\n"); // global seedURLs file
     298            siteURLsWriter.write("\t" + url + "\n");
     299            }
     300        } catch (IOException ioe) {
     301            ioe.printStackTrace();
     302            System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
     303        }
    265304        }
    266305       
    267306    } catch (IOException ioe) {
    268307        ioe.printStackTrace();
    269         System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
     308        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
    270309    }
    271310   
Note: See TracChangeset for help on using the changeset viewer.