- Timestamp: 2019-09-24T21:40:16+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33518 r33519 244 244 */ 245 245 246 int domainCount = 0; 247 File sitesFolder = new File(outputFolder, "sites"); 248 if(!sitesFolder.exists()) { 249 sitesFolder.mkdir(); 250 } 251 final String FORMATSTR = "%05d"; 252 246 253 // write out each domain followed in sequence by all urls we found in that domain 247 254 // (urls with tab up front) 248 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 255 try ( 256 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 257 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)) 258 ) { 249 259 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet(); 250 260 Set<String> domainsSet = domainsToURLsMap.keySet(); 251 261 Iterator<String> domainIterator = domainsSet.iterator(); 252 262 253 263 while(domainIterator.hasNext()) { 264 domainCount++; 265 String siteID = String.format(FORMATSTR, domainCount); 266 File domainFolder = new File(sitesFolder, siteID); 267 domainFolder.mkdir(); 268 254 269 // write out the domain 255 270 String domain = domainIterator.next(); 256 seedURLsWriter.write(domain + "\n"); 257 258 // next write out the urls for the domain with a tab prefixed to each 259 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 260 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 261 while(urlIterator.hasNext()) { 262 String url = urlIterator.next(); 263 seedURLsWriter.write("\t" + url + "\n"); 264 } 271 //seedURLsWriter.write(domain + "\n"); 272 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 273 String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 274 urlFilterWriter.write(regexed_domain + "\n"); 275 276 // for every domain, we need sites/0000x/ folder containing its own 277 // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt 278 // We still have a global seedURLs.txt and regex-urlfilter.txt too. 
279 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt 280 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt 281 try ( 282 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile)); 283 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile)); 284 ) { 285 // only write urls and no domain into single global seedurls file 286 // But write domain and tabbed urls into individual sites/0000x.txt files 287 // and write regexed domain into it too 288 siteURLsWriter.write(domain + "\n"); 289 siteRegexWriter.write(regexed_domain + "\n"); 290 291 // next write out the urls for the domain with a tab prefixed to each 292 // into the sites/0000x/seedURLs.txt file - also write into the global seeds file 293 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 294 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 295 while(urlIterator.hasNext()) { 296 String url = urlIterator.next(); 297 seedURLsWriter.write(url + "\n"); // global seedURLs file 298 siteURLsWriter.write("\t" + url + "\n"); 299 } 300 } catch (IOException ioe) { 301 ioe.printStackTrace(); 302 System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile); 303 } 265 304 } 266 305 267 306 } catch (IOException ioe) { 268 307 ioe.printStackTrace(); 269 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);308 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile); 270 309 } 271 310
Note: See TracChangeset for help on using the changeset viewer.