- Timestamp:
- 2019-10-14T21:04:58+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33562 | r33565 |   | line
-------+--------+---+------------------------------------------------------------
   423 |    423 |   | // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
   424 |    424 |   | // If we ever run nutch on a single seedURLs listing containing
   425 |        | - | // all seed pages to crawl sites from, the above two files will work for that.
       |    425 | + | // all seed pages to crawl sites from, the above two files will work for that.
       |    426 | + |
       |    427 | + | // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
       |    428 | + | // also write into the global seeds file (with a tab prefixed to each?)
       |    429 | + | Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
       |    430 | + | for(String url : urlsForDomainSet) {
       |    431 | + | seedURLsWriter.write(url + "\n"); // global seedURLs file
       |    432 | + | siteURLsWriter.write(url + "\n");
       |    433 | + | }
       |    434 | + |
   426 |    435 |   |
   427 |    436 |   | if(allowedURLPatternRegex == null) { // entire site can be crawled
     … |      … |   |
   455 |    464 |   | // since we will only be downloading the single page
   456 |    465 |   |
   457 |        | - | Set<String>urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
       |    466 | + | urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
   458 |    467 |   | for(String urlInDomain : urlsForDomainSet) {
   459 |    468 |   | // don't append slash to end this time
     … |      … |   |
   482 |    491 |   |
   483 |    492 |   | }
   484 |        | - | }
   485 |        | - |
   486 |        | - | // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
   487 |        | - | // also write into the global seeds file (with a tab prefixed to each?)
   488 |        | - | Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
   489 |        | - | for(String url : urlsForDomainSet) {
   490 |        | - | seedURLsWriter.write(url + "\n"); // global seedURLs file
   491 |        | - | siteURLsWriter.write(url + "\n");
   492 |    493 |   | }
   493 |    494 |   |
Note: See TracChangeset for help on using the changeset viewer.