- Timestamp: 2019-11-13T23:08:37+13:00 (4 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33624 r33666 57 57 * e.g. (from maori-lang-detection/src) 58 58 * 59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/ gs3-extensions/maori-lang-detection/to_crawl60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/ gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl 60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl 2>&1 | less 61 61 * 62 62 */ … … 452 452 453 453 // Only write urls and no domain into single global seedurls file 454 // But write domain and tab bed urls into individual sites/0000#/seedURLs.txt454 // But write domain and tab-spaced urls into individual sites/0000#/seedURLs.txt 455 455 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 456 456 // If we ever run nutch on a single seedURLs listing containing … … 515 515 allowedURLPatternRegex += "/"; 516 516 } 517 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);517 String regexed_pattern = FILTER_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex); 518 518 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\."); 519 520 // In case any of the seedURLs themselves are not within the 521 // allowedURLPatternRegex part of the site, FIRST write out such 522 // seedURLs as allowed regex patterns, so they get downloaded 523 // as single pages. 
524 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 525 for(String urlInDomain : urlsForDomainSet) { 526 527 String urlWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(urlInDomain); 528 String allowedURLPatternWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(allowedURLPatternRegex); 529 if(!urlWithoutProtocolAndWWW.startsWith(allowedURLPatternWithoutProtocolAndWWW)) { 530 // don't append slash to end this time 531 String regexed_url = "+^"+escapeStringForRegex(urlInDomain); 532 urlFilterWriter.write(regexed_url + "\n"); 533 siteRegexWriter.write(regexed_url + "\n"); 534 } 535 } 536 519 537 siteURLsWriter.write(domainWithProtocol + "\n"); 538 // write out allowedURLPatternRegex istead of the domain 539 //siteURLsWriter.write(allowedURLPatternRegex + "\n"); 540 541 // Now restrict any other URLs found to be within the allowedURLPattern 542 // part of the site 520 543 urlFilterWriter.write(regexed_pattern + "\n"); 521 siteRegexWriter.write(regexed_pattern + "\n"); 522 544 siteRegexWriter.write(regexed_pattern + "\n"); 523 545 } 524 546 }
Note: See TracChangeset for help on using the changeset viewer.