Changeset 33666 for other-projects/maori-lang-detection/src/org
- Timestamp:
- 2019-11-13T23:08:37+13:00 (5 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33624 r33666 57 57 * e.g. (from maori-lang-detection/src) 58 58 * 59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/ gs3-extensions/maori-lang-detection/to_crawl60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/ gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl 60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/maori-lang-detection/to_crawl 2>&1 | less 61 61 * 62 62 */ … … 452 452 453 453 // Only write urls and no domain into single global seedurls file 454 // But write domain and tab bed urls into individual sites/0000#/seedURLs.txt454 // But write domain and tab-spaced urls into individual sites/0000#/seedURLs.txt 455 455 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 456 456 // If we ever run nutch on a single seedURLs listing containing … … 515 515 allowedURLPatternRegex += "/"; 516 516 } 517 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);517 String regexed_pattern = FILTER_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex); 518 518 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\."); 519 520 // In case any of the seedURLs themselves are not within the 521 // allowedURLPatternRegex part of the site, FIRST write out such 522 // seedURLs as allowed regex patterns, so they get downloaded 523 // as single pages. 
524 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 525 for(String urlInDomain : urlsForDomainSet) { 526 527 String urlWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(urlInDomain); 528 String allowedURLPatternWithoutProtocolAndWWW = Utility.stripProtocolAndWWWFromURL(allowedURLPatternRegex); 529 if(!urlWithoutProtocolAndWWW.startsWith(allowedURLPatternWithoutProtocolAndWWW)) { 530 // don't append slash to end this time 531 String regexed_url = "+^"+escapeStringForRegex(urlInDomain); 532 urlFilterWriter.write(regexed_url + "\n"); 533 siteRegexWriter.write(regexed_url + "\n"); 534 } 535 } 536 519 537 siteURLsWriter.write(domainWithProtocol + "\n"); 538 // write out allowedURLPatternRegex instead of the domain 539 //siteURLsWriter.write(allowedURLPatternRegex + "\n"); 540 541 // Now restrict any other URLs found to be within the allowedURLPattern 542 // part of the site 520 543 urlFilterWriter.write(regexed_pattern + "\n"); 521 siteRegexWriter.write(regexed_pattern + "\n"); 522 544 siteRegexWriter.write(regexed_pattern + "\n"); 523 545 } 524 546 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java
r33623 r33666 60 60 throws Exception 61 61 { 62 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix63 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion64 String domain = domainWithProtocol.substring(startIndex);62 //int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 63 //startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 64 String domain = stripProtocolFromURL(domainWithProtocol); //domainWithProtocol.substring(startIndex); 65 65 66 66 // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP … … 82 82 } 83 83 84 public static String stripProtocolAndWWWFromURL(String url) { 85 url = stripProtocolFromURL(url); 86 87 if(url.startsWith("www.")) { // strip any "www." at start as well 88 url = url.substring(4); 89 } 90 91 return url; 92 } 93 94 public static String stripProtocolFromURL(String url) { 95 int startIndex = url.indexOf("//"); // for http:// or https:// prefix 96 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 97 return url.substring(startIndex); 98 } 99 100 84 101 /** Work out the 'domain' for a given url. 85 102 * This retains any www. or subdomain prefix. … … 88 105 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 89 106 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 90 // the keep the URLaround in case param withProtocol=true107 // keep the protocol around in case param withProtocol=true 91 108 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex); 92 109
Note:
See TracChangeset
for help on using the changeset viewer.