- Timestamp:
- 2019-11-05T21:04:09+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33615 r33623 237 237 } 238 238 239 /** Work out the 'domain' for a given url.240 * This retains any www. or subdomain prefix.241 */242 public static String getDomainForURL(String url, boolean withProtocol) {243 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion245 // the keep the URL around in case param withProtocol=true246 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);247 248 String domain = url.substring(startIndex);249 int endIndex = domain.indexOf("/");250 if(endIndex == -1) endIndex = domain.length();251 domain = domain.substring(0, endIndex);252 253 if(withProtocol) {254 // now that we have the domain (everything to the first / when there is no protocol)255 // can glue the protocol back on256 domain = protocol + domain;257 }258 259 return domain;260 }261 239 262 240 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ … … 304 282 // work out domain. This retains any www. or subdomain prefix 305 283 // passing true to further also retain the http(s) protocol 306 domainWithProtocol = getDomainForURL(url, true);284 domainWithProtocol = Utility.getDomainForURL(url, true); 307 285 308 286 Set<String> urlsSet; … … 316 294 } 317 295 296 /* 318 297 // Dr Nichols said that a url that was located outside the country and 319 298 // which had /mi/ URLs was more likely to be an autotranslated (product) site. … … 322 301 // then add that domain (if not already added) and that url into a file 323 302 // for later manual inspection 324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 325 /* 303 if(!domainWithProtocol.endsWith(".nz") 304 && (url.contains("/mi/") || url.endsWith("/mi"))) { 305 326 306 if(!possibleProductDomains.contains(domainWithProtocol)) { 327 307 … … 345 325 if(!isInNZ) { 346 326 possibleProductDomains.add(domainWithProtocol); 347 // write both domain and a sample URL on that site out to file327 // write both domain and a sample seedURL on that site out to file 348 328 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n"); 349 329 possibleProductSitesWriter.write("\t" + url + "\n"); 350 330 } 351 }*/ /*else { 352 // already wrote out domain to file at some point, write just the URL out to file 353 possibleProductSitesWriter.write("\t" + url + "\n"); 354 }*/ 355 } 331 } 332 //else { 333 // already wrote out domain to file at some point, write just the URL out to file 334 //possibleProductSitesWriter.write("\t" + url + "\n"); 335 //} 336 } 337 */ 356 338 } 357 339 } catch (IOException ioe) { … … 686 668 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually, 687 669 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted 688 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol670 String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol 689 671 if(!isBlackListed && url.contains("jasmin")) { 690 672 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); … … 821 803 public static void printUsage() { 822 804 System.err.println("Run this program as:"); 823 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");805 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>"); 824 806 } 825 807
Note:
See TracChangeset
for help on using the changeset viewer.