- Timestamp: 2019-10-24T22:04:37+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33582 r33603 275 275 */ 276 276 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 277 File domainURLsFile, File topSiteMatchesFile) { 277 File domainURLsFile, File topSiteMatchesFile, 278 File possibleProductSitesFile) { 278 279 // Maintain a Map of unique domains mapped to seed urls at that domain 279 280 // TreeSet: by default, "the elements are ordered using their natural ordering" … … 286 287 final String PROTOCOL_REGEX_PREFIX = "+^https?://"; 287 288 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 289 290 // keep an eye out on URLs we need to inspect later 291 Set<String> possibleProductDomains = new TreeSet<String>(); 292 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile()); 288 293 289 294 try ( 290 295 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile)); 296 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile)); 291 297 ) { 292 298 … … 310 316 } 311 317 318 // Dr Nichols said that a url that was located outside the country and 319 // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
320 // Following Dr Nichols' idea, let's keep a look out for more product sites: 321 // if any URL contains /mi AND the tld of its domain is outside of New Zealand 322 // then add that domain (if not already added) and that url into a file 323 // for later manual inspection 324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 325 if(!possibleProductDomains.contains(domainWithProtocol)) { 326 // more expensive test, so do this only if above conditions are true: 327 if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 328 possibleProductDomains.add(domainWithProtocol); 329 // write both domain and URL out to file 330 possibleProductSitesWriter.write(domainWithProtocol + "\n"); 331 possibleProductSitesWriter.write("\t" + url + "\n"); 332 } 333 } else { 334 // already wrote out domain to file, write just the URL out to file 335 possibleProductSitesWriter.write("\t" + url + "\n"); 336 } 337 } 312 338 } 313 339 } catch (IOException ioe) { … … 334 360 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 335 361 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)); 336 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)) 362 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)); 337 363 ) { 338 364 … … 427 453 siteURLsWriter.write(url + "\n"); 428 454 } 429 430 455 431 456 if(allowedURLPatternRegex == null) { // entire site can be crawled … … 887 912 File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 888 913 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 889 890 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile); 914 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt"); 915 916 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, 
urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 891 917 892 918 info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
Note: See TracChangeset for help on using the changeset viewer.