Changeset 33603 for gs3-extensions
- Timestamp:
- 2019-10-24T22:04:37+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 2 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt
r33565 r33603 314 314 315 315 316 316 ------------------- 317 Dr Nichols's suggestion: can store listing of potential product sites to inspect by checking url for /mi in combination with whether the domain's IP geolocates to OUTSIDE New Zealand (tld nz). 318 * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java 319 - https://mvnrepository.com/artifact/com.maxmind.geoip/geoip-api/1.2.10 320 - older .dat.gz file is archived at https://web.archive.org/web/20180917084618/http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz 321 - and newer geo country data at https://dev.maxmind.com/geoip/geoip2/geolite2/ 322 * https://dev.maxmind.com/geoip/geoip2/geolite2/ 323 * older GeoIp API (has LookupService): https://github.com/maxmind/geoip-api-java 324 * Newer GeoIp2 API: https://dev.maxmind.com/geoip/geoip2/downloadable/#MaxMind_APIs 325 and https://maxmind.github.io/GeoIP2-java/doc/v2.12.0/ 326 * https://maxmind.github.io/GeoIP2-java/ 327 * https://github.com/AtlasOfLivingAustralia/ala-hub/issues/11 328 329 330 --- 331 https://check-host.net/ip-info 332 https://ipinfo.info/html/ip_checker.php 333 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33582 r33603 275 275 */ 276 276 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 277 File domainURLsFile, File topSiteMatchesFile) { 277 File domainURLsFile, File topSiteMatchesFile, 278 File possibleProductSitesFile) { 278 279 // Maintain a Map of unique domains mapped to seed urls at that domain 279 280 // TreeSet: by default, "the elements are ordered using their natural ordering" … … 286 287 final String PROTOCOL_REGEX_PREFIX = "+^https?://"; 287 288 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 289 290 // keep an eye out on URLs we need to inspect later 291 Set<String> possibleProductDomains = new TreeSet<String>(); 292 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile()); 288 293 289 294 try ( 290 295 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile)); 296 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile)); 291 297 ) { 292 298 … … 310 316 } 311 317 318 // Dr Nichols said that a url that was located outside the country and 319 // which had /mi/ URLs was more likely to be an autotranslated (product) site. 320 // Following Dr Nichols' idea, let's keep a look out for more product sites: 321 // if any URL contains /mi AND the tld of its domain is outside of New Zealand 322 // then add that domain (if not already added) and that url into a file 323 // for later manual inspection 324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 325 if(!possibleProductDomains.contains(domainWithProtocol)) { 326 // more expensive test, so do this only if above conditions are true: 327 if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 328 possibleProductDomains.add(domainWithProtocol); 329 // write both domain and URL out to file 330 possibleProductSitesWriter.write(domainWithProtocol + "\n"); 331 possibleProductSitesWriter.write("\t" + url + "\n"); 332 } 333 } else { 334 // already wrote out domain to file, write just the URL out to file 335 possibleProductSitesWriter.write("\t" + url + "\n"); 336 } 337 } 312 338 } 313 339 } catch (IOException ioe) { … … 334 360 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 335 361 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)); 336 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)) 362 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)); 337 363 ) { 338 364 … … 427 453 siteURLsWriter.write(url + "\n"); 428 454 } 429 430 455 431 456 if(allowedURLPatternRegex == null) { // entire site can be crawled … … 887 912 File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 888 913 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 889 890 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile); 914 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt"); 915 916 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 891 917 892 918 info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java
r33467 r33603 2 2 3 3 import java.io.*; 4 import java.net.InetAddress; 4 5 import java.util.zip.GZIPInputStream; 6 7 import com.maxmind.geoip.*; // for LookupService and Location 5 8 6 9 import org.apache.log4j.Logger; … … 41 44 } 42 45 46 /** 47 * Attribution following below is as per https://dev.maxmind.com/geoip/geoip2/geolite2/ 48 * 49 * This product includes GeoLite2 data created by MaxMind, available from 50 * <a href="https://www.maxmind.com">https://www.maxmind.com</a>. 51 * 52 * Usage: 53 * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java 54 * version I'm using: https://github.com/maxmind/geoip-api-java 55 * Newer version: https://maxmind.github.io/GeoIP2-java/ 56 */ 57 public static boolean isDomainInCountry(String domainWithProtocol, 58 String countryCode, File geoLiteCityDatFile) 59 { 60 countryCode = countryCode.toUpperCase(); 61 62 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 63 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 64 String domain = domainWithProtocol.substring(startIndex); 65 66 boolean result = false; 67 try { 68 // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP 69 LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE); 70 71 // get IP for domain 72 InetAddress inetAddress = InetAddress.getByName(domain); 73 String ipAddress = inetAddress.getHostAddress(); 74 75 // get location object for IP 76 Location location = cl.getLocation(ipAddress); 77 78 if(location != null) { 79 // compare country code with tld parameter 80 System.err.println("@@@@ Got country code: " + location.countryCode); 81 result = location.countryCode.equals(countryCode); 82 } else { 83 System.err.println("@@@@ No location info in DB for: " + domainWithProtocol); 84 } 85 } catch(Exception e) { 86 e.printStackTrace(); 87 System.err.println("Could not check if domain " + domain + " was in country: " + countryCode); 88 } finally { 89 return result; 90 } 91 } 43 92 }
Note:
See TracChangeset
for help on using the changeset viewer.