Changeset 33604
- Timestamp:
- 2019-10-24T23:22:30+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt
r33569  r33604
    61      61  # special case
    62      62  mi.centr-zashity.ru,SINGLEPAGE
            63
            64  martinvrijland.nl,martinvrijland.nl/mi/
            65  csunplugged.org,csunplugged.org/mi/
            66  gpedia.com,gpedia.com/mi/
    63      67
    64      68  # TOP SITE BUT NOT TOP 500
-
gs3-extensions/maori-lang-detection/conf/url-whitelist-filter.txt
r33569  r33604
     2       2  # whitelist overrides blacklist and greylist.
     3       3  # FORMAT:
     4          # precede URL by ^ to greylist urls that match the given prefix
     5          # succeed URL by $ to greylist urls that match the given suffix
     6          # ^url$ will greylist urls that match the given url completely
     7          # Without either ^ or $ symbol, urls containing the given url will get greylisted
             4  # precede URL by ^ to whitelist urls that match the given prefix
             5  # succeed URL by $ to whitelist urls that match the given suffix
             6  # ^url$ will whitelist urls that match the given url completely
             7  # Without either ^ or $ symbol, urls containing the given url will get whitelisted
     8       8
     9       9  # Special exception for this url on yale.edu, since we needed to blacklist
     …       …
    15      15  http://www.krassotkin.ru/sites/prayer.su/maori/
    16      16  https://mi.centr-zashity.ru/
            17
            18
            19
            20  # WHITELIST WEBSITES THAT HAVE NON-AUTOMATED /mi/ SUBSECTIONS
            21  # WE CONTROL WHAT PART OF THEM WILL BE DOWNLOADED (THE /mi SUBSECTION)
            22  # IN sites-too-big-to-exhaustively-crawl.txt
            23  #https://www.martinvrijland.nl/mi/te-mana-hinengaro/Ko-te-nuinga-ake-o-nga-tangata-kei-te-timata-ki-te-kite-kei-te-noho-tatou-i-roto-i-te-whakaata-ko-te-aha-tenei/
            24  #https://www.csunplugged.org/mi/principles/
            25  #http://www.gpedia.com/mi/gpedia/Reo_M%C4%81ori
            26
            27  https://www.martinvrijland.nl
            28  https://www.csunplugged.org
            29  http://www.gpedia.com
            30
            31
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33603  r33604
   324     324  if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
   325     325      if(!possibleProductDomains.contains(domainWithProtocol)) {
   326              // more expensive test, so do this only if above conditions are true:
   327              if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
           326
           327          String countryCode = "";
           328          try {
           329              // more expensive test, so do this only if above conditions are true:
           330              countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
           331              System.err.println("@@@@ Got country code: " + countryCode);
           332          } catch(Exception exceptObj) {
           333              countryCode = ""; // forces domain to be included for inspection
           334
           335              error("Could not check if domain " + domainWithProtocol
           336                    + " was in country: " + countryCode,
           337                    exceptObj);
           338          }
           339
           340          boolean isInNZ = countryCode.toLowerCase().equals("nz");
           341
           342
           343          //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
           344          if(!isInNZ) {
   328     345              possibleProductDomains.add(domainWithProtocol);
   329                      // write both domain and URLout to file
   330                      possibleProductSitesWriter.write( domainWithProtocol + "\n");
           346              // write both domain and a sample URL on that site out to file
           347              possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
   331     348              possibleProductSitesWriter.write("\t" + url + "\n");
   332     349          }
   333              } else {
   334                  // already wrote out domain to file , write just the URL out to file
           350      } /*else {
           351          // already wrote out domain to file at some point, write just the URL out to file
   335     352          possibleProductSitesWriter.write("\t" + url + "\n");
   336              }
           353      }*/
   337     354  }
   338     355  }
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java
r33603  r33604
    54      54  * version I'm using: https://github.com/maxmind/geoip-api-java
    55      55  * Newer version: https://maxmind.github.io/GeoIP2-java/
            56  *
            57  * @return 2 letter countrycode in uppercase or an exception
    56      58  */
            59  public static String getCountryCodeOfDomain(String domainWithProtocol, File geoLiteCityDatFile)
            60      throws Exception
            61  {
            62      int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
            63      startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
            64      String domain = domainWithProtocol.substring(startIndex);
            65
            66      // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP
            67      LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);
            68
            69      // get IP for domain
            70      InetAddress inetAddress = InetAddress.getByName(domain);
            71      String ipAddress = inetAddress.getHostAddress();
            72
            73      // get location object for IP
            74      Location location = cl.getLocation(ipAddress);
            75
            76      if(location == null) {
            77          throw new Exception("@@@@ No location info in DB for: " + domain);
            78      } else {
            79          return location.countryCode;
            80      }
            81
            82  }
            83
    57      84  public static boolean isDomainInCountry(String domainWithProtocol,
    58      85      String countryCode, File geoLiteCityDatFile)
Note:
See TracChangeset
for help on using the changeset viewer.