Changeset 33603

Show
Ignore:
Timestamp:
24.10.2019 22:04:37 (3 weeks ago)
Author:
ak19
Message:

Incorporating Dr Nichols' suggestion to help weed out product sites: if a seed URL contains /mi/ (or ends with /mi) and its domain is outside NZ (the domain does not end in .nz and its IP does not geolocate to New Zealand), add the domain and the URL to possible-product-sites.txt. This list should hopefully be much smaller than the set of all URLs containing /mi and, because these sites are located outside NZ, each one is more likely to be a product site than not.

Location:
gs3-extensions/maori-lang-detection
Files:
2 added
3 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33565 r33603  
    314314 
    315315 
    316  
     316------------------- 
     317Dr Nichols's suggestion: we can store a listing of potential product sites to inspect later, by checking whether a url contains /mi in combination with whether the domain's IP geolocates to OUTSIDE New Zealand (whose tld is nz). 
     318* https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java 
     319  - https://mvnrepository.com/artifact/com.maxmind.geoip/geoip-api/1.2.10 
     320  - older .dat.gz file is archived at https://web.archive.org/web/20180917084618/http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz 
     321  - and newer geo country data at https://dev.maxmind.com/geoip/geoip2/geolite2/ 
     322* https://dev.maxmind.com/geoip/geoip2/geolite2/ 
     323* older GeoIp API (has LookupService): https://github.com/maxmind/geoip-api-java 
     324* Newer GeoIp2 API: https://dev.maxmind.com/geoip/geoip2/downloadable/#MaxMind_APIs 
     325    and https://maxmind.github.io/GeoIP2-java/doc/v2.12.0/ 
     326* https://maxmind.github.io/GeoIP2-java/ 
     327* https://github.com/AtlasOfLivingAustralia/ala-hub/issues/11 
     328 
     329 
     330--- 
     331https://check-host.net/ip-info 
     332https://ipinfo.info/html/ip_checker.php 
     333 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33582 r33603  
    275275     */ 
    276276    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 
    277                     File domainURLsFile, File topSiteMatchesFile) { 
     277                    File domainURLsFile, File topSiteMatchesFile, 
     278                    File possibleProductSitesFile) { 
    278279    // Maintain a Map of unique domains mapped to seed urls at that domain 
    279280    // TreeSet: by default, "the elements are ordered using their natural ordering" 
     
    286287    final String PROTOCOL_REGEX_PREFIX = "+^https?://"; 
    287288    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 
     289 
     290    // keep an eye out on URLs we need to inspect later 
     291    Set<String> possibleProductDomains = new TreeSet<String>(); 
     292    File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile()); 
    288293     
    289294    try ( 
    290295         BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile)); 
     296         BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile)); 
    291297         ) { 
    292298         
     
    310316        } 
    311317 
     318        // Dr Nichols said that a url that was located outside the country and 
     319        // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
     320        // Following Dr Nichols' idea, let's keep a look out for more product sites: 
     321        // if any URL contains /mi AND the tld of its domain is outside of New Zealand 
     322        // then add that domain (if not already added) and that url into a file 
     323        // for later manual inspection 
     324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     325            if(!possibleProductDomains.contains(domainWithProtocol)) { 
     326            // more expensive test, so do this only if above conditions are true: 
     327            if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 
     328                possibleProductDomains.add(domainWithProtocol); 
     329                // write both domain and URL out to file 
     330                possibleProductSitesWriter.write(domainWithProtocol + "\n"); 
     331                possibleProductSitesWriter.write("\t" + url + "\n"); 
     332            } 
     333            } else { 
     334            // already wrote out domain to file, write just the URL out to file 
     335            possibleProductSitesWriter.write("\t" + url + "\n"); 
     336            } 
     337        } 
    312338        } 
    313339    } catch (IOException ioe) { 
     
    334360         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 
    335361         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)); 
    336          BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile)) 
     362         BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));        
    337363         ) { 
    338364 
     
    427453            siteURLsWriter.write(url + "\n"); 
    428454            } 
    429              
    430455             
    431456            if(allowedURLPatternRegex == null) { // entire site can be crawled 
     
    887912    File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 
    888913    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 
    889      
    890     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile); 
     914    File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt"); 
     915     
     916    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 
    891917 
    892918    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33467 r33603  
    22 
    33import java.io.*; 
     4import java.net.InetAddress; 
    45import java.util.zip.GZIPInputStream; 
     6 
     7import com.maxmind.geoip.*; // for LookupService and Location 
    58 
    69import org.apache.log4j.Logger; 
     
    4144    } 
    4245 
     46    /** 
     47     * Attribution following below is as per https://dev.maxmind.com/geoip/geoip2/geolite2/ 
     48     * 
     49     * This product includes GeoLite2 data created by MaxMind, available from 
     50     * <a href="https://www.maxmind.com">https://www.maxmind.com</a>. 
     51     * 
     52     * Usage: 
     53     * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java 
     54     * version I'm using: https://github.com/maxmind/geoip-api-java 
     55     * Newer version: https://maxmind.github.io/GeoIP2-java/ 
     56     */ 
     57    public static boolean isDomainInCountry(String domainWithProtocol, 
     58                        String countryCode, File geoLiteCityDatFile) 
     59    { 
     60    countryCode = countryCode.toUpperCase(); 
     61     
     62    int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 
     63    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     64    String domain = domainWithProtocol.substring(startIndex);    
     65     
     66    boolean result = false; 
     67    try { 
     68        // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP 
     69        LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE); 
     70         
     71        // get IP for domain 
     72        InetAddress inetAddress = InetAddress.getByName(domain); 
     73        String ipAddress = inetAddress.getHostAddress(); 
     74         
     75        // get location object for IP 
     76        Location location = cl.getLocation(ipAddress); 
     77 
     78        if(location != null) { 
     79        // compare country code with tld parameter 
     80        System.err.println("@@@@ Got country code: " + location.countryCode); 
     81        result = location.countryCode.equals(countryCode); 
     82        } else { 
     83        System.err.println("@@@@ No location info in DB for: " + domainWithProtocol); 
     84        } 
     85    } catch(Exception e) { 
     86        e.printStackTrace(); 
     87        System.err.println("Could not check if domain " + domain + " was in country: " + countryCode); 
     88    } finally { 
     89        return result; 
     90    } 
     91    } 
    4392}