Changeset 33603


Ignore:
Timestamp:
2019-10-24T22:04:37+13:00 (4 years ago)
Author:
ak19
Message:

Incorporating Dr Nichols's suggestion to help weed out product sites: if the tld of a seed URL containing /mi/ is outside NZ, add it to the list in possible-product-sites.txt. This should hopefully be a smaller number than all URLs containing /mi and, because these sites are located outside NZ, they are more likely to be product sites than not.

Location:
gs3-extensions/maori-lang-detection
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33565 r33603  
    314314
    315315
    316 
     316-------------------
     317Dr Nichols's suggestion: can store listing of potential product sites to inspect by checking url for /mi in combination with whether the domain's IP geolocates to OUTSIDE New Zealand (tld nz).
     318* https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java
     319  - https://mvnrepository.com/artifact/com.maxmind.geoip/geoip-api/1.2.10
     320  - older .dat.gz file is archived at https://web.archive.org/web/20180917084618/http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz
     321  - and newer geo country data at https://dev.maxmind.com/geoip/geoip2/geolite2/
     322* https://dev.maxmind.com/geoip/geoip2/geolite2/
     323* older GeoIp API (has LookupService): https://github.com/maxmind/geoip-api-java
     324* Newer GeoIp2 API: https://dev.maxmind.com/geoip/geoip2/downloadable/#MaxMind_APIs
     325    and https://maxmind.github.io/GeoIP2-java/doc/v2.12.0/
     326* https://maxmind.github.io/GeoIP2-java/
     327* https://github.com/AtlasOfLivingAustralia/ala-hub/issues/11
     328
     329
     330---
     331https://check-host.net/ip-info
     332https://ipinfo.info/html/ip_checker.php
     333
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33582 r33603  
    275275     */
    276276    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
    277                     File domainURLsFile, File topSiteMatchesFile) {
     277                    File domainURLsFile, File topSiteMatchesFile,
     278                    File possibleProductSitesFile) {
    278279    // Maintain a Map of unique domains mapped to seed urls at that domain
    279280    // TreeSet: by default, "the elements are ordered using their natural ordering"
     
    286287    final String PROTOCOL_REGEX_PREFIX = "+^https?://";
    287288    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
     289
     290    // keep an eye out on URLs we need to inspect later
     291    Set<String> possibleProductDomains = new TreeSet<String>();
     292    File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
    288293   
    289294    try (
    290295         BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
     296         BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
    291297         ) {
    292298       
     
    310316        }
    311317
     318        // Dr Nichols said that a url that was located outside the country and
     319        // which had /mi/ URLs was more likely to be an autotranslated (product) site.
     320        // Following Dr Nichols' idea, let's keep a look out for more product sites:
     321        // if any URL contains /mi AND the tld of its domain is outside of New Zealand
     322        // then add that domain (if not already added) and that url into a file
     323        // for later manual inspection
     324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
     325            if(!possibleProductDomains.contains(domainWithProtocol)) {
     326            // more expensive test, so do this only if above conditions are true:
     327            if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     328                possibleProductDomains.add(domainWithProtocol);
     329                // write both domain and URL out to file
     330                possibleProductSitesWriter.write(domainWithProtocol + "\n");
     331                possibleProductSitesWriter.write("\t" + url + "\n");
     332            }
     333            } else {
     334            // already wrote out domain to file, write just the URL out to file
     335            possibleProductSitesWriter.write("\t" + url + "\n");
     336            }
     337        }
    312338        }
    313339    } catch (IOException ioe) {
     
    334360         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
    335361         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
    336          BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
     362         BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));       
    337363         ) {
    338364
     
    427453            siteURLsWriter.write(url + "\n");
    428454            }
    429            
    430455           
    431456            if(allowedURLPatternRegex == null) { // entire site can be crawled
     
    887912    File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
    888913    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
    889    
    890     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
     914    File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
     915   
     916    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
    891917
    892918    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33467 r33603  
    22
    33import java.io.*;
     4import java.net.InetAddress;
    45import java.util.zip.GZIPInputStream;
     6
     7import com.maxmind.geoip.*; // for LookupService and Location
    58
    69import org.apache.log4j.Logger;
     
    4144    }
    4245
     46    /**
     47     * Attribution following below is as per https://dev.maxmind.com/geoip/geoip2/geolite2/
     48     *
     49     * This product includes GeoLite2 data created by MaxMind, available from
     50     * <a href="https://www.maxmind.com">https://www.maxmind.com</a>.
     51     *
     52     * Usage:
     53     * https://stackoverflow.com/questions/1415851/best-way-to-get-geo-location-in-java
     54     * version I'm using: https://github.com/maxmind/geoip-api-java
     55     * Newer version: https://maxmind.github.io/GeoIP2-java/
     56     */
     57    public static boolean isDomainInCountry(String domainWithProtocol,
     58                        String countryCode, File geoLiteCityDatFile)
     59    {
     60    countryCode = countryCode.toUpperCase();
     61   
     62    int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
     63    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
     64    String domain = domainWithProtocol.substring(startIndex);   
     65   
     66    boolean result = false;
     67    try {
     68        // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP
     69        LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);
     70       
     71        // get IP for domain
     72        InetAddress inetAddress = InetAddress.getByName(domain);
     73        String ipAddress = inetAddress.getHostAddress();
     74       
     75        // get location object for IP
     76        Location location = cl.getLocation(ipAddress);
     77
     78        if(location != null) {
     79        // compare country code with tld parameter
     80        System.err.println("@@@@ Got country code: " + location.countryCode);
     81        result = location.countryCode.equals(countryCode);
     82        } else {
     83        System.err.println("@@@@ No location info in DB for: " + domainWithProtocol);
     84        }
     85    } catch(Exception e) {
     86        e.printStackTrace();
     87        System.err.println("Could not check if domain " + domain + " was in country: " + countryCode);
     88    } finally {
     89        return result;
     90    }
     91    }
    4392}
Note: See TracChangeset for help on using the changeset viewer.