Changeset 33604

Show
Ignore:
Timestamp:
24.10.2019 23:22:30 (3 weeks ago)
Author:
ak19
Message:

1. Improved the output written to possible-product-sites.txt: each listed domain is now prefixed with the country code of the (overseas) country it is hosted in, to help decide whether the site is worth keeping or not. 2. Updated the whitelist and top-sites filters to grab the /mi/ subsections of sites that don't appear to be auto-translated. This is done in preparation for blocking out product sites from here on.

Location:
gs3-extensions/maori-lang-detection
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

    r33569 r33604  
    6161# special case 
    6262mi.centr-zashity.ru,SINGLEPAGE 
     63 
     64martinvrijland.nl,martinvrijland.nl/mi/ 
     65csunplugged.org,csunplugged.org/mi/ 
     66gpedia.com,gpedia.com/mi/ 
    6367 
    6468# TOP SITE BUT NOT TOP 500 
  • gs3-extensions/maori-lang-detection/conf/url-whitelist-filter.txt

    r33569 r33604  
    22# whitelist overrides blacklist and greylist. 
    33# FORMAT: 
    4 # precede URL by ^ to greylist urls that match the given prefix 
    5 # succeed URL by $ to greylist urls that match the given suffix 
    6 # ^url$ will greylist urls that match the given url completely 
    7 # Without either ^ or $ symbol, urls containing the given url will get greylisted 
     4# precede URL by ^ to whitelist urls that match the given prefix 
     5# succeed URL by $ to whitelist urls that match the given suffix 
     6# ^url$ will whitelist urls that match the given url completely 
     7# Without either ^ or $ symbol, urls containing the given url will get whitelisted 
    88 
    99# Special exception for this url on yale.edu, since we needed to blacklist 
     
    1515http://www.krassotkin.ru/sites/prayer.su/maori/ 
    1616https://mi.centr-zashity.ru/ 
     17 
     18 
     19 
     20# WHITELIST WEBSITES THAT HAVE NON-AUTOMATED /mi/ SUBSECTIONS 
     21# WE CONTROL WHAT PART OF THEM WILL BE DOWNLOADED (THE /mi SUBSECTION) 
     22# IN sites-too-big-to-exhaustively-crawl.txt 
     23#https://www.martinvrijland.nl/mi/te-mana-hinengaro/Ko-te-nuinga-ake-o-nga-tangata-kei-te-timata-ki-te-kite-kei-te-noho-tatou-i-roto-i-te-whakaata-ko-te-aha-tenei/ 
     24#https://www.csunplugged.org/mi/principles/ 
     25#http://www.gpedia.com/mi/gpedia/Reo_M%C4%81ori 
     26 
     27https://www.martinvrijland.nl 
     28https://www.csunplugged.org 
     29http://www.gpedia.com 
     30 
     31 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33603 r33604  
    324324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    325325            if(!possibleProductDomains.contains(domainWithProtocol)) { 
    326             // more expensive test, so do this only if above conditions are true: 
    327             if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 
     326 
     327            String countryCode = ""; 
     328            try { 
     329                // more expensive test, so do this only if above conditions are true: 
     330                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile); 
     331                System.err.println("@@@@ Got country code: " + countryCode); 
     332            } catch(Exception exceptObj) { 
     333                countryCode = ""; // forces domain to be included for inspection 
     334                 
     335                error("Could not check if domain " + domainWithProtocol 
     336                  + " was in country: " + countryCode, 
     337                  exceptObj); 
     338            } 
     339 
     340            boolean isInNZ = countryCode.toLowerCase().equals("nz"); 
     341             
     342 
     343            //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 
     344            if(!isInNZ) { 
    328345                possibleProductDomains.add(domainWithProtocol); 
    329                 // write both domain and URL out to file 
    330                 possibleProductSitesWriter.write(domainWithProtocol + "\n"); 
     346                // write both domain and a sample URL on that site out to file 
     347                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");               
    331348                possibleProductSitesWriter.write("\t" + url + "\n"); 
    332349            } 
    333             } else { 
    334             // already wrote out domain to file, write just the URL out to file 
     350            } /*else { 
     351            // already wrote out domain to file at some point, write just the URL out to file 
    335352            possibleProductSitesWriter.write("\t" + url + "\n"); 
    336             } 
     353            }*/ 
    337354        } 
    338355        } 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33603 r33604  
    5454     * version I'm using: https://github.com/maxmind/geoip-api-java 
    5555     * Newer version: https://maxmind.github.io/GeoIP2-java/ 
     56     * 
     57     * @return 2 letter countrycode in uppercase or an exception 
    5658     */ 
     59    public static String getCountryCodeOfDomain(String domainWithProtocol, File geoLiteCityDatFile) 
     60    throws Exception 
     61    {    
     62    int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 
     63    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     64    String domain = domainWithProtocol.substring(startIndex);    
     65     
     66    // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP 
     67    LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE); 
     68         
     69    // get IP for domain 
     70    InetAddress inetAddress = InetAddress.getByName(domain); 
     71    String ipAddress = inetAddress.getHostAddress(); 
     72     
     73    // get location object for IP 
     74    Location location = cl.getLocation(ipAddress); 
     75     
     76    if(location == null) { 
     77        throw new Exception("@@@@ No location info in DB for: " + domain); 
     78    } else { 
     79        return location.countryCode; 
     80    } 
     81     
     82    } 
     83     
    5784    public static boolean isDomainInCountry(String domainWithProtocol, 
    5885                        String countryCode, File geoLiteCityDatFile)