Changeset 33604


Ignore:
Timestamp:
2019-10-24T23:22:30+13:00 (5 years ago)
Author:
ak19
Message:
  1. Better output to possible-product-sites.txt: each listed domain is now prefixed with its (overseas) country code, to help decide whether the site is worth keeping. 2. Updated the whitelist and top-sites filters to grab the /mi/ subsections of sites that don't appear to be auto-translated. This is done in preparation for blocking product sites hereafter.
Location:
gs3-extensions/maori-lang-detection
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

    r33569 r33604  
    6161# special case
    6262mi.centr-zashity.ru,SINGLEPAGE
     63
     64martinvrijland.nl,martinvrijland.nl/mi/
     65csunplugged.org,csunplugged.org/mi/
     66gpedia.com,gpedia.com/mi/
    6367
    6468# TOP SITE BUT NOT TOP 500
  • gs3-extensions/maori-lang-detection/conf/url-whitelist-filter.txt

    r33569 r33604  
    22# whitelist overrides blacklist and greylist.
    33# FORMAT:
    4 # precede URL by ^ to greylist urls that match the given prefix
    5 # succeed URL by $ to greylist urls that match the given suffix
    6 # ^url$ will greylist urls that match the given url completely
    7 # Without either ^ or $ symbol, urls containing the given url will get greylisted
     4# precede URL by ^ to whitelist urls that match the given prefix
     5# succeed URL by $ to whitelist urls that match the given suffix
     6# ^url$ will whitelist urls that match the given url completely
     7# Without either ^ or $ symbol, urls containing the given url will get whitelisted
    88
    99# Special exception for this url on yale.edu, since we needed to blacklist
     
    1515http://www.krassotkin.ru/sites/prayer.su/maori/
    1616https://mi.centr-zashity.ru/
     17
     18
     19
     20# WHITELIST WEBSITES THAT HAVE NON-AUTOMATED /mi/ SUBSECTIONS
     21# WE CONTROL WHAT PART OF THEM WILL BE DOWNLOADED (THE /mi SUBSECTION)
     22# IN sites-too-big-to-exhaustively-crawl.txt
     23#https://www.martinvrijland.nl/mi/te-mana-hinengaro/Ko-te-nuinga-ake-o-nga-tangata-kei-te-timata-ki-te-kite-kei-te-noho-tatou-i-roto-i-te-whakaata-ko-te-aha-tenei/
     24#https://www.csunplugged.org/mi/principles/
     25#http://www.gpedia.com/mi/gpedia/Reo_M%C4%81ori
     26
     27https://www.martinvrijland.nl
     28https://www.csunplugged.org
     29http://www.gpedia.com
     30
     31
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33603 r33604  
    324324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
    325325            if(!possibleProductDomains.contains(domainWithProtocol)) {
    326             // more expensive test, so do this only if above conditions are true:
    327             if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     326
     327            String countryCode = "";
     328            try {
     329                // more expensive test, so do this only if above conditions are true:
     330                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
     331                System.err.println("@@@@ Got country code: " + countryCode);
     332            } catch(Exception exceptObj) {
     333                countryCode = ""; // forces domain to be included for inspection
     334               
     335                error("Could not check if domain " + domainWithProtocol
     336                  + " was in country: " + countryCode,
     337                  exceptObj);
     338            }
     339
     340            boolean isInNZ = countryCode.toLowerCase().equals("nz");
     341           
     342
     343            //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     344            if(!isInNZ) {
    328345                possibleProductDomains.add(domainWithProtocol);
    329                 // write both domain and URL out to file
    330                 possibleProductSitesWriter.write(domainWithProtocol + "\n");
     346                // write both domain and a sample URL on that site out to file
     347                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
    331348                possibleProductSitesWriter.write("\t" + url + "\n");
    332349            }
    333             } else {
    334             // already wrote out domain to file, write just the URL out to file
     350            } /*else {
     351            // already wrote out domain to file at some point, write just the URL out to file
    335352            possibleProductSitesWriter.write("\t" + url + "\n");
    336             }
     353            }*/
    337354        }
    338355        }
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33603 r33604  
    5454     * version I'm using: https://github.com/maxmind/geoip-api-java
    5555     * Newer version: https://maxmind.github.io/GeoIP2-java/
     56     *
     57     * @return the 2-letter country code, in uppercase, of the domain's location; an Exception is thrown if the lookup fails
    5658     */
     59    public static String getCountryCodeOfDomain(String domainWithProtocol, File geoLiteCityDatFile)
     60    throws Exception
     61    {   
           // Resolves the domain to an IP address and returns the countryCode field of the
           // Location that the GeoLiteCity database reports for that IP.
           // NOTE(review): everything after the protocol's double slash is used as the host name,
           // so this assumes domainWithProtocol carries no path or port - TODO confirm with callers.
     62    int startIndex = domainWithProtocol.indexOf("//"); // position of the // in an http:// or https:// prefix, -1 if there is none
     63    startIndex = (startIndex == -1) ? 0 : (startIndex+2); // no prefix: use the whole string; otherwise skip past the protocol's // portion
     64    String domain = domainWithProtocol.substring(startIndex);   
     65   
     66    // pass in the GeoLiteCity.dat file to be able to do the location lookup for domain's IP
           // NOTE(review): a new LookupService is constructed on every call; presumably
           // GEOIP_MEMORY_CACHE reads the database into memory each time - confirm against the
           // geoip-api-java documentation and consider sharing one instance.
     67    LookupService cl = new LookupService(geoLiteCityDatFile, LookupService.GEOIP_MEMORY_CACHE);
     68       
     69    // get IP for domain (InetAddress.getByName throws UnknownHostException if it cannot be resolved)
     70    InetAddress inetAddress = InetAddress.getByName(domain);
     71    String ipAddress = inetAddress.getHostAddress();
     72   
     73    // get location object for IP
     74    Location location = cl.getLocation(ipAddress);
     75   
           // a null location is reported to the caller as an exception rather than as an empty code
     76    if(location == null) {
     77        throw new Exception("@@@@ No location info in DB for: " + domain);
     78    } else {
     79        return location.countryCode;
     80    }
     81   
     82    }
     83   
    5784    public static boolean isDomainInCountry(String domainWithProtocol,
    5885                        String countryCode, File geoLiteCityDatFile)
Note: See TracChangeset for help on using the changeset viewer.