Ignore:
Timestamp:
2019-10-24T23:22:30+13:00 (5 years ago)
Author:
ak19
Message:
  1. Better output into possible-product-sites.txt: each overseas domain is now prefixed with its country code, to help decide whether the site is worth keeping or not. 2. Updated the whitelisting and top-sites filters to grab the /mi/ subsections of sites that don't appear to be auto-translated. This is done in preparation for blocking out product sites hereafter.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33603 r33604  
    324324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
    325325            if(!possibleProductDomains.contains(domainWithProtocol)) {
    326             // more expensive test, so do this only if above conditions are true:
    327             if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     326
     327            String countryCode = "";
     328            try {
     329                // more expensive test, so do this only if above conditions are true:
     330                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
     331                System.err.println("@@@@ Got country code: " + countryCode);
     332            } catch(Exception exceptObj) {
     333                countryCode = ""; // forces domain to be included for inspection
     334               
     335                error("Could not check if domain " + domainWithProtocol
     336                  + " was in country: " + countryCode,
     337                  exceptObj);
     338            }
     339
     340            boolean isInNZ = countryCode.toLowerCase().equals("nz");
     341           
     342
     343            //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     344            if(!isInNZ) {
    328345                possibleProductDomains.add(domainWithProtocol);
    329                 // write both domain and URL out to file
    330                 possibleProductSitesWriter.write(domainWithProtocol + "\n");
     346                // write both domain and a sample URL on that site out to file
     347                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
    331348                possibleProductSitesWriter.write("\t" + url + "\n");
    332349            }
    333             } else {
    334             // already wrote out domain to file, write just the URL out to file
     350            } /*else {
     351            // already wrote out domain to file at some point, write just the URL out to file
    335352            possibleProductSitesWriter.write("\t" + url + "\n");
    336             }
     353            }*/
    337354        }
    338355        }
Note: See TracChangeset for help on using the changeset viewer.