Ignore:
Timestamp:
2019-10-24T22:04:37+13:00 (4 years ago)
Author:
ak19
Message:

Incorporating Dr Nichols' suggestion to help weed out product sites: if the TLD of a seed URL containing /mi/ is outside NZ, add it to the list in possible-product-sites.txt. This should hopefully be a smaller number than all URLs containing /mi and, because these sites are located outside NZ, they are more likely to be product sites than not.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33582 r33603  
    275275     */
    276276    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
    277                     File domainURLsFile, File topSiteMatchesFile) {
     277                    File domainURLsFile, File topSiteMatchesFile,
     278                    File possibleProductSitesFile) {
    278279    // Maintain a Map of unique domains mapped to seed urls at that domain
    279280    // TreeSet: by default, "the elements are ordered using their natural ordering"
     
    286287    final String PROTOCOL_REGEX_PREFIX = "+^https?://";
    287288    final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
     289
     290    // keep an eye out on URLs we need to inspect later
     291    Set<String> possibleProductDomains = new TreeSet<String>();
     292    File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
    288293   
    289294    try (
    290295         BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
     296         BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
    291297         ) {
    292298       
     
    310316        }
    311317
     318        // Dr Nichols said that a url that was located outside the country and
     319        // which had /mi/ URLs was more likely to be an autotranslated (product) site.
     320        // Following Dr Nichols' idea, let's keep a look out for more product sites:
     321        // if any URL contains /mi AND the tld of its domain is outside of New Zealand
     322        // then add that domain (if not already added) and that url into a file
     323        // for later manual inspection
     324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
     325            if(!possibleProductDomains.contains(domainWithProtocol)) {
     326            // more expensive test, so do this only if above conditions are true:
     327            if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     328                possibleProductDomains.add(domainWithProtocol);
     329                // write both domain and URL out to file
     330                possibleProductSitesWriter.write(domainWithProtocol + "\n");
     331                possibleProductSitesWriter.write("\t" + url + "\n");
     332            }
     333            } else {
     334            // already wrote out domain to file, write just the URL out to file
     335            possibleProductSitesWriter.write("\t" + url + "\n");
     336            }
     337        }
    312338        }
    313339    } catch (IOException ioe) {
     
    334360         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
    335361         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
    336          BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
     362         BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));       
    337363         ) {
    338364
     
    427453            siteURLsWriter.write(url + "\n");
    428454            }
    429            
    430455           
    431456            if(allowedURLPatternRegex == null) { // entire site can be crawled
     
    887912    File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
    888913    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
    889    
    890     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
     914    File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
     915   
     916    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
    891917
    892918    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
Note: See TracChangeset for help on using the changeset viewer.