Ignore:
Timestamp:
2019-12-17T19:31:28+13:00 (4 years ago)
Author:
ak19
Message:

Storing not just whether the /mi(/) suffix is in the path, but also whether the http(s)://mi. prefix is in the URL, as storing these can similarly help reduce the number of auto-translated sites.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33801 r33808  
    7474   
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */
    76     private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */
     76    private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */
     77    private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */
    7778   
    7879    private String domainOfSite;
     
    191192        // contains /mi(/) in its URL
    192193        String url = page.getPageURL();
    193         if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) {
    194             this.urlContainsLangCodeInPath = true;
     194        if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
     195            this.urlContainsLangCodeInPathSuffix = true;
     196        }
     197        if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
     198            this.urlContainsLangCodeInPathPrefix = true;
    195199        }
    196200        }
     
    328332          this.numPagesInMRI, this.numPagesContainingMRI,
    329333          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
    330           this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
     334          this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix);
    331335
    332336    //mongodbAccess.insertWebsiteInfo(website);
Note: See TracChangeset for help on using the changeset viewer.