Changeset 33808

Show
Ignore:
Timestamp:
17.12.2019 19:31:28 (5 weeks ago)
Author:
ak19
Message:

Storing not just whether a /mi(/) suffix is in the URL path, but also whether the URL starts with the http(s)://mi. prefix, as storing both of these can help reduce the number of auto-translated sites in a similar way.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33801 r33808  
    7474     
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */ 
    76     private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) in its URL */ 
     76    private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */ 
     77    private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */ 
    7778     
    7879    private String domainOfSite; 
     
    191192        // contains /mi(/) in its URL 
    192193        String url = page.getPageURL(); 
    193         if(!this.urlContainsLangCodeInPath && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    194             this.urlContainsLangCodeInPath = true; 
     194        if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     195            this.urlContainsLangCodeInPathSuffix = true; 
     196        } 
     197        if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
     198            this.urlContainsLangCodeInPathPrefix = true; 
    195199        } 
    196200        } 
     
    328332          this.numPagesInMRI, this.numPagesContainingMRI, 
    329333          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 
    330           this.geoLocationCountryCode, this.urlContainsLangCodeInPath); 
     334          this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix); 
    331335 
    332336    //mongodbAccess.insertWebsiteInfo(website); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33801 r33808  
    2121     
    2222    public final String geoLocationCountryCode; 
    23     public final boolean urlContainsLangCodeInPath; 
     23    public final boolean urlContainsLangCodeInPathSuffix; 
     24    public final boolean urlContainsLangCodeInPathPrefix; 
    2425     
    2526    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 
     
    2728               int numPagesInMRI, int numPagesContainingMRI, 
    2829               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 
    29                String geoLocationCountryCode, boolean urlContainsLangCodeInPath) 
     30               String geoLocationCountryCode, boolean urlContainsLangCodeInPathSuffix, boolean urlContainsLangCodeInPathPrefix) 
    3031    { 
    3132    //this.id = siteCount; 
     
    4445     
    4546    this.geoLocationCountryCode = geoLocationCountryCode; 
    46     this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 
     47    this.urlContainsLangCodeInPathSuffix = urlContainsLangCodeInPathSuffix; 
     48    this.urlContainsLangCodeInPathPrefix = urlContainsLangCodeInPathPrefix; 
    4749    } 
    4850}