Changeset 33811

Show
Ignore:
Timestamp:
18.12.2019 16:51:34 (5 weeks ago)
Author:
ak19
Message:

Returning to using a single variable, urlContainsLangCodeInPath, to record whether any page on a site either contains /mi(/) in its URL path or has a URL starting with http(s)://mi.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33810 r33811  
    7474     
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */ 
    76     private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */ 
    77     private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */ 
     76    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */ 
    7877     
    7978    private String domainOfSite; 
     
    208207    } 
    209208     
    210     /*       
    211     // for every site, we just need to work out if any of its pages 
    212     // contains /mi(/) in its URL 
     209    /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above. 
     210    // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path 
    213211    for(TextDumpPage aPage : pages) { 
    214         String url = aPage.getPageURL(); 
    215         logger.debug("@@@@ pageURL: " + url); 
    216         if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    217         this.urlContainsLangCodeInPathSuffix = true; 
    218         logger.info("*********** URL CONTAINS SUFFIX"); 
    219         } 
    220         // And if any contains http(s)://mi. in its URL 
    221         if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
    222         this.urlContainsLangCodeInPathPrefix = true; 
    223         } 
     212        inspectPageURLPath(aPage); 
    224213    } 
    225214    */ 
     
    233222    //logger.debug("@@@@ pageURL: " + url); 
    234223     
    235     // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly 
    236     if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    237         this.urlContainsLangCodeInPathSuffix = true; 
    238     } 
    239     // And if any page contains http(s)://mi. in its URL, then set site level variable for this accordingly 
    240     if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
    241         this.urlContainsLangCodeInPathPrefix = true; 
     224    if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site, 
     225        // check if this page of the site contains /mi(/) or http(s)://mi in its URL path 
     226        if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) { 
     227        this.urlContainsLangCodeInPath = true; 
     228        } 
    242229    } 
    243230    } 
     
    356343          this.numPagesInMRI, this.numPagesContainingMRI, 
    357344          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl, 
    358           this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix); 
     345          this.geoLocationCountryCode, this.urlContainsLangCodeInPath); 
    359346 
    360347    //mongodbAccess.insertWebsiteInfo(website); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33808 r33811  
    2121     
    2222    public final String geoLocationCountryCode; 
    23     public final boolean urlContainsLangCodeInPathSuffix; 
    24     public final boolean urlContainsLangCodeInPathPrefix; 
     23    public final boolean urlContainsLangCodeInPath; 
    2524     
    2625    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 
     
    2827               int numPagesInMRI, int numPagesContainingMRI, 
    2928               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 
    30                String geoLocationCountryCode, boolean urlContainsLangCodeInPathSuffix, boolean urlContainsLangCodeInPathPrefix) 
     29               String geoLocationCountryCode, boolean urlContainsLangCodeInPath) 
    3130    { 
    3231    //this.id = siteCount; 
     
    4544     
    4645    this.geoLocationCountryCode = geoLocationCountryCode; 
    47     this.urlContainsLangCodeInPathSuffix = urlContainsLangCodeInPathSuffix; 
    48     this.urlContainsLangCodeInPathPrefix = urlContainsLangCodeInPathPrefix; 
     46    this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 
    4947    } 
    5048}