Changeset 33811


Ignore:
Timestamp:
2019-12-18T16:51:34+13:00 (4 years ago)
Author:
ak19
Message:

Returning to using a single variable, urlContainsLangCodeInPath, to record whether any page on a site contains either /mi(/) or http(s)://mi.* in its URL.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33810 r33811  
    7474   
    7575    private String geoLocationCountryCode = null; /** 2 letter country code */
    76     private boolean urlContainsLangCodeInPathSuffix = false; /** If any URL on this site contains a /mi(/) in its URL */
    77     private boolean urlContainsLangCodeInPathPrefix = false; /** If any URL on this site contains a http(s)://mi.* in its URL */
     76    private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */
    7877   
    7978    private String domainOfSite;
     
    208207    }
    209208   
    210     /*     
    211     // for every site, we just need to work out if any of its pages
    212     // contains /mi(/) in its URL
     209    /* No need to loop again through all pages. Instead, just inspectPageURLPath() as each page is created above.
     210    // For any site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi.* in its URL path
    213211    for(TextDumpPage aPage : pages) {
    214         String url = aPage.getPageURL();
    215         logger.debug("@@@@ pageURL: " + url);
    216         if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
    217         this.urlContainsLangCodeInPathSuffix = true;
    218         logger.info("*********** URL CONTAINS SUFFIX");
    219         }
    220         // And if any contains http(s)://mi. in its URL
    221         if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
    222         this.urlContainsLangCodeInPathPrefix = true;
    223         }
     212        inspectPageURLPath(aPage);
    224213    }
    225214    */
     
    233222    //logger.debug("@@@@ pageURL: " + url);
    234223   
    235     // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly
    236     if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
    237         this.urlContainsLangCodeInPathSuffix = true;
    238     }
    239     // And if any page contains http(s)://mi. in its URL, then set site level variable for this accordingly
    240     if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
    241         this.urlContainsLangCodeInPathPrefix = true;
     224    if(!this.urlContainsLangCodeInPath) { // if not already set to true for any previous page in this site,
     225        // check if this page of the site contains /mi(/) or http(s)://mi in its URL path
     226        if(url.contains("/mi/") || url.endsWith("/mi") || url.startsWith("https://mi.") || url.startsWith("http://mi.")) {
     227        this.urlContainsLangCodeInPath = true;
     228        }
    242229    }
    243230    }
     
    356343          this.numPagesInMRI, this.numPagesContainingMRI,
    357344          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
    358           this.geoLocationCountryCode, this.urlContainsLangCodeInPathSuffix, this.urlContainsLangCodeInPathPrefix);
     345          this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
    359346
    360347    //mongodbAccess.insertWebsiteInfo(website);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33808 r33811  
    2121   
    2222    public final String geoLocationCountryCode;
    23     public final boolean urlContainsLangCodeInPathSuffix;
    24     public final boolean urlContainsLangCodeInPathPrefix;
     23    public final boolean urlContainsLangCodeInPath;
    2524   
    2625    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite,
     
    2827               int numPagesInMRI, int numPagesContainingMRI,
    2928               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl,
    30                String geoLocationCountryCode, boolean urlContainsLangCodeInPathSuffix, boolean urlContainsLangCodeInPathPrefix)
     29               String geoLocationCountryCode, boolean urlContainsLangCodeInPath)
    3130    {
    3231    //this.id = siteCount;
     
    4544   
    4645    this.geoLocationCountryCode = geoLocationCountryCode;
    47     this.urlContainsLangCodeInPathSuffix = urlContainsLangCodeInPathSuffix;
    48     this.urlContainsLangCodeInPathPrefix = urlContainsLangCodeInPathPrefix;
     46    this.urlContainsLangCodeInPath = urlContainsLangCodeInPath;
    4947    }
    5048}
Note: See TracChangeset for help on using the changeset viewer.