Changeset 33810

Show
Ignore:
Timestamp:
17.12.2019 21:48:08 (5 weeks ago)
Author:
ak19
Message:

Bugfix: the "mi" language code in the URL path should be checked for on every page of a site, not just the last page. The loop's closing bracket was in the wrong place.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33808 r33810  
    5555    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName()); 
    5656 
    57     static boolean DEBUG_MODE = true; 
     57    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class 
    5858 
    5959    /** Counter for number of sites. 
     
    159159            //page.getText(); 
    160160            pages.add(page); 
     161            inspectPageURLPath(page); 
    161162            pageDump = null; 
    162163 
     
    176177        // can throw away any newlines between text start and end markers. 
    177178         
    178         prevLine = line;         
     179        prevLine = line; 
    179180        } 
    180181 
     
    187188        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 
    188189        pages.add(page); 
     190        inspectPageURLPath(page); 
    189191        pageDump = null; 
    190  
    191         // for every site, we just need to work out if any of its pages 
    192         // contains /mi(/) in its URL 
    193         String url = page.getPageURL(); 
    194         if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    195             this.urlContainsLangCodeInPathSuffix = true; 
    196         } 
    197         if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
    198             this.urlContainsLangCodeInPathPrefix = true; 
    199         } 
    200         } 
     192        }    
     193 
    201194         
    202195    } catch (IOException ioe) { 
     
    215208    } 
    216209     
    217      
     210    /*       
     211    // for every site, we just need to work out if any of its pages 
     212    // contains /mi(/) in its URL 
     213    for(TextDumpPage aPage : pages) { 
     214        String url = aPage.getPageURL(); 
     215        logger.debug("@@@@ pageURL: " + url); 
     216        if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     217        this.urlContainsLangCodeInPathSuffix = true; 
     218        logger.info("*********** URL CONTAINS SUFFIX"); 
     219        } 
     220        // And if any contains http(s)://mi. in its URL 
     221        if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
     222        this.urlContainsLangCodeInPathPrefix = true; 
     223        } 
     224    } 
     225    */ 
    218226    prepareSiteStats(mongodbAccess); 
    219227    } 
    220228 
     229    /** for every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL. 
     230      * This method is called on each page of a site as the page is created. */ 
     231    private void inspectPageURLPath(TextDumpPage page) { 
     232    String url = page.getPageURL(); 
     233    //logger.debug("@@@@ pageURL: " + url); 
     234     
     235    // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly 
     236    if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     237        this.urlContainsLangCodeInPathSuffix = true; 
     238    } 
     239    // And if any page contains http(s)://mi. in its URL, then set site level variable for this accordingly 
     240    if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) { 
     241        this.urlContainsLangCodeInPathPrefix = true; 
     242    } 
     243    } 
     244     
    221245     
    222246    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {