Changeset 33810


Ignore:
Timestamp:
2019-12-17T21:48:08+13:00 (4 years ago)
Author:
ak19
Message:

Bugfix: "mi" in the URL path should be checked for on every page of a site, not just the last page. The loop's closing bracket was in the wrong place.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33808 r33810  
    5555    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToMongoDB.class.getName());
    5656
    57     static boolean DEBUG_MODE = true;
     57    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class
    5858
    5959    /** Counter for number of sites.
     
    159159            //page.getText();
    160160            pages.add(page);
     161            inspectPageURLPath(page);
    161162            pageDump = null;
    162163
     
    176177        // can throw away any newlines between text start and end markers.
    177178       
    178         prevLine = line;       
     179        prevLine = line;
    179180        }
    180181
     
    187188        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
    188189        pages.add(page);
     190        inspectPageURLPath(page);
    189191        pageDump = null;
    190 
    191         // for every site, we just need to work out if any of its pages
    192         // contains /mi(/) in its URL
    193         String url = page.getPageURL();
    194         if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
    195             this.urlContainsLangCodeInPathSuffix = true;
    196         }
    197         if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
    198             this.urlContainsLangCodeInPathPrefix = true;
    199         }
    200         }
     192        }   
     193
    201194       
    202195    } catch (IOException ioe) {
     
    215208    }
    216209   
    217    
     210    /*     
     211    // for every site, we just need to work out if any of its pages
     212    // contains /mi(/) in its URL
     213    for(TextDumpPage aPage : pages) {
     214        String url = aPage.getPageURL();
     215        logger.debug("@@@@ pageURL: " + url);
     216        if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
     217        this.urlContainsLangCodeInPathSuffix = true;
     218        logger.info("*********** URL CONTAINS SUFFIX");
     219        }
     220        // And if any contains http(s)://mi. in its URL
     221        if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
     222        this.urlContainsLangCodeInPathPrefix = true;
     223        }
     224    }
     225    */
    218226    prepareSiteStats(mongodbAccess);
    219227    }
    220228
     229    /** for every site, we just need to work out if any of its pages contains /mi(/) or http(s)://mi. in its URL.
     230      * This method is called on each page of a site as the page is created. */
     231    private void inspectPageURLPath(TextDumpPage page) {
     232    String url = page.getPageURL();
     233    //logger.debug("@@@@ pageURL: " + url);
     234   
     235    // check if each page in site contains /mi(/) in URL, and if so set a site-level variable accordingly
     236    if(!this.urlContainsLangCodeInPathSuffix && (url.contains("/mi/") || url.endsWith("/mi"))) {
     237        this.urlContainsLangCodeInPathSuffix = true;
     238    }
     239    // And if any page contains http(s)://mi. in its URL, then set site level variable for this accordingly
     240    if(!this.urlContainsLangCodeInPathPrefix && (url.startsWith("https://mi.") || url.startsWith("http://mi."))) {
     241        this.urlContainsLangCodeInPathPrefix = true;
     242    }
     243    }
     244   
    221245   
    222246    private void prepareSiteStats(MongoDBAccess mongodbAccess) throws IOException {
Note: See TracChangeset for help on using the changeset viewer.