Changeset 33480

Show
Ignore:
Timestamp:
16.09.2019 19:45:01 (4 weeks ago)
Author:
ak19
Message:

It is much harder to remove pages where words are fused together: some fused words are shorter than the maximum valid word length of 15 chars while others are longer, and the number of valid words can still come to more than the required minimum of 20. The next solution was to ignore pages that had more than 2 instances of camelcase, but valid pages (actual Maori language pages) may end up with a few more camelcased words if navigation items get fused together. Not sure what to do.

Location:
gs3-extensions/maori-lang-detection
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33467 r33480  
    1515WETprocessor.min.content.length.wrapped.line=500 
    1616WETprocessor.min.spaces.per.wrapped.line=10 
     17 
     18# Arbitrary cutoff values for WETProcessor.java 
     19# for determining whether a WET record has sufficient and sensible content 
     20WETprocessor.max.word.length=15 
     21WETprocessor.min.num.words=20 
     22WETprocessor.max.words.camelcase=10 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33471 r33480  
    3131    // In Java, can initialize static final variables inside a static block 
    3232    // But the unavoidable try/catch in this static block prevents initialization of 
    33     // the static final int variables further below that therefore need to be declared 
    34     // and initialized thereafter. 
     33    // the static final int variables (seen further below) inside the block itself, 
     34    // that therefore need to be declared and initialized thereafter. 
    3535    static {     
    36      
    3736    // load up the properties from the config file 
    3837    try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 
     
    5049    // Providing fall-back cutoff values if config.properties doesn't load 
    5150    // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 
     51    /* 
    5252    private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 
    5353    private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 
    5454    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 
    5555    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 
    56  
     56    */ 
     57    private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces) 
     58    private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20")); 
     59    private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10")); 
     60     
    5761    // File paths shared across WETProcessor instances 
    5862    private static File discardFolder; 
     
    202206 
    203207    // want to match "product(s)" but not "production" 
    204      
    205208    //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) { 
     209     
     210 
    206211    if(recordURI.contains("product") && !recordURI.contains("production")) { 
    207212 
     
    209214        // These curiously often tend to have "product(s)" in the URL 
    210215        parentFolder = WETProcessor.discardFolder; 
    211     }    
     216    } 
     217    /* 
    212218    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 
    213219        parentFolder = WETProcessor.keepFolder; 
     
    224230        } 
    225231    } 
    226  
     232    */ 
     233 
     234    else { 
     235        // If a web page's WET record contains a certain minimum number of words, 
     236        // we will think it's a meaningful web page and has sufficient content for text analysis 
     237        // to have been successful. Cut off values at present are: 
     238        // - a minimum of 20 words 
     239        // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point 
     240        // to words having been glued together. This is used by irrelevant sites and moreover 
     241        // can't be analysed for language, so may not be actually MRI. 
     242         
     243        // Though StringTokenizer still in use, as seen in discussion at 
     244        // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated 
     245        // String.split(regex) seems better for splitting on general whitespace 
     246        String[] allWords = record.split("\\s"); 
     247        int validWordCount = 0; 
     248        int numCamelCaseWords = 0; 
     249        for(int i = 0; i < allWords.length; i++) { 
     250        String word = allWords[i]; 
     251 
     252        // throw away if n words contain camelcase, which is another case of words glued together 
     253        if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) { 
     254            numCamelCaseWords++;             
     255        } 
     256         
     257        // In Maori, word length of 1 is not uncommon 
     258        // but let's skip camelcased words when counting valid words 
     259        else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++; 
     260        } 
     261 
     262        // dump if too many camelcase words (ideally keep none of that kind?) 
     263        if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) { 
     264        parentFolder = WETProcessor.discardFolder; 
     265        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 
     266        } 
     267        else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
     268        parentFolder = WETProcessor.keepFolder; 
     269        System.err.println("@@@KEEPING"); 
     270        } 
     271    } 
    227272    // if parentFolder still not set, set to discard pile folder 
    228273    if(parentFolder == null) {