Changeset 33480


Ignore:
Timestamp:
2019-09-16T19:45:01+12:00 (5 years ago)
Author:
ak19
Message:

It is much harder to remove pages where words are fused together: some fused words are shorter than the maximum valid word length of 15 chars while others are longer, and the number of valid words on such pages can still come to more than the required minimum of 20. The next solution tried was to ignore pages that had more than 2 instances of camelcase, but valid pages (actual Maori language pages) may end up with a few more camelcased words if navigation items get fused together. Not sure what to do.

Location:
gs3-extensions/maori-lang-detection
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33467 r33480  
    1515WETprocessor.min.content.length.wrapped.line=500
    1616WETprocessor.min.spaces.per.wrapped.line=10
     17
     18# Arbitrary cutoff values for WETProcessor.java
     19# for determining whether a WET record has sufficient and sensible content
     20WETprocessor.max.word.length=15
     21WETprocessor.min.num.words=20
     22WETprocessor.max.words.camelcase=10
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33471 r33480  
    3131    // In Java, can initialize static final variables inside a static block
    3232    // But the unavoidable try/catch in this static block prevents initialization of
    33     // the static final int variables further below that therefore need to be declared
    34     // and initialized thereafter.
     33    // the static final int variables (seen further below) inside the block itself,
     34    // that therefore need to be declared and initialized thereafter.
    3535    static {   
    36    
    3736    // load up the properties from the config file
    3837    try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
     
    5049    // Providing fall-back cutoff values if config.properties doesn't load
    5150    // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
     51    /*
    5252    private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
    5353    private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
    5454    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
    5555    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
    56 
     56    */
     57    private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
     58    private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
     59    private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
     60   
    5761    // File paths shared across WETProcessor instances
    5862    private static File discardFolder;
     
    202206
    203207    // want to match "product(s)" but not "production"
    204    
    205208    //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
     209   
     210
    206211    if(recordURI.contains("product") && !recordURI.contains("production")) {
    207212
     
    209214        // These curiously often tend to have "product(s)" in the URL
    210215        parentFolder = WETProcessor.discardFolder;
    211     }   
     216    }
     217    /*
    212218    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
    213219        parentFolder = WETProcessor.keepFolder;
     
    224230        }
    225231    }
    226 
     232    */
     233
     234    else {
     235        // If a web page's WET record contains a certain minimum number of words,
     236        // we will think it's a meaningful web page and has sufficient content for text analysis
     237        // to have been successful. Cut off values at present are:
     238        // - a minimum of 20 words
     239        // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
     240        // to words having been glued together. This is used by irrelevant sites and moreover
     241        // can't be analysed for language, so may not be actually MRI.
     242       
     243        // Though StringTokenizer still in use, as seen in discussion at
     244        // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
     245        // String.split(regex) seems better for splitting on general whitespace
     246        String[] allWords = record.split("\\s");
     247        int validWordCount = 0;
     248        int numCamelCaseWords = 0;
     249        for(int i = 0; i < allWords.length; i++) {
     250        String word = allWords[i];
     251
     252        // throw away if n words contain camelcase, which is another case of words glued together
     253        if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
     254            numCamelCaseWords++;           
     255        }
     256       
     257        // In Maori, word length of 1 is not uncommon
     258        // but let's skip camelcased words when counting valid words
     259        else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
     260        }
     261
     262        // dump if too many camelcase words (ideally keep none of that kind?)
     263        if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
     264        parentFolder = WETProcessor.discardFolder;
     265        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
     266        }
     267        else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
     268        parentFolder = WETProcessor.keepFolder;
     269        System.err.println("@@@KEEPING");
     270        }
     271    }
    227272    // if parentFolder still not set, set to discard pile folder
    228273    if(parentFolder == null) {
Note: See TracChangeset for help on using the changeset viewer.