Changeset 31779

Show
Ignore:
Timestamp:
07.07.2017 16:11:22 (3 weeks ago)
Author:
davidb
Message:

Change in how POS words are checked against the Whitelist. Previously, words were case-folded before being checked against the Whitelist; however, this could lead to words not being included if they appear only in capitalized form in the text (as in Sherlock) and never in lowercase (sherlock). The change addresses this issue by mapping a POS word to lowercase only after the word -- left in its native form -- has been checked against the Whitelist, which also operates on POS words in their native form.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

    r31677 r31779  
    339339    } 
    340340     
     341    protected static ArrayList<String> lowerCaseTerms(String word) 
     342    { 
                /*
                 * Runs 'word' through a StandardTokenizer wrapped in a LowerCaseFilter and
                 * returns the resulting terms, in the order the token stream produced them.
                 * A single input string can therefore yield zero, one or several terms.
                 *
                 * If the token stream throws an IOException, the stack trace is printed and
                 * whatever terms were collected before the failure are still returned.
                 */
     343        ArrayList<String> words_out = new ArrayList<String>(); 
     344         
     345        Reader reader = new StringReader(word); 
     346         
     347        Tokenizer tokenizer = new StandardTokenizer();           
     348        tokenizer.setReader(reader); 
                // The attribute is registered on the tokenizer but read while iterating the
                // filter below -- presumably the filter shares the tokenizer's attributes,
                // so this shows the lower-cased term (TODO confirm against the Lucene docs)
     349        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 
     350 
     351        TokenStream token_stream = new LowerCaseFilter(tokenizer); 
     352         
     353        try { 
     354            token_stream.reset(); 
     355 
     356            while (token_stream.incrementToken()) { 
     357                String term = charTermAttribute.toString(); 
     358                 
     359                words_out.add(term); 
     360            } 
     361 
                    // NOTE(review): end() and close() sit inside the try block, so they are
                    // skipped if reset() or incrementToken() throws; the stream is then
                    // left unclosed
     362            token_stream.end(); 
     363            token_stream.close(); 
     364        }  
     365        catch (IOException e) { 
                    // Reported only; the caller is not told that tokenizing failed
     366            e.printStackTrace(); 
     367        }        
     368         
     369        return words_out; 
     370    } 
     371     
    341372    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in, 
    342373                                                                         WhitelistBloomFilter whitelist_bloomfilter) 
    343374    { 
    344         ArrayList<POSString> words_out = new ArrayList<POSString>(); 
     375        ArrayList<POSString> pos_words_out = new ArrayList<POSString>(); 
    345376 
    346377        for (POSString pos_word: words_in) { 
    347378            String word = pos_word.getString(); 
     379            String[] pos_tags = pos_word.getPOSTags(); 
     380             
    348381            if (whitelist_bloomfilter.contains(word)) { 
    349                 words_out.add(pos_word); 
    350             } 
    351         } 
    352          
    353         return words_out; 
     382                             
     383                ArrayList<String> word_terms = lowerCaseTerms(word); 
     384                for (String term: word_terms) { 
     385                    POSString pos_term = new POSString(term, pos_tags); 
     386                     
     387                    pos_words_out.add(pos_term); 
     388                } 
     389                 
     390                // The old, direct way of adding the value in 
     391                //pos_words_out.add(pos_word); 
     392            } 
     393            else { 
     394                // else clause won't happen so often  
     395                //   (has to be an 'obscure' word *not* be in the whitelist to get here) 
     396                // break down the word into terms, and see if any of them are in the whitelist instead 
     397                 
     398                ArrayList<String> word_terms = lowerCaseTerms(word); 
     399                for (String term: word_terms) { 
     400                     
     401                    if (whitelist_bloomfilter.contains(term)) { 
     402                        POSString pos_term = new POSString(term, pos_tags); 
     403                         
     404                        pos_words_out.add(pos_term); 
     405                    } 
     406                } 
     407                 
     408             
     409            } 
     410        } 
     411         
     412        return pos_words_out; 
    354413    } 
    355414     
     
    435494    { 
    436495        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize); 
    437         ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 
     496        //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 
    438497         
    439498        ArrayList<POSString> tokens = null; 
    440499        if (whitelist_bloomfilter != null) { 
    441             tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter); 
    442         } 
    443         else { 
     500            tokens =  getTokenPosCountWordsMapWhitelist(cs_tokens,whitelist_bloomfilter); 
     501            //tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter); 
     502        } 
     503        else { 
     504            ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 
    444505            tokens = lc_tokens; 
    445506        }