Changeset 31779


Ignore:
Timestamp:
07/07/17 16:11:22 (3 years ago)
Author:
davidb
Message:

Change in how POS words are checked against the Whitelist. Previously, words were case-folded before being checked against the Whitelist; however, this could lead to words not being included if they only appear in capitalized form (as in Sherlock) in the text, and never in lowercase (sherlock). The change addresses this issue by mapping to lowercase only after the POS word -- left in its native form -- has been checked against the Whitelist, which also operates on POS words in their native form.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

    r31677 r31779  
    339339    }
    340340   
     341    protected static ArrayList<String> lowerCaseTerms(String word)
     342    {
     343        ArrayList<String> words_out = new ArrayList<String>();
     344       
     345        Reader reader = new StringReader(word);
     346       
     347        Tokenizer tokenizer = new StandardTokenizer();         
     348        tokenizer.setReader(reader);
     349        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
     350
     351        TokenStream token_stream = new LowerCaseFilter(tokenizer);
     352       
     353        try {
     354            token_stream.reset();
     355
     356            while (token_stream.incrementToken()) {
     357                String term = charTermAttribute.toString();
     358               
     359                words_out.add(term);
     360            }
     361
     362            token_stream.end();
     363            token_stream.close();
     364        }
     365        catch (IOException e) {
     366            e.printStackTrace();
     367        }       
     368       
     369        return words_out;
     370    }
     371   
    341372    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
    342373                                                                         WhitelistBloomFilter whitelist_bloomfilter)
    343374    {
    344         ArrayList<POSString> words_out = new ArrayList<POSString>();
     375        ArrayList<POSString> pos_words_out = new ArrayList<POSString>();
    345376
    346377        for (POSString pos_word: words_in) {
    347378            String word = pos_word.getString();
     379            String[] pos_tags = pos_word.getPOSTags();
     380           
    348381            if (whitelist_bloomfilter.contains(word)) {
    349                 words_out.add(pos_word);
    350             }
    351         }
    352        
    353         return words_out;
     382                           
     383                ArrayList<String> word_terms = lowerCaseTerms(word);
     384                for (String term: word_terms) {
     385                    POSString pos_term = new POSString(term, pos_tags);
     386                   
     387                    pos_words_out.add(pos_term);
     388                }
     389               
     390                // The old, direct way of adding the value in
     391                //pos_words_out.add(pos_word);
     392            }
     393            else {
     394                // else clause won't happen so often
     395                //   (has to be an 'obscure' word *not* be in the whitelist to get here)
     396                // break down the word into terms, and see if any of them are in the whitelist instead
     397               
     398                ArrayList<String> word_terms = lowerCaseTerms(word);
     399                for (String term: word_terms) {
     400                   
     401                    if (whitelist_bloomfilter.contains(term)) {
     402                        POSString pos_term = new POSString(term, pos_tags);
     403                       
     404                        pos_words_out.add(pos_term);
     405                    }
     406                }
     407               
     408           
     409            }
     410        }
     411       
     412        return pos_words_out;
    354413    }
    355414   
     
    435494    {
    436495        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize);
    437         ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
     496        //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
    438497       
    439498        ArrayList<POSString> tokens = null;
    440499        if (whitelist_bloomfilter != null) {
    441             tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
    442         }
    443         else {
     500            tokens =  getTokenPosCountWordsMapWhitelist(cs_tokens,whitelist_bloomfilter);
     501            //tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
     502        }
     503        else {
     504            ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
    444505            tokens = lc_tokens;
    445506        }
Note: See TracChangeset for help on using the changeset viewer.