Context Navigation

← Previous Change
Next Change →

trunk

Timestamp:

2017-07-07T16:11:22+12:00 (7 years ago)

Author:

davidb

Message:

Change in how POS words are checked against the Whitelist. Previously, words were case-folded before being checked against the Whitelist; however, this could lead to words being left out if they appear only in capitalized form (as in Sherlock) in the text and never in lowercase (sherlock). The change addresses this issue by mapping to lowercase only after the POS word -- left in its native form -- has been checked against the Whitelist, which also operates with POS words in their native form.

File:

: 1 edited

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

-              r31677
+              r31779
+    }
+    /**
+     * Splits {@code word} into terms with a Lucene StandardTokenizer and lower-cases
+     * each term through a LowerCaseFilter.
+     *
+     * @param word the text to tokenize
+     * @return the lower-cased terms, in the order the token stream produced them; if an
+     *         IOException is raised while reading the stream, its stack trace is
+     *         printed and the terms collected up to that point are returned
+     */
+    protected static ArrayList<String> lowerCaseTerms(String word)
+    {
+        ArrayList<String> words_out = new ArrayList<String>();
+        Reader reader = new StringReader(word);
+        Tokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setReader(reader);
+        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
+        // try-with-resources guarantees close() runs even when reset(),
+        // incrementToken() or end() throws; previously close() was only reached
+        // on the success path. Closing the filter also closes the wrapped tokenizer.
+        try (TokenStream token_stream = new LowerCaseFilter(tokenizer)) {
+            token_stream.reset();
+            while (token_stream.incrementToken()) {
+                words_out.add(charTermAttribute.toString());
+            }
+            token_stream.end();
+        }
+        catch (IOException e) {
+            // Best-effort, as before: report the problem and return what was collected.
+            e.printStackTrace();
+        }
+        return words_out;
+    }
     /**
      * Filters POS-tagged words against the whitelist Bloom filter.
      *
      * NOTE(review): this span comes from a rendered changeset (r31677 -> r31779) and
      * appears to interleave lines of the old and the new method body; the leading '+'
      * markers do not reliably separate added from removed lines -- confirm against
      * the repository before treating it as compilable source.
      *
      * Following only the statements that use pos_words_out: each word is looked up in
      * the whitelist exactly as getString() returns it (no case-folding beforehand).
      * On a hit, the word is passed through lowerCaseTerms() and every resulting term
      * is emitted together with the word's POS tags. On a miss, the word is still
      * broken into lower-cased terms, but only terms that are themselves found in the
      * whitelist are emitted.
      *
      * @param words_in              POS-tagged words to filter
      * @param whitelist_bloomfilter whitelist, queried through contains()
      * @return the list to which the surviving POSString entries were appended
      */
     protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                          WhitelistBloomFilter whitelist_bloomfilter)
+    {
         // NOTE(review): words_out, and the add/return that use it further down, look
         // like the superseded r31677 body -- confirm.
         ArrayList<POSString> words_out = new ArrayList<POSString>();
+        ArrayList<POSString> pos_words_out = new ArrayList<POSString>();
         for (POSString pos_word: words_in) {
             String word = pos_word.getString();
+            String[] pos_tags = pos_word.getPOSTags();
             // Whitelist lookup uses the word as-is, before any lower-casing.
             if (whitelist_bloomfilter.contains(word)) {
+                words_out.add(pos_word);
+            }
+        }
+        return words_out;
                 // Whitelisted word: emit each of its lower-cased terms with the original POS tags.
+                ArrayList<String> word_terms = lowerCaseTerms(word);
+                for (String term: word_terms) {
+                    POSString pos_term = new POSString(term, pos_tags);
+                    pos_words_out.add(pos_term);
+                }
+                // The old, direct way of adding the value in
+                //pos_words_out.add(pos_word);
+            }
+            else {
+                // else clause won't happen so often
+                //   (has to be an 'obscure' word *not* be in the whitelist to get here)
+                // break down the word into terms, and see if any of them are in the whitelist instead
+                ArrayList<String> word_terms = lowerCaseTerms(word);
+                for (String term: word_terms) {
                     // Here the lookup is done on the lower-cased term, not on the original word.
+                    if (whitelist_bloomfilter.contains(term)) {
+                        POSString pos_term = new POSString(term, pos_tags);
+                        pos_words_out.add(pos_term);
+                    }
+                }
+            }
+        }
+        return pos_words_out;
+    }
 …
+    {
         ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize);
         ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
+        //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
         ArrayList<POSString> tokens = null;
         if (whitelist_bloomfilter != null) {
+            tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
+        }
+        else {
+            tokens =  getTokenPosCountWordsMapWhitelist(cs_tokens,whitelist_bloomfilter);
+            //tokens =  getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
+        }
+        else {
+            ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
             tokens = lc_tokens;
+        }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 31779 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk

Legend:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

Download in other formats: