Context Navigation

← Previous Change
Next Change →

solr-ingest

Timestamp:

2017-01-05T23:09:29+13:00 (7 years ago)

Author:

davidb

Message:

Code moved to store fields for multilingual use using dynamic Solr fields *_htrctoken. Text is now also put in as separate tokens

Location:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures

Files:

: 2 edited

PerVolumeWordStreamFlatmap.java (modified) (1 diff)
SolrDocJSON.java (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java

r31252	r31273
90	90	if (ef_page != null) {
91	91
92		ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page, _icu_tokenize);
	92	ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountWhitelistText(volume_id, page_id, ef_page, _icu_tokenize);
93	93	all_word_list.addAll(page_word_list);
94	94	}

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

-              r31260
+              r31273
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 …
 public class SolrDocJSON {
     protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                              boolean icu_tokenize)
 …
+    }
+    protected static ArrayList<String> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
+            boolean icu_tokenize)
+    {
+        ArrayList<String> words = new ArrayList<String>();
+        if (ef_token_pos_count != null) {
+            Iterator<String> word_token_iter = ef_token_pos_count.keys();
+            while (word_token_iter.hasNext()) {
+                String word_token = word_token_iter.next();
+                if (icu_tokenize == true) {
+                    Reader reader = new StringReader(word_token);
+                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
+                    icu_tokenizer.setReader(reader);
+                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
+                    TokenStream token_stream = icu_tokenizer;
+                    try {
+                        token_stream.reset();
+                        while (token_stream.incrementToken()) {
+                            String term = charTermAttribute.toString();
+                            words.add(term);
+                        }
+                        token_stream.end();
+                        token_stream.close();
+                    }
+                    catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                }
+                else {
+                    words.add(word_token);
+                }
+            }
+        }
+        else {
+            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
+        }
+        return words;
+    }
+    /**
+     * Produces a lower-cased version of a word list.
+     *
+     * Each input word is passed through a {@link StandardTokenizer} wrapped in a
+     * {@link LowerCaseFilter}, and every term the stream emits is appended to the
+     * result. The output is therefore not guaranteed to have the same length as the
+     * input: a single input word contributes however many terms the tokenizer yields.
+     *
+     * @param words_in the words to normalise
+     * @return a new list holding the lower-cased terms, in input order
+     */
+    protected static ArrayList<String> getTokenPosCountWordsMapCaseInsensitive(ArrayList<String> words_in)
+    {
+        ArrayList<String> words_out = new ArrayList<String>();
+
+        for (String word: words_in) {
+            Tokenizer tokenizer = new StandardTokenizer();
+            tokenizer.setReader(new StringReader(word));
+            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
+
+            // Closing the filter also closes the tokenizer it wraps. try-with-resources
+            // makes that happen on the exception path too, which the previous
+            // success-only close() did not.
+            try (TokenStream token_stream = new LowerCaseFilter(tokenizer)) {
+                token_stream.reset();
+                while (token_stream.incrementToken()) {
+                    words_out.add(charTermAttribute.toString());
+                }
+                token_stream.end();
+            }
+            catch (IOException e) {
+                // Best effort: report the failure and carry on with the remaining words.
+                e.printStackTrace();
+            }
+        }
+
+        return words_out;
+    }
+    protected static ArrayList<String> getTokenPosCountWordsMapWhitelist(ArrayList<String> words_in,
+                                                                         WhitelistBloomFilter whitelist_bloomfilter)
+    {
+        ArrayList<String> words_out = new ArrayList<String>();
+        for (String word: words_in) {
+            if (whitelist_bloomfilter.contains(word)) {
+                words_out.add(word);
+            }
+        }
+        return words_out;
+    }
     protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
+    {
 …
         StringBuilder sb = new StringBuilder();
         if (whitelist_bloomfilter == null) {
 …
         return sb.toString();
+    }
+    /**
+     * Builds the list of text tokens for a page: extracts the words from the
+     * tokenPosCount object, lower-cases them, and, when a whitelist is supplied,
+     * drops every word the whitelist does not contain.
+     *
+     * @param ef_token_pos_count    the page's tokenPosCount object
+     * @param page_id               page identifier, forwarded to the word extraction step
+     * @param whitelist_bloomfilter optional whitelist; null means no whitelist filtering
+     * @param icu_tokenize          whether word extraction should apply ICU tokenization
+     * @return the lower-cased (and, if applicable, whitelisted) tokens
+     */
+    protected static ArrayList<String> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
+                                               WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
+    {
+        ArrayList<String> lowercased = getTokenPosCountWordsMapCaseInsensitive(
+                getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize));
+
+        // No whitelist: every lower-cased token is kept.
+        if (whitelist_bloomfilter == null) {
+            return lowercased;
+        }
+
+        return getTokenPosCountWordsMapWhitelist(lowercased, whitelist_bloomfilter);
+    }
+    /**
+     * Adds the page's tokens to the Solr document under one dynamic field per
+     * language listed on the page. The field name is the language label followed
+     * by {@code _htrctoken} (for example {@code ko_htrctoken}).
+     *
+     * Every language field receives the full token list as a JSON array. The
+     * previous implementation called {@code put} once per token, so each call
+     * replaced the one before and only the last token was left in the field.
+     *
+     * Nothing is added when the page has no "languages" entry or when
+     * {@code text_al} is empty.
+     *
+     * NOTE(review): relies on org.json-style {@code optJSONArray} and the
+     * {@code JSONArray(Collection)} constructor; confirm against the JSON
+     * library this file imports.
+     *
+     * @param ef_page       the extracted-features page object, read for its "languages" array
+     * @param text_al       the tokens to store
+     * @param solr_doc_json the Solr document being built; modified in place
+     */
+    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<String> text_al,
+                                                    JSONObject solr_doc_json)
+    {
+        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
+        // optJSONArray yields null for a missing key, so the guard below is reachable.
+        JSONArray ef_languages = ef_page.optJSONArray("languages");
+        if (ef_languages == null || text_al.isEmpty()) {
+            return;
+        }
+
+        // Collect every language label; a growable list avoids the null slots a
+        // fixed-size array would keep for records that carry no key.
+        ArrayList<String> lang_fields = new ArrayList<String>();
+        int lang_len = ef_languages.length();
+        for (int i=0; i<lang_len; i++) {
+            JSONObject lang_rec = ef_languages.getJSONObject(i);
+            Iterator<String> lang_key_iter = lang_rec.keys();
+            while (lang_key_iter.hasNext()) {
+                String lang_label = lang_key_iter.next();
+                lang_fields.add(lang_label + "_htrctoken");
+            }
+        }
+
+        for (String lang_text_field: lang_fields) {
+            // A fresh array per field so the fields do not share one mutable instance.
+            solr_doc_json.put(lang_text_field, new JSONArray(text_al));
+        }
+    }
     protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                     WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
 …
                     JSONObject solr_add_json = new JSONObject();
                     String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
+                    ArrayList<String> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
                     JSONObject solr_doc_json = new JSONObject();
                     solr_doc_json.put("id", page_id);
                     solr_doc_json.put("volumeid_s", volume_id);
+                    if (!text.equals("")) {
+                        solr_doc_json.put("eftext_txt", text);
+                    if (text_al.size()>0) {
+                        addSolrLanguageTextFields(ef_page,text_al, solr_doc_json);
+                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
+                    }
                     else {
 …
+    }
     public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
+    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                               boolean icu_tokenize)
+    {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 31273 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest

Legend:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

Download in other formats: