- Timestamp: 2017-01-05T23:09:29+13:00 (7 years ago)
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java
r31252 r31273 90 90 if (ef_page != null) { 91 91 92 ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page, _icu_tokenize); 92 ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountWhitelistText(volume_id, page_id, ef_page, _icu_tokenize); 93 93 all_word_list.addAll(page_word_list); 94 94 } -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31260 r31273 18 18 import org.apache.lucene.analysis.Tokenizer; 19 19 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; 20 import org.apache.lucene.analysis.standard.StandardTokenizer; 20 21 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 21 22 import org.apache.lucene.analysis.core.LowerCaseFilter; … … 23 24 public class SolrDocJSON { 24 25 26 25 27 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id, 26 28 boolean icu_tokenize) … … 86 88 } 87 89 90 protected static ArrayList<String> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id, 91 boolean icu_tokenize) 92 { 93 ArrayList<String> words = new ArrayList<String>(); 94 95 if (ef_token_pos_count != null) { 96 97 Iterator<String> word_token_iter = ef_token_pos_count.keys(); 98 while (word_token_iter.hasNext()) { 99 String word_token = word_token_iter.next(); 100 101 if (icu_tokenize == true) { 102 Reader reader = new StringReader(word_token); 103 104 ICUTokenizer icu_tokenizer = new ICUTokenizer(); 105 icu_tokenizer.setReader(reader); 106 107 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class); 108 109 TokenStream token_stream = icu_tokenizer; 110 111 try { 112 token_stream.reset(); 113 114 while (token_stream.incrementToken()) { 115 String term = charTermAttribute.toString(); 116 words.add(term); 117 } 118 119 token_stream.end(); 120 token_stream.close(); 121 } 122 catch (IOException e) { 123 e.printStackTrace(); 124 } 125 } 126 else { 127 words.add(word_token); 128 } 129 } 130 } 131 else { 132 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); 133 } 134 135 return words; 136 } 137 protected static ArrayList<String> getTokenPosCountWordsMapCaseInsensitive(ArrayList<String> words_in) 138 { 139 ArrayList<String> words_out = new ArrayList<String>(); 140 141 for (String word: words_in) { 142 143 Reader reader = new StringReader(word); 144 
145 Tokenizer tokenizer = new StandardTokenizer(); 146 tokenizer.setReader(reader); 147 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 148 149 TokenStream token_stream = new LowerCaseFilter(tokenizer); 150 151 try { 152 token_stream.reset(); 153 154 while (token_stream.incrementToken()) { 155 String term = charTermAttribute.toString(); 156 words_out.add(term); 157 } 158 159 token_stream.end(); 160 token_stream.close(); 161 } 162 catch (IOException e) { 163 e.printStackTrace(); 164 } 165 166 } 167 168 return words_out; 169 } 170 171 protected static ArrayList<String> getTokenPosCountWordsMapWhitelist(ArrayList<String> words_in, 172 WhitelistBloomFilter whitelist_bloomfilter) 173 { 174 ArrayList<String> words_out = new ArrayList<String>(); 175 176 for (String word: words_in) { 177 178 if (whitelist_bloomfilter.contains(word)) { 179 words_out.add(word); 180 } 181 } 182 183 return words_out; 184 } 185 88 186 protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id) 89 187 { … … 121 219 122 220 StringBuilder sb = new StringBuilder(); 123 221 124 222 if (whitelist_bloomfilter == null) { 125 223 … … 160 258 return sb.toString(); 161 259 } 162 260 261 protected static ArrayList<String> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id, 262 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize) 263 { 264 ArrayList<String> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize); 265 ArrayList<String> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 266 267 ArrayList<String> tokens = null; 268 if (whitelist_bloomfilter != null) { 269 tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter); 270 } 271 else { 272 tokens = lc_tokens; 273 } 274 275 return tokens; 276 } 277 278 protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<String> text_al, 279 JSONObject solr_doc_json) 
280 { 281 // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}] 282 JSONArray ef_languages = ef_page.getJSONArray("languages"); 283 if (ef_languages != null) { 284 285 int lang_len = ef_languages.length(); 286 String [] lang_list = new String[lang_len]; 287 288 for (int i=0; i<lang_len; i++) { 289 JSONObject lang_rec = ef_languages.getJSONObject(i); 290 291 Iterator<String> lang_key_iter = lang_rec.keys(); 292 while (lang_key_iter.hasNext()) { 293 String lang_label = lang_key_iter.next(); 294 295 String solr_field = lang_label + "_htrctoken"; 296 lang_list[i] = solr_field; 297 } 298 } 299 300 int text_len = text_al.size(); 301 for (int ti=0; ti<text_len; ti++) { 302 String text_value = text_al.get(ti); 303 for (int li=0; li<lang_len; li++) { 304 String lang_text_field = lang_list[li]; 305 306 solr_doc_json.put(lang_text_field, text_value); 307 308 } 309 } 310 311 } 312 } 163 313 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page, 164 314 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize) … … 174 324 JSONObject solr_add_json = new JSONObject(); 175 325 176 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize); 326 ArrayList<String> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize); 177 327 178 328 JSONObject solr_doc_json = new JSONObject(); 179 329 solr_doc_json.put("id", page_id); 180 330 solr_doc_json.put("volumeid_s", volume_id); 181 if (!text.equals("")) { 182 solr_doc_json.put("eftext_txt", text); 331 if (text_al.size()>0) { 332 addSolrLanguageTextFields(ef_page,text_al, solr_doc_json); 333 //solr_doc_json.put("eftext_txt", text_al.toString()); // **** 183 334 } 184 335 else { … … 250 401 } 251 402 252 public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page, 403 public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, 
JSONObject ef_page, 253 404 boolean icu_tokenize) 254 405 {
Note: See TracChangeset for help on using the changeset viewer.