Changeset 31375 for other-projects/hathitrust/wcsa
- Timestamp: 2017-01-31T21:35:50+13:00
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 2 added, 4 edited
Legend:
- Unmodified lines are shown with a leading space
- Added lines are prefixed with "+"
- Removed lines are prefixed with "-"
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java
--- PerPageJSONFlatmap.java (r31266)
+++ PerPageJSONFlatmap.java (r31375)
@@ -33,5 +33,5 @@
 
     protected WhitelistBloomFilter _whitelist_bloomfilter;
-
+    protected UniversalPOSLangMap _universal_langmap = null;
 
     protected DoubleAccumulator _progress_accum;
@@ -136,5 +136,7 @@
         // Convert to Solr add form
         JSONObject solr_add_doc_json
-            = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter,_icu_tokenize);
+            = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page,
+                                              _whitelist_bloomfilter, _universal_langmap,
+                                              _icu_tokenize);
         solr_add_doc_json.put("filename_json_bz2", output_json_bz2);
 
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
--- PerVolumeJSON.java (r31372)
+++ PerVolumeJSON.java (r31375)
@@ -3,4 +3,5 @@
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Iterator;
 
@@ -29,4 +30,5 @@
     protected String _input_dir;
     protected String _whitelist_filename;
+    protected String _langmap_directory;
 
     protected String _solr_url;
@@ -36,11 +38,10 @@
 
     protected WhitelistBloomFilter _whitelist_bloomfilter;
-
-
-
-    boolean _icu_tokenize;
-    boolean _strict_file_io;
-
-    public PerVolumeJSON(String input_dir, String whitelist_filename,
+    protected UniversalPOSLangMap _universal_langmap;
+
+    boolean _icu_tokenize;
+    boolean _strict_file_io;
+
+    public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directcory,
                          String solr_url, String output_dir, int verbosity,
                          boolean icu_tokenize, boolean strict_file_io)
@@ -48,4 +49,5 @@
         _input_dir = input_dir;
         _whitelist_filename = whitelist_filename;
+        _langmap_directory = langmap_directcory;
 
         _solr_url = solr_url;
@@ -57,4 +59,5 @@
 
         _whitelist_bloomfilter = null;
+        _universal_langmap = null;
     }
 
@@ -66,5 +69,8 @@
             _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
         }
 
+        if ((_langmap_directory != null) && (_universal_langmap == null)) {
+            _universal_langmap = new UniversalPOSLangMap(_langmap_directory);
+        }
         int ef_num_pages = 0;
 
@@ -107,5 +113,5 @@
             // Convert to Solr add form
             JSONObject solr_add_doc_json
-                = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
+                = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _universal_langmap, _icu_tokenize);
 
 
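UniversalPOSLangMap is one of the two files added by this changeset, so its implementation does not appear in the diffs; only its interface is visible here (a constructor taking the langmap directory, plus the containsLanguage() and getUniversalLanguagePOS() lookups used further down in SolrDocJSON.java). The following is a minimal sketch consistent with that usage; the one-mapping-file-per-language layout and tab-separated "OpenNLP tag to Universal POS tag" format are illustrative assumptions, not the project's actual format.

// Illustrative sketch only -- not the class added in this changeset.
// Assumed layout: one file per language code in langmap_directory, each line
// holding "<OpenNLP POS tag> <tab> <Universal POS tag>".
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

public class UniversalPOSLangMap {
    // language code -> (OpenNLP POS tag -> Universal POS tag)
    protected HashMap<String, HashMap<String, String>> _all_langmaps;

    public UniversalPOSLangMap(String langmap_directory) {
        _all_langmaps = new HashMap<String, HashMap<String, String>>();

        File[] langmap_files = new File(langmap_directory).listFiles();
        if (langmap_files == null) {
            return;
        }
        for (File langmap_file : langmap_files) {
            // Treat the filename (minus any extension) as the language code
            String lang_key = langmap_file.getName().replaceFirst("\\.[^.]*$", "");
            HashMap<String, String> langmap = new HashMap<String, String>();

            try (BufferedReader br = new BufferedReader(new FileReader(langmap_file))) {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] fields = line.split("\t");
                    if (fields.length == 2) {
                        langmap.put(fields[0], fields[1]);
                    }
                }
            }
            catch (IOException e) {
                e.printStackTrace();
            }
            _all_langmaps.put(lang_key, langmap);
        }
    }

    public boolean containsLanguage(String lang_key) {
        return _all_langmaps.containsKey(lang_key);
    }

    public String getUniversalLanguagePOS(String lang_key, String opennlp_pos_key) {
        HashMap<String, String> langmap = _all_langmaps.get(lang_key);
        return (langmap != null) ? langmap.get(opennlp_pos_key) : null;
    }
}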
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
--- ProcessForSolrIngest.java (r31374)
+++ ProcessForSolrIngest.java (r31375)
@@ -21,11 +21,4 @@
     private static final long serialVersionUID = 1L;
 
-    // Following details on number of partitions to use given in
-    //  "Parallelized collections" section of:
-    //    https://spark.apache.org/docs/2.0.1/programming-guide.html
-    //
-    // For a more detailed discussion see:
-    //    http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
-
     protected static final int DEFAULT_NUM_CORES = 10;
     protected static final int MINIMUM_NUM_PARTITIONS = 10*DEFAULT_NUM_CORES;
@@ -36,4 +29,6 @@
     //protected String _json_list_filename;
     protected String _whitelist_filename;
+    protected String _langmap_directory;
+
     protected String _solr_url;
     protected String _output_dir;
@@ -49,4 +44,8 @@
         boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
         _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;
+
+        boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
+        _langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;
+
 
         _solr_url = solr_url;
@@ -110,5 +109,5 @@
         boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
 
-        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
+        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename, _langmap_directory,
                                                        _solr_url,_output_dir,_verbosity,
                                                        icu_tokenize,strict_file_io);
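The new language-map handling is switched on the same way as the whitelist, through JVM system properties read in the constructor above. Boolean.getBoolean(name) only returns true when the named system property is set to "true", so leaving wcsa-ef-ingest.use-langmap unset keeps _langmap_directory null and the ingest behaves as it did before this changeset. A minimal standalone illustration follows (the property names come from the diff above; the class name and example path are hypothetical):

// Run with, for example:
//   java -Dwcsa-ef-ingest.use-langmap=true \
//        -Dwcsa-ef-ingest.langmap-directory=/path/to/universal-pos-langmaps \
//        LangmapConfigCheck
public class LangmapConfigCheck {
    public static void main(String[] args) {
        // True only if the property exists and is set to "true"
        boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
        String langmap_directory =
            (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;

        System.out.println("use-langmap       = " + use_langmap);
        System.out.println("langmap-directory = " + langmap_directory);
    }
}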
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
--- SolrDocJSON.java (r31308)
+++ SolrDocJSON.java (r31375)
@@ -11,5 +11,8 @@
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Set;
+
 import org.apache.commons.compress.compressors.CompressorException;
 import org.json.JSONArray;
@@ -88,8 +91,8 @@
     }
 
-    protected static ArrayList<String> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
+    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                        boolean icu_tokenize)
     {
-        ArrayList<String> words = new ArrayList<String>();
+        ArrayList<POSString> words = new ArrayList<POSString>();
 
         if (ef_token_pos_count != null) {
@@ -99,4 +102,9 @@
                 String word_token = word_token_iter.next();
 
+                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);
+                Set<String> pos_keys = pos_json_object.keySet();
+                int pos_keys_len = pos_keys.size();
+                String[] pos_tags = (pos_keys_len>0) ? pos_keys.toArray(new String[pos_keys_len]) : null;
+
                 if (icu_tokenize == true) {
                     Reader reader = new StringReader(word_token);
@@ -114,5 +122,8 @@
                     while (token_stream.incrementToken()) {
                         String term = charTermAttribute.toString();
-                        words.add(term);
+
+                        POSString pos_string = new POSString(term,pos_tags);
+
+                        words.add(pos_string);
 
 
@@ -125,5 +136,7 @@
                 }
                 else {
-                    words.add(word_token);
+                    POSString pos_word_token = new POSString(word_token,pos_tags);
+
+                    words.add(pos_word_token);
 
 
@@ -135,9 +148,11 @@
         return words;
     }
-    protected static ArrayList<String> getTokenPosCountWordsMapCaseInsensitive(ArrayList<String> words_in)
-    {
-        ArrayList<String> words_out = new ArrayList<String>();
-
-        for (String word: words_in) {
+    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
+    {
+        ArrayList<POSString> words_out = new ArrayList<POSString>();
+
+        for (POSString pos_word: words_in) {
+            String word = pos_word.getString();
+            String[] pos_tags = pos_word.getPOSTags();
 
             Reader reader = new StringReader(word);
@@ -154,5 +169,7 @@
             while (token_stream.incrementToken()) {
                 String term = charTermAttribute.toString();
-                words_out.add(term);
+
+                POSString pos_term = new POSString(term,pos_tags);
+                words_out.add(pos_term);
 
 
@@ -169,13 +186,13 @@
     }
 
-    protected static ArrayList<String> getTokenPosCountWordsMapWhitelist(ArrayList<String> words_in,
+    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                          WhitelistBloomFilter whitelist_bloomfilter)
     {
-        ArrayList<String> words_out = new ArrayList<String>();
-
-        for (String word: words_in) {
-
+        ArrayList<POSString> words_out = new ArrayList<POSString>();
+
+        for (POSString pos_word: words_in) {
+            String word = pos_word.getString();
             if (whitelist_bloomfilter.contains(word)) {
-                words_out.add(word);
+                words_out.add(pos_word);
             }
 
@@ -259,11 +276,13 @@
     }
 
-    protected static ArrayList<String> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
-                                                            WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
-    {
-        ArrayList<String> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize);
-        ArrayList<String> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
-
-        ArrayList<String> tokens = null;
+    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
+                                                               WhitelistBloomFilter whitelist_bloomfilter,
+                                                               UniversalPOSLangMap universal_langmap,
+                                                               boolean icu_tokenize)
+    {
+        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize);
+        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
+
+        ArrayList<POSString> tokens = null;
         if (whitelist_bloomfilter != null) {
             tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
@@ -276,5 +295,6 @@
     }
 
-    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<String> text_al,
+    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
+                                                    UniversalPOSLangMap universal_langmap,
                                                     JSONObject solr_doc_json)
     {
@@ -293,6 +313,5 @@
                 String lang_label = lang_key_iter.next();
 
-                String solr_field = lang_label + "_htrctoken";
-                lang_list[i] = solr_field;
+                lang_list[i] = lang_label;
             }
 
@@ -301,12 +320,54 @@
 
         for (int li=0; li<lang_len; li++) {
-            String lang_text_field = lang_list[li];
-
-            JSONArray json_values = new JSONArray();
-            for (int ti=0; ti<text_len; ti++) {
-                String text_value = text_al.get(ti);
-                json_values.put(text_value);
-            }
-            solr_doc_json.put(lang_text_field, json_values);
+            String lang_key = lang_list[li];
+
+            if (universal_langmap.containsLanguage(lang_key))
+            {
+                HashMap<String,JSONArray> pos_lang_text_field_map = new HashMap<String,JSONArray>();
+
+                for (int ti=0; ti<text_len; ti++) {
+                    POSString pos_text_value = text_al.get(ti);
+                    String text_value = pos_text_value.getString();
+
+                    String[] pos_tags = pos_text_value.getPOSTags();
+                    int pos_tags_len = pos_tags.length;
+
+                    for (int pti=0; ti<pos_tags_len; pti++) {
+                        String opennlp_pos_key = pos_tags[pti];
+
+                        String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key);
+                        String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken";
+
+                        if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
+                            JSONArray empty_json_values = new JSONArray();
+                            pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
+                        }
+                        pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
+                    }
+                }
+
+                // Now add each of the POS language fields into solr_doc_json
+                Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
+                for (String plf_key : pos_lang_field_keys) {
+                    String lang_text_field = plf_key;
+                    JSONArray json_values = pos_lang_text_field_map.get(plf_key);
+
+                    solr_doc_json.put(lang_text_field, json_values);
+                }
+            }
+            else {
+                String lang_text_field = lang_key + "_htrctoken";
+
+                JSONArray json_values = new JSONArray();
+                for (int ti=0; ti<text_len; ti++) {
+                    POSString pos_text_value = text_al.get(ti);
+                    String text_value = pos_text_value.getString();
+                    json_values.put(text_value);
+                }
+                solr_doc_json.put(lang_text_field, json_values);
+
+            }
+
+
         }
 
@@ -314,5 +375,7 @@
     }
     protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
-                                                    WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
+                                                    WhitelistBloomFilter whitelist_bloomfilter,
+                                                    UniversalPOSLangMap universal_langmap,
+                                                    boolean icu_tokenize)
     {
         JSONObject solr_update_json = null;
@@ -326,5 +389,5 @@
             JSONObject solr_add_json = new JSONObject();
 
-            ArrayList<String> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
+            ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,universal_langmap,icu_tokenize);
 
             JSONObject solr_doc_json = new JSONObject();
@@ -332,5 +395,5 @@
             solr_doc_json.put("volumeid_s", volume_id);
             if (text_al.size()>0) {
-                addSolrLanguageTextFields(ef_page,text_al, solr_doc_json);
+                addSolrLanguageTextFields(ef_page,text_al, universal_langmap, solr_doc_json);
                 //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
             }
@@ -502,4 +565,6 @@
 
 
+        System.out.println("**** post_url = " + post_url);
+
         try {
             HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
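The net effect of the SolrDocJSON.java changes: when the language map knows a page's detected language, each token is indexed into one field per Universal POS tag it carries, named <lang>_<upos>_htrctoken, and when it does not, the tokens fall back to the previous single <lang>_htrctoken field. The tokens themselves now travel as POSString objects, pairing the token text with its OpenNLP POS tags. POSString is the other file added by this changeset and its source is not shown above; a minimal version consistent with how it is used there might look like:

// Illustrative sketch only -- not the class added in this changeset.
package org.hathitrust.extractedfeatures;

public class POSString {
    protected String _string;
    protected String[] _pos_tags; // OpenNLP POS tags recorded for this token, or null

    public POSString(String string, String[] pos_tags) {
        _string = string;
        _pos_tags = pos_tags;
    }

    public String getString() {
        return _string;
    }

    public String[] getPOSTags() {
        return _pos_tags;
    }
}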