Changeset 31509
- Timestamp:
- 2017-03-13T20:50:06+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31505 r31509 19 19 import org.json.JSONArray; 20 20 import org.json.JSONObject; 21 22 import scala.Tuple2; 23 21 24 import org.apache.lucene.analysis.TokenStream; 22 25 import org.apache.lucene.analysis.Tokenizer; … … 388 391 int text_len = text_al.size(); 389 392 393 /* 390 394 for (int li=0; li<lang_len; li++) { 391 395 String lang_key = lang_list[li]; … … 393 397 if (universal_langmap.containsLanguage(lang_key)) 394 398 { 399 */ 395 400 HashMap<String,JSONArray> pos_lang_text_field_map = new HashMap<String,JSONArray>(); 396 401 … … 405 410 String opennlp_pos_key = pos_tags[pti]; 406 411 407 String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key); 408 String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken"; 412 Tuple2<String,String> lang_pos_pair = universal_langmap.getUniversalLanguagePOSPair(lang_list, opennlp_pos_key); 413 String selected_lang = lang_pos_pair._1; 414 String upos = lang_pos_pair._2; 415 416 String pos_lang_text_field = selected_lang; 417 if (upos != null) { 418 pos_lang_text_field += "_" + upos; 419 } 420 pos_lang_text_field += "_htrctoken"; 409 421 410 422 if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) { … … 424 436 solr_doc_json.put(lang_text_field, json_values); 425 437 } 438 /* 426 439 } 427 440 else { … … 440 453 441 454 } 442 455 */ 443 456 } 444 457 } -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/UniversalPOSLangMap.java
r31506 r31509 11 11 import java.util.stream.Collectors; 12 12 import java.util.stream.Stream; 13 14 import scala.Tuple2; 13 15 14 16 public class UniversalPOSLangMap … … 83 85 } 84 86 85 public String getUniversalLanguagePOS (String lang_key,String opennlp_pos_key)87 public String getUniversalLanguagePOSUnchecked(String lang_key,String opennlp_pos_key) 86 88 { 87 89 String universal_pos = null; … … 90 92 if (langmap != null) { 91 93 universal_pos = langmap.get(opennlp_pos_key); 92 if (universal_pos == null) {93 String missing_lang_pos = lang_key + ":" + opennlp_pos_key;94 95 Integer mpos_freq = 0;96 if (_missing_pos.containsKey(missing_lang_pos)) {97 mpos_freq = _missing_pos.get(missing_lang_pos);98 }99 else {100 System.err.println("Warning: for language key '"+lang_key101 +"' failed to find POS '" + opennlp_pos_key + "'");102 System.err.println("Defaulting to POS 'X' (i.e., 'other')");103 }104 mpos_freq++;105 _missing_pos.put(lang_key,mpos_freq);106 107 universal_pos = "X";108 }109 94 } 110 95 … … 112 97 } 113 98 99 public String getUniversalLanguagePOSChecked(String lang_key,String opennlp_pos_key) 100 { 101 if (!_all_langmaps.containsKey(lang_key)) { 102 // Not a language with a POS map 103 return ""; 104 } 105 106 String universal_pos = null; 107 108 HashMap<String,String> langmap = _all_langmaps.get(lang_key); 109 universal_pos = langmap.get(opennlp_pos_key); 110 111 if (universal_pos == null) { 112 String missing_lang_pos = lang_key + ":" + opennlp_pos_key; 113 114 Integer mpos_freq = 0; 115 if (_missing_pos.containsKey(missing_lang_pos)) { 116 mpos_freq = _missing_pos.get(missing_lang_pos); 117 } 118 else { 119 System.err.println("Warning: for language key '"+lang_key 120 +"' failed to find POS '" + opennlp_pos_key + "'"); 121 System.err.println("Defaulting to POS 'X' (i.e., 'other')"); 122 } 123 mpos_freq++; 124 _missing_pos.put(missing_lang_pos,mpos_freq); 125 126 universal_pos = "X"; 127 } 128 129 return universal_pos; 130 } 131 132 public Tuple2<String,String> getUniversalLanguagePOSPair(String[] lang_keys,String opennlp_pos_key) 133 { 134 String universal_pos = null; 135 String selected_lang = null; 136 137 for (int li=0; li<lang_keys.length; li++) { 138 String lang_key = lang_keys[li]; 139 140 universal_pos = getUniversalLanguagePOSUnchecked(lang_key,opennlp_pos_key); 141 if (universal_pos != null) { 142 selected_lang = lang_key; 143 break; 144 } 145 } 146 147 if (universal_pos == null) { 148 // Failed to any match in any of the given languages 149 // => Lock onto the first language (highest probability when modeled) 150 selected_lang = lang_keys[0]; 151 152 if (!_all_langmaps.containsKey(selected_lang)) { 153 // Not a language with a POS map 154 return new Tuple2<String,String>(selected_lang,null); 155 } 156 157 // If here, then is a POS language => default to "X" 158 159 String missing_lang_pos = selected_lang + ":" + opennlp_pos_key; 160 161 Integer mpos_freq = 0; 162 if (_missing_pos.containsKey(missing_lang_pos)) { 163 mpos_freq = _missing_pos.get(missing_lang_pos); 164 } 165 else { 166 System.err.println("Warning: for language key '"+selected_lang 167 +"' failed to find POS '" + opennlp_pos_key + "'"); 168 System.err.println("Defaulting to POS 'X' (i.e., 'other')"); 169 } 170 mpos_freq++; 171 _missing_pos.put(missing_lang_pos,mpos_freq); 172 173 universal_pos = "X"; 174 } 175 176 return new Tuple2<String,String>(selected_lang,universal_pos); 177 } 114 178 }
Note:
See TracChangeset
for help on using the changeset viewer.