Changeset 31258 for other-projects/hathitrust
- Timestamp:
- 2016-12-20T23:39:40+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
- Files:
-
- 3 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31254 r31258 31 31 if (ef_token_pos_count != null) { 32 32 33 Iterator<String> token_iter = ef_token_pos_count.keys(); 34 while (token_iter.hasNext()) { 35 String token = token_iter.next(); 33 Iterator<String> word_token_iter = ef_token_pos_count.keys(); 34 while (word_token_iter.hasNext()) { 35 String word_token = word_token_iter.next(); 36 36 37 37 if (icu_tokenize == true) { 38 Reader reader = new StringReader(token); 38 Reader reader = new StringReader(word_token); 39 39 40 40 ICUTokenizer icu_tokenizer = new ICUTokenizer(); … … 68 68 } 69 69 else { 70 words.add(token); 70 words.add(word_token); 71 71 } 72 72 } … … 84 84 return words; 85 85 } 86 87 protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id) 88 { 89 ArrayList<String> pos_labels = new ArrayList<String>(); 90 91 if (ef_token_pos_count != null) { 92 93 Iterator<String> word_token_iter = ef_token_pos_count.keys(); 94 while (word_token_iter.hasNext()) { 95 String word_token = word_token_iter.next(); 96 97 JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token); 98 99 Iterator<String> pos_token_iter = word_pos_labels.keys(); 100 while (pos_token_iter.hasNext()) { 101 String pos_token = pos_token_iter.next(); 102 103 pos_labels.add(pos_token); 104 } 105 } 106 } 107 else { 108 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); 109 } 110 111 return pos_labels; 112 } 113 86 114 87 115 … … 221 249 } 222 250 223 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page, 224 251 public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page, 252 boolean icu_tokenize) 225 253 { 226 254 ArrayList<String> word_list = null; … … 241 269 } 242 270 271 return word_list; 272 } 273 274 public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page) 275 { 276 
ArrayList<String> word_list = null; 277 278 if (ef_page != null) { 279 JSONObject ef_body = ef_page.getJSONObject("body"); 280 if (ef_body != null) { 281 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); 282 word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id); 283 } 284 else { 285 System.err.println("Warning: empty body field for '" + page_id + "'"); 286 } 287 288 } 289 else { 290 System.err.println("Warning: null page for '" + page_id + "'"); 291 } 292 243 293 return word_list; 244 294 }
Note:
See TracChangeset
for help on using the changeset viewer.