Changeset 31245 for other-projects/hathitrust/wcsa
- Timestamp: 2016-12-18T17:18:13+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31244 r31245 24 24 public class SolrDocJSON { 25 25 26 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter) 27 { 28 boolean solr_icu_tokenize = true; 29 30 ArrayList<String> tokens = new ArrayList<String>(); 31 26 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id) 27 { 28 boolean solr_icu_tokenize = true; 29 30 ArrayList<String> words = new ArrayList<String>(); 31 32 if (ef_token_pos_count != null) { 33 32 34 Iterator<String> token_iter = ef_token_pos_count.keys(); 33 34 35 while (token_iter.hasNext()) { 35 36 String token = token_iter.next(); 36 37 37 38 if (solr_icu_tokenize == true) { 38 39 Reader reader = new StringReader(token); … … 48 49 while (tokenizer.incrementToken()) { 49 50 String term = charTermAttribute.toString(); 50 tokens.add(term);51 words.add(term); 51 52 } 52 53 … … 59 60 } 60 61 else { 61 tokens.add(token); 62 } 63 } 64 65 StringBuilder sb = new StringBuilder(); 66 67 if (whitelist_bloomfilter == null) { 68 69 boolean first_append = true; 70 71 for (int i=0; i<tokens.size(); i++) { 72 String token = tokens.get(i); 73 62 words.add(token); 63 } 64 } 65 } 66 else { 67 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); 68 } 69 70 /* Alternative way to get at keys 71 Set<String> token_keys = ef_token_pos_count.keySet(); 72 for (String token : token_keys) { 73 sb.append(token + " "); 74 } 75 */ 76 return words; 77 } 78 79 80 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id, 81 WhitelistBloomFilter whitelist_bloomfilter) 82 { 83 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id); 84 85 StringBuilder sb = new StringBuilder(); 86 87 if (whitelist_bloomfilter == null) { 88 89 boolean first_append = true; 90 91 for (int i=0; i<tokens.size(); i++) { 92 String token = tokens.get(i); 93 94 if (!first_append) { 95 sb.append(" "); 96 } 97 else { 98 
first_append = false; 99 } 100 sb.append(token); 101 } 102 } 103 else { 104 boolean first_append = true; 105 106 for (int i=0; i<tokens.size(); i++) { 107 String token = tokens.get(i); 108 109 if (whitelist_bloomfilter.contains(token)) { 74 110 if (!first_append) { 75 111 sb.append(" "); … … 79 115 } 80 116 sb.append(token); 81 } 82 } 83 else { 84 boolean first_append = true; 85 86 for (int i=0; i<tokens.size(); i++) { 87 String token = tokens.get(i); 88 89 if (whitelist_bloomfilter.contains(token)) { 90 if (!first_append) { 91 sb.append(" "); 92 } 93 else { 94 first_append = false; 95 } 96 sb.append(token); 97 } 98 } 99 100 } 101 /* 102 Set<String> token_keys = ef_token_pos_count.keySet(); 103 for (String token : token_keys) { 104 sb.append(token + " "); 105 } 106 */ 107 108 return sb.toString(); 109 } 117 } 118 } 119 120 } 121 122 123 return sb.toString(); 124 } 110 125 111 126 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page, … … 122 137 JSONObject solr_add_json = new JSONObject(); 123 138 124 String text = generateSolrText(ef_token_pos_count, whitelist_bloomfilter);139 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter); 125 140 126 141 JSONObject solr_doc_json = new JSONObject(); … … 200 215 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page) 201 216 { 202 ArrayList<String> word_list = n ew ArrayList<String>();217 ArrayList<String> word_list = null; 203 218 204 219 if (ef_page != null) { … … 206 221 if (ef_body != null) { 207 222 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); 208 if (ef_token_pos_count != null) { 209 210 Iterator<String> token_iter = ef_token_pos_count.keys(); 211 while (token_iter.hasNext()) { 212 String token = token_iter.next(); 213 word_list.add(token); 214 } 215 } 216 else { 217 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); 218 } 223 
word_list = getTokenPosCountWords(ef_token_pos_count,page_id); 219 224 } 220 225 else {
Note: See TracChangeset for help on using the changeset viewer.