source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@31375

Last change on this file since 31375 was 31375, checked in by davidb, 7 years ago

Initial cut at including POS information to solr index

  • Property svn:executable set to *
File size: 17.8 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.LowerCaseFilter;

public class SolrDocJSON {

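    // Returns the word tokens for a page, i.e. the keys of its "tokenPosCount"
    // JSON object. With icu_tokenize set, each key is additionally run through
    // Lucene's ICUTokenizer (with lower-casing), which re-segments tokens for
    // scripts such as CJK; otherwise the keys are returned verbatim.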
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at keys
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }

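    // As getTokenPosCountWords(), but returns each token as a POSString that
    // carries the POS tags recorded for the originating word in tokenPosCount.
    // Note that pos_tags is null when a word has no POS keys, and that the ICU
    // branch here does not apply a lower-case filter.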
    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);
                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
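
    // Maps each POS-tagged word to its lower-cased form(s) by running it
    // through a StandardTokenizer + LowerCaseFilter chain; the original POS
    // tags are carried over to every resulting term.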
    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

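    // Keeps only the POS-tagged words whose string form occurs in the
    // whitelist Bloom filter.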
    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(pos_word);
            }
        }

        return words_out;
    }

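    // Collects every POS label attached to any word token on the page
    // (duplicates included).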
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

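    // Concatenates the page's word tokens into a single space-separated
    // string, dropping any token not in the whitelist when a Bloom filter
    // is supplied.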
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }

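    // Token pipeline for the Solr text fields: extract POS-tagged tokens,
    // lower-case them, then (if a Bloom filter is supplied) whitelist-filter
    // them. The universal_langmap parameter is currently unused here; the
    // POS mapping happens later, in addSolrLanguageTextFields().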
    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

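    // For each language detected on the page, maps each token's OpenNLP POS
    // tag to its Universal POS equivalent and groups the tokens into Solr
    // fields named "<lang>_<upos>_htrctoken". Languages with no Universal
    // POS mapping get all tokens in a single "<lang>_htrctoken" field.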
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.getJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            for (int li = 0; li < lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
                    HashMap<String, JSONArray> pos_lang_text_field_map = new HashMap<String, JSONArray>();

                    for (int ti = 0; ti < text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();

                        // pos_tags can be null when the originating word had no POS keys
                        String[] pos_tags = pos_text_value.getPOSTags();
                        int pos_tags_len = (pos_tags != null) ? pos_tags.length : 0;

                        for (int pti = 0; pti < pos_tags_len; pti++) {
                            String opennlp_pos_key = pos_tags[pti];

                            String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key);
                            String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken";

                            if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                                JSONArray empty_json_values = new JSONArray();
                                pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                            }
                            pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                        }
                    }

                    // Now add each of the POS language fields into solr_doc_json
                    Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
                    for (String plf_key : pos_lang_field_keys) {
                        String lang_text_field = plf_key;
                        JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                        solr_doc_json.put(lang_text_field, json_values);
                    }
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti = 0; ti < text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
        }
    }

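    // Builds the JSON for a Solr "add" update for one page: the document id,
    // volumeid_s, and the per-language POS token fields (or efnotext_b when
    // the page has no usable text), wrapped with a commitWithin of 5 seconds.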
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, universal_langmap, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference: example documentation on the Solr JSON update syntax
        // (e.g. the /update/json/docs handler):
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
          "add": {
            "doc": {
              "id": "DOC1",
              "my_boosted_field": {          use a map with boost/value for a boosted field
                "boost": 2.3,
                "value": "test"
              },
              "my_multivalued_field": [ "aaa", "bbb" ]    Can use an array for a multi-valued field
            }
          },
          "add": {
            "commitWithin": 5000,            commit this document within 5 seconds
            "overwrite": false,              don't check for existing documents with the same uniqueKey
            "boost": 3.45,                   a document boost
            "doc": {
              "f1": "v1",                    Can use repeated keys for a multi-valued field
              "f1": "v2"
            }
          },

          "commit": {},
          "optimize": { "waitSearcher":false },

          "delete": { "id":"ID" },           delete by ID
          "delete": { "query":"QUERY" }      delete by query
        }'
        */

        return solr_update_json;
    }

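    // Returns the page's word tokens (see getTokenPosCountWords()), for use
    // when building a whitelist.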
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

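    // Returns all POS labels that occur on the page
    // (see getTokenPosCountPOSLabels()).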
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

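    // Returns the language codes listed in the page's "languages" array,
    // e.g. "ko" and "ja" from [{"ko":"0.71"},{"ja":"0.29"}].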
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.getJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

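    // Serializes the Solr add-doc JSON to a bzip2-compressed output file.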
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (CompressorException e) {
            e.printStackTrace();
        }
    }

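    // POSTs the Solr add-doc JSON to the given update URL and reports a
    // warning if the responseHeader status in Solr's reply is non-zero.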
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
    {
        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        System.out.println("**** post_url = " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read response
            StringBuilder sb = new StringBuilder();
            BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}