source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@31509

Last change on this file since 31509 was 31509, checked in by davidb, 7 years ago

LangPos determination changed to lock into the first match, rather than trying to populate all model-predicted languages with the determined POS (which gets mixed up when l=en,de and POS=NE, for example -- de goes wrong, as it has no match)

  • Property svn:executable set to *
File size: 20.0 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import scala.Tuple2;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SolrDocJSON {

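    /**
     * Builds the per-volume Solr update document from the Extracted Features
     * volume-level metadata, mapping single-valued fields to *_t text fields
     * and multi-valued fields to *_t arrays.
     */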
    protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata)
    {
        JSONObject solr_update_json = null;

        String[] metadata_single = new String[] {
            "accessProfile",
            "rightsAttributes",
            "hathitrustRecordNumber",
            "title",
            "imprint",
            "pubDate",
            "pubPlace",
            "language",
            "issuance",
            "typeOfResource"
        };

        String[] metadata_multiple = new String[] {
            "oclc",
            "isbn",
            "issn",
            "lccn",
            "genre",
            "names"
        };

        if (ef_metadata != null) {

            // For JSON Solr format see:
            // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

            //String title = ef_metadata.getString("title");
            JSONObject solr_add_json = new JSONObject();

            JSONObject solr_doc_json = new JSONObject();
            solr_doc_json.put("id", volume_id);

            for (String metaname : metadata_single) {
                // optString() rather than getString(), so a missing key yields
                // null instead of throwing a JSONException
                String metavalue = ef_metadata.optString(metaname, null);
                if (metavalue != null) {
                    solr_doc_json.put(metaname + "_t", metavalue);
                }
            }

            for (String metaname : metadata_multiple) {
                // optJSONArray() returns null (rather than throwing) when the
                // field is absent
                JSONArray metavalues = ef_metadata.optJSONArray(metaname);
                if (metavalues != null) {
                    solr_doc_json.put(metaname + "_t", metavalues);
                }
            }

            solr_add_json.put("commitWithin", 5000);
            solr_add_json.put("doc", solr_doc_json);

            solr_update_json = new JSONObject();
            solr_update_json.put("add", solr_add_json);
        }
        else {
            System.err.println("Warning: null metadata for '" + volume_id + "'");
        }

        return solr_update_json;
    }

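    /**
     * Flattens a page's tokenPosCount record into a list of word tokens,
     * optionally re-tokenizing each entry with Lucene's ICUTokenizer (plus
     * lowercasing) for text where whitespace tokenization is unreliable.
     */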
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at the keys:
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }

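    /**
     * As getTokenPosCountWords(), but pairs each token with the array of POS
     * tags recorded for it in the tokenPosCount record, returned as POSString
     * values.
     */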
    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
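
    /**
     * Lowercases each token with a StandardTokenizer + LowerCaseFilter pass,
     * carrying the original POS tags across to the lowercased forms.
     */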
    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

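    /**
     * Keeps only the tokens whose strings occur in the supplied whitelist
     * Bloom filter.
     */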
    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(pos_word);
            }
        }

        return words_out;
    }

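    /**
     * Collects every POS label that appears in a page's tokenPosCount record
     * (one entry per token/POS pairing, so labels can repeat across tokens).
     */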
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

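    /**
     * Joins a page's tokens into a single space-separated string, optionally
     * dropping tokens that are not in the whitelist Bloom filter.
     */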
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }

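    /**
     * Runs the full token pipeline for a page: extract token/POS pairs,
     * lowercase them, then (when a Bloom filter is supplied) restrict them to
     * whitelisted words.
     */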
    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

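    /**
     * Adds language-specific token fields (e.g. en_VERB_htrctoken) to the Solr
     * document. For each token's POS tag, the universal language/POS map locks
     * in the first matching language from the page's model-predicted language
     * list, rather than populating every predicted language (which mixed
     * languages up, e.g. l=en,de with POS=NE).
     */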
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.optJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                // Each language record holds a single key: its language label
                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            /*
            for (int li=0; li<lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
            */
            HashMap<String, JSONArray> pos_lang_text_field_map = new HashMap<String, JSONArray>();

            for (int ti = 0; ti < text_len; ti++) {
                POSString pos_text_value = text_al.get(ti);
                String text_value = pos_text_value.getString();

                String[] pos_tags = pos_text_value.getPOSTags();
                int pos_tags_len = pos_tags.length;

                for (int pti = 0; pti < pos_tags_len; pti++) {
                    String opennlp_pos_key = pos_tags[pti];

                    // Lock in the first language in lang_list that the map can
                    // pair with this OpenNLP POS tag
                    Tuple2<String, String> lang_pos_pair = universal_langmap.getUniversalLanguagePOSPair(lang_list, opennlp_pos_key);
                    String selected_lang = lang_pos_pair._1;
                    String upos = lang_pos_pair._2;

                    String pos_lang_text_field = selected_lang;
                    if (upos != null) {
                        pos_lang_text_field += "_" + upos;
                    }
                    pos_lang_text_field += "_htrctoken";

                    if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                        JSONArray empty_json_values = new JSONArray();
                        pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                    }
                    pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                }
            }

            // Now add each of the POS language fields into solr_doc_json
            Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
            for (String plf_key : pos_lang_field_keys) {
                String lang_text_field = plf_key;
                JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                solr_doc_json.put(lang_text_field, json_values);
            }
            /*
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti=0; ti<text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
            */
        }
    }

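    /**
     * Builds the per-page Solr update document: filters the page's tokens and
     * adds them as language/POS-specific fields, or sets efnotext_b when the
     * page yields no tokens.
     */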
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            // optJSONObject() returns null (rather than throwing) when the
            // field is absent, so the null checks below can fire
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, universal_langmap, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference, example documentation on the Solr JSON update syntax
        // (/update/json/docs):
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
          "add": {
            "doc": {
              "id": "DOC1",
              "my_boosted_field": {         use a map with boost/value for a boosted field
                "boost": 2.3,
                "value": "test"
              },
              "my_multivalued_field": [ "aaa", "bbb" ]    Can use an array for a multi-valued field
            }
          },
          "add": {
            "commitWithin": 5000,           commit this document within 5 seconds
            "overwrite": false,             don't check for existing documents with the same uniqueKey
            "boost": 3.45,                  a document boost
            "doc": {
              "f1": "v1",                   Can use repeated keys for a multi-valued field
              "f1": "v2"
            }
          },

          "commit": {},
          "optimize": { "waitSearcher":false },

          "delete": { "id":"ID" },          delete by ID
          "delete": { "query":"QUERY" }     delete by query
        }'
        */

        return solr_update_json;
    }

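    /**
     * Returns a page's (optionally ICU-tokenized) word list, e.g. for building
     * word whitelists.
     */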
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

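    /**
     * Returns the POS labels occurring on a page, or null when the page has no
     * body.
     */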
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

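    /**
     * Returns the model-predicted language labels recorded for a page.
     */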
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.optJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

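    /**
     * Writes a Solr update document to a compressed (bz2) JSON file.
     */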
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

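    /**
     * POSTs a Solr update document to the given update URL, then checks the
     * responseHeader status in the reply, warning on any non-zero status.
     */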
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
                                   String volume_id, String page_id)
    {
        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        //System.out.println("Post URL: " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response, decoding it as UTF-8 rather than relying on
            // the platform default charset
            StringBuilder sb = new StringBuilder();
            InputStream is = httpcon.getInputStream();
            BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            // optJSONObject() so a missing responseHeader reaches the else
            // branch instead of throwing
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (IOException e) {
            System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
            e.printStackTrace();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}