source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31505

Last change on this file since 31505 was 31505, checked in by davidb, 7 years ago

Added in storing of top-level document metadata as separate solr-doc

  • Property svn:executable set to *
File size: 19.7 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.LowerCaseFilter;

public class SolrDocJSON {

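    /**
     * Builds a Solr JSON "add" command for the top-level (volume) metadata of
     * an Extracted Features file. Sketch of the shape produced (field names
     * follow this class's *_t dynamic-field convention):
     *
     *   { "add": { "commitWithin": 5000,
     *              "doc": { "id": "<volume_id>",
     *                       "title_t": "...",
     *                       "oclc_t": ["...", ...] } } }
     *
     * Returns null (with a warning) when the metadata block is missing.
     */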
    protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata)
    {
        JSONObject solr_update_json = null;

        // Metadata fields that carry a single value
        String[] metadata_single = new String[] {
            "accessProfile",
            "rightsAttributes",
            "hathitrustRecordNumber",
            "title",
            "imprint",
            "pubDate",
            "pubPlace",
            "language",
            "issuance",
            "typeOfResource"
        };

        // Metadata fields that can carry multiple values
        String[] metadata_multiple = new String[] {
            "oclc",
            "isbn",
            "issn",
            "lccn",
            "genre",
            "names"
        };

        if (ef_metadata != null) {

            // For the JSON Solr format see:
            //   https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

            JSONObject solr_add_json = new JSONObject();

            JSONObject solr_doc_json = new JSONObject();
            solr_doc_json.put("id", volume_id);

            for (String metaname: metadata_single) {
                // optString() (rather than getString()) so a missing key yields
                // null instead of throwing a JSONException
                String metavalue = ef_metadata.optString(metaname, null);
                if (metavalue != null) {
                    solr_doc_json.put(metaname+"_t", metavalue);
                }
            }

            for (String metaname: metadata_multiple) {
                // optJSONArray() returns null for a missing key
                JSONArray metavalues = ef_metadata.optJSONArray(metaname);
                if (metavalues != null) {
                    solr_doc_json.put(metaname+"_t", metavalues);
                }
            }

            solr_add_json.put("commitWithin", 5000);
            solr_add_json.put("doc", solr_doc_json);

            solr_update_json = new JSONObject();
            solr_update_json.put("add", solr_add_json);
        }
        else {
            System.err.println("Warning: null metadata for '" + volume_id + "'");
        }

        return solr_update_json;
    }

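    /**
     * Collects the word tokens of a page's tokenPosCount block. With
     * icu_tokenize set, each stored token is re-segmented with Lucene's
     * ICUTokenizer (useful for scripts without word spacing, e.g. CJK text)
     * and lowercased before being added.
     */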
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }

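    /**
     * As getTokenPosCountWords(), but each token is returned as a POSString
     * pairing the token with the array of POS tags recorded for it in the
     * tokenPosCount block (null when no tags are present).
     */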
    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len>0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }

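    /**
     * Lowercases each POSString token by running it through Lucene's
     * StandardTokenizer + LowerCaseFilter, preserving its POS tags. A single
     * input token can yield several output tokens if the tokenizer splits it.
     */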
    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word: words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

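    /**
     * Keeps only the tokens whose string form is in the whitelist Bloom
     * filter. Bloom-filter membership can yield false positives, so the
     * occasional out-of-vocabulary token may slip through, but no genuinely
     * whitelisted token is ever dropped.
     */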
    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word: words_in) {
            String word = pos_word.getString();
            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(pos_word);
            }
        }

        return words_out;
    }

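    /**
     * Flattens a page's tokenPosCount block into the list of its POS labels,
     * one entry per (token, POS) pair.
     */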
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

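    /**
     * Joins a page's tokens into a single space-separated string, optionally
     * filtering them through the whitelist Bloom filter first.
     */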
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        boolean first_append = true;

        for (int i=0; i<tokens.size(); i++) {
            String token = tokens.get(i);

            // With no whitelist every token is kept; otherwise only
            // whitelisted tokens are appended
            if ((whitelist_bloomfilter == null) || whitelist_bloomfilter.contains(token)) {
                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }

        return sb.toString();
    }

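    /**
     * Produces the filtered token list used to populate a page's Solr text
     * fields: case-fold first, then (if a Bloom filter is supplied) apply the
     * whitelist. The universal_langmap parameter is accepted for symmetry
     * with addSolrLanguageTextFields() but is not consulted here.
     */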
    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

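    /**
     * Distributes a page's tokens into per-language, per-POS Solr fields.
     * For each language detected on the page (e.g. "languages":
     * [{"ko":"0.71"},{"ja":"0.29"}]), tokens whose OpenNLP POS tag maps to a
     * Universal POS tag land in a field named <lang>_<UPOS>_htrctoken, e.g.
     * "ko_VERB_htrctoken"; for languages without a mapping, all tokens go
     * into a single <lang>_htrctoken field.
     */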
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        // optJSONArray() returns null for a missing key rather than throwing
        JSONArray ef_languages = ef_page.optJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i=0; i<lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            for (int li=0; li<lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
                    HashMap<String,JSONArray> pos_lang_text_field_map = new HashMap<String,JSONArray>();

                    for (int ti=0; ti<text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();

                        String[] pos_tags = pos_text_value.getPOSTags();
                        int pos_tags_len = pos_tags.length;

                        for (int pti=0; pti<pos_tags_len; pti++) {
                            String opennlp_pos_key = pos_tags[pti];

                            String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key);
                            String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken";

                            if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                                JSONArray empty_json_values = new JSONArray();
                                pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                            }
                            pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                        }
                    }

                    // Now add each of the POS language fields into solr_doc_json
                    Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
                    for (String plf_key : pos_lang_field_keys) {
                        String lang_text_field = plf_key;
                        JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                        solr_doc_json.put(lang_text_field, json_values);
                    }
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti=0; ti<text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
        }
    }

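    /**
     * Builds the Solr JSON "add" command for a single page. Sketch of the
     * document produced:
     *
     *   { "add": { "commitWithin": 5000,
     *              "doc": { "id": "<page_id>",
     *                       "volumeid_s": "<volume_id>",
     *                       "<lang>_<UPOS>_htrctoken": ["...", ...] } } }
     *
     * Pages with no surviving tokens are flagged with "efnotext_b": true.
     * Returns null (with a warning) if the page, its body, or its
     * tokenPosCount block is missing.
     */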
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            // optJSONObject() returns null for a missing key rather than throwing
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,universal_langmap,icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size()>0) {
                        addSolrLanguageTextFields(ef_page,text_al, universal_langmap, solr_doc_json);
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add",solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference, Solr's JSON update syntax (posted to /update or
        // /update/json/docs) is documented at:
        //   https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        //   #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
          "add": {
            "doc": {
              "id": "DOC1",
              "my_boosted_field": {        use a map with boost/value for a boosted field
                "boost": 2.3,
                "value": "test"
              },
              "my_multivalued_field": [ "aaa", "bbb" ]    Can use an array for a multi-valued field
            }
          },
          "add": {
            "commitWithin": 5000,          commit this document within 5 seconds
            "overwrite": false,            don't check for existing documents with the same uniqueKey
            "boost": 3.45,                 a document boost
            "doc": {
              "f1": "v1",                  Can use repeated keys for a multi-valued field
              "f1": "v2"
            }
          },

          "commit": {},
          "optimize": { "waitSearcher":false },

          "delete": { "id":"ID" },         delete by ID
          "delete": { "query":"QUERY" }    delete by query
        }'
        */

        return solr_update_json;
    }

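    /**
     * Word-level extractor used when building a token whitelist: returns the
     * page's tokens (optionally ICU re-tokenized), or null if the page is
     * missing.
     */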
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    /**
     * Returns the POS labels recorded for the page's tokens, or null if the
     * page is missing.
     */
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    /**
     * Returns the language labels detected for the page (e.g. "ko", "ja"),
     * or an empty list if the page or its languages field is missing.
     */
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.optJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i=0; i<lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

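    /**
     * Serializes a Solr "add" command to a bzip2-compressed JSON file via
     * ClusterFileIO.
     */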
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (CompressorException e) {
            e.printStackTrace();
        }
    }

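    /**
     * POSTs a Solr "add" command to the given update handler and checks the
     * responseHeader status of the reply. Roughly equivalent to (sketch;
     * host, port, and core name are deployment specific):
     *
     *   curl -X POST -H 'Content-Type: application/json' \
     *        'http://<solr-host>:8983/solr/<core>/update' \
     *        --data-binary '<solr_add_doc_json>'
     */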
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
                                   String volume_id, String page_id)
    {
        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read response
            StringBuilder sb = new StringBuilder();
            InputStream is = httpcon.getInputStream();
            BufferedReader in = new BufferedReader(new InputStreamReader(is));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            // optJSONObject() returns null for a missing key rather than throwing
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (IOException e) {
            System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
            e.printStackTrace();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}