source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@31597

Last change on this file since 31597 was 31597, checked in by davidb, 7 years ago

Additional _s and _ss fields to help with faceting. Temporarily commented out the full-text page part.

  • Property svn:executable set to *
File size: 20.1 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import scala.Tuple2;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SolrDocJSON {

    protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata)
    {
        JSONObject solr_update_json = null;

        // Single-valued metadata fields, indexed as both _t (tokenized) and _s (string facet)
        String[] metadata_single = new String[] {
            "accessProfile",
            "rightsAttributes",
            "hathitrustRecordNumber",
            "title",
            "imprint",
            "pubDate",
            "pubPlace",
            "language",
            "issuance",
            "typeOfResource"
        };

        // Multi-valued metadata fields, indexed as both _t and _ss (multi-valued string facet)
        String[] metadata_multiple = new String[] {
            "oclc",
            "isbn",
            "issn",
            "lccn",
            "genre",
            "names"
        };

        if (ef_metadata != null) {

            // For the JSON Solr update format see:
            // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

            JSONObject solr_add_json = new JSONObject();

            JSONObject solr_doc_json = new JSONObject();
            solr_doc_json.put("id", volume_id);

            for (String metaname : metadata_single) {
                // optString() returns the default for a missing key, where getString() would throw
                String metavalue = ef_metadata.optString(metaname, null);
                if (metavalue != null) {
                    solr_doc_json.put(metaname + "_t", metavalue);
                    solr_doc_json.put(metaname + "_s", metavalue);
                }
            }

            for (String metaname : metadata_multiple) {
                // optJSONArray() returns null for a missing key, where getJSONArray() would throw
                JSONArray metavalues = ef_metadata.optJSONArray(metaname);
                if (metavalues != null) {
                    solr_doc_json.put(metaname + "_t", metavalues);
                    solr_doc_json.put(metaname + "_ss", metavalues);
                }
            }

            solr_add_json.put("commitWithin", 60000); // used to be 5000
            solr_add_json.put("doc", solr_doc_json);

            solr_update_json = new JSONObject();
            solr_update_json.put("add", solr_add_json);
        }
        else {
            System.err.println("Warning: null metadata for '" + volume_id + "'");
        }

        return solr_update_json;
    }
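
    // Illustrative output of the method above (hypothetical volume id and
    // metadata, not taken from a real record): given the metadata
    // {"title": "Example Title", "oclc": ["12345"]}, it produces:
    //
    //   { "add": { "commitWithin": 60000,
    //              "doc": { "id": "example.vol1",
    //                       "title_t": "Example Title", "title_s": "Example Title",
    //                       "oclc_t": ["12345"], "oclc_ss": ["12345"] } } }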

    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                              boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
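
    // Behaviour sketch for the method above (hypothetical token): with
    // icu_tokenize=true, a key such as "Hello-World" should come out of the
    // ICU + LowerCaseFilter chain as ["hello", "world"]; with
    // icu_tokenize=false it is passed through unchanged as ["Hello-World"].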

    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                          boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
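
    // Illustrative mapping for the method above (hypothetical entry): for
    //   "walks": { "VBZ": 3, "NNS": 1 }
    // it yields POSString("walks", ["VBZ", "NNS"]) when icu_tokenize is false;
    // with icu_tokenize=true, every ICU sub-token inherits that same tag set.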

    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }
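
    // For example (hypothetical input), the method above re-tokenizes
    // POSString("The", ["DT"]) through StandardTokenizer + LowerCaseFilter
    // into POSString("the", ["DT"]); the POS tags are carried over unchanged.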

    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                             WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(pos_word);
            }
        }

        return words_out;
    }
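
    // Sketch (hypothetical filter contents): if the Bloom filter was built from
    // ["the", "cat"], then [("the", ["DT"]), ("dog", ["NN"])] filters to
    // [("the", ["DT"])], subject to the usual Bloom-filter false-positive rate.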

    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }
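
    // e.g. (hypothetical page data): a tokenPosCount of
    //   { "walks": {"VBZ": 3}, "dog": {"NN": 2} }
    // yields the labels VBZ and NN (JSONObject key order is not guaranteed);
    // duplicate labels across tokens are kept.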

    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }
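
    // Sketch: for tokens ["the", "cat", "sat"] and no whitelist, the method
    // above returns "the cat sat"; with a whitelist lacking "cat" it returns
    // "the sat".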

    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        // Note: universal_langmap is not used here; it is applied later, in addSolrLanguageTextFields()
        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        // optJSONArray() returns null for a missing key, where getJSONArray() would throw
        JSONArray ef_languages = ef_page.optJSONArray("languages");
        if ((ef_languages != null) && (ef_languages.length() > 0)) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            /*
            for (int li=0; li<lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
            */
            HashMap<String, JSONArray> pos_lang_text_field_map = new HashMap<String, JSONArray>();

            for (int ti = 0; ti < text_len; ti++) {
                POSString pos_text_value = text_al.get(ti);
                String text_value = pos_text_value.getString();

                String[] pos_tags = pos_text_value.getPOSTags();
                int pos_tags_len = pos_tags.length;

                for (int pti = 0; pti < pos_tags_len; pti++) {
                    String opennlp_pos_key = pos_tags[pti];

                    Tuple2<String, String> lang_pos_pair = universal_langmap.getUniversalLanguagePOSPair(lang_list, opennlp_pos_key);
                    String selected_lang = lang_pos_pair._1;
                    String upos = lang_pos_pair._2;

                    // Field name pattern: <lang>[_<universal-POS>]_htrctoken
                    String pos_lang_text_field = selected_lang;
                    if (upos != null) {
                        pos_lang_text_field += "_" + upos;
                    }
                    pos_lang_text_field += "_htrctoken";

                    if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                        JSONArray empty_json_values = new JSONArray();
                        pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                    }
                    pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                }
            }

            // Now add each of the POS language fields into solr_doc_json
            Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
            for (String plf_key : pos_lang_field_keys) {
                String lang_text_field = plf_key;
                JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                solr_doc_json.put(lang_text_field, json_values);
            }
            /*
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti=0; ti<text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
            */
        }
    }
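
    // Illustrative field naming for the method above (hypothetical page): for
    // languages [{"en":"1.00"}], a token "walks" tagged VBZ that
    // getUniversalLanguagePOSPair() maps to, say, ("en", "VERB") is appended to
    // the multi-valued field "en_VERB_htrctoken"; a token whose POS has no
    // mapping (upos == null) goes into "en_htrctoken".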

    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            // optJSONObject() returns null for a missing key, where getJSONObject() would throw
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, universal_langmap, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference: Solr also provides the /update/json/docs handler.
        // Example documentation on Solr JSON syntax:
        //   https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        //   #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {        use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]    can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,            commit this document within 5 seconds
                "overwrite": false,              don't check for existing documents with the same uniqueKey
                "boost": 3.45,                   a document boost
                "doc": {
                    "f1": "v1",                  can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher": false },

            "delete": { "id": "ID" },            delete by ID
            "delete": { "query": "QUERY" }       delete by query
        }'
        */

        return solr_update_json;
    }
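
    // Usage sketch for the method above (assumed caller, hypothetical ids and URL):
    //   JSONObject update = generateSolrDocJSON("example.vol1", "example.vol1.page-000001",
    //                                           ef_page, null, universal_langmap, false);
    //   if (update != null) {
    //       postSolrDoc("http://localhost:8983/solr/htrc-pd-ef/update", update,
    //                   "example.vol1", "example.vol1.page-000001");
    //   }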

    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.optJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        // try-with-resources ensures the writer is closed even if write() fails
        try (BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2)) {
            bw.write(solr_add_doc_json.toString());
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }
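
    // Usage sketch (output path purely illustrative):
    //   saveSolrDoc(update, "pd-solr-json/example.vol1.json.bz2");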

    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
                                   String volume_id, String page_id)
    {
        // System.out.println("Post URL: " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read response
            StringBuilder sb = new StringBuilder();
            InputStream is = httpcon.getInputStream();
            BufferedReader in = new BufferedReader(new InputStreamReader(is));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (IOException e) {
            System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
            e.printStackTrace();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}