source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31786

Last change on this file since 31786 was 31786, checked in by davidb, 7 years ago

extra param in call; change to case-folding _htrctokentext

  • Property svn:executable set to *
File size: 25.5 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import scala.Tuple2;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.LowerCaseFilter;

public class SolrDocJSON {

    protected static String[] metadata_single = new String[] {
        "accessProfile",
        "bibliographicFormat",
        "dateCreated", // date
        //"enumerationChronology", // What is this?
        //"governmentDocument", // bool: true/false
        "handleUrl",
        "hathitrustRecordNumber", // int?
        "htBibUrl",
        "imprint",
        "issuance",
        "language",
        "lastUpdateDate",
        "pubDate",
        "pubPlace",
        "rightsAttributes",
        "schemaVersion",
        "sourceInstitution",
        "sourceInstitutionRecordNumber",
        "title",
        "typeOfResource",
        "volumeIdentifier"
    };

    protected static String[] metadata_multiple = new String[] {
        "oclc",
        "isbn",
        "issn",
        "lccn",
        "genre",
        "names"
    };

    protected static String[] metadata_hashmap_multiple = new String[] {
        "classification"
    };

    protected static JSONObject generateMetadataSolrDocJSON(String id, JSONObject ef_metadata, boolean is_page_level)
    {
        /*
          Example JSON for id: "gri.ark:/13960/t0003qw46"
          metadata: {

            "accessProfile": "open",
            "bibliographicFormat": "BK",
            "classification": {
                "lcc": [
                    "ND646 .B8 1900"
                ]
            },
            "dateCreated": "2016-06-19T08:30:16.11199Z",
            "enumerationChronology": " ",
            "genre": [
                "not fiction"
            ],
            "governmentDocument": false,
            "handleUrl": "http://hdl.handle.net/2027/gri.ark:/13960/t0003qw46",
            "hathitrustRecordNumber": "100789562",
            "htBibUrl": "http://catalog.hathitrust.org/api/volumes/full/htid/gri.ark:/13960/t0003qw46.json",
            "imprint": "Burlington Fine Arts Club, 1900.",
            "isbn": [],
            "issn": [],
            "issuance": "monographic",
            "language": "eng",
            "lastUpdateDate": "2015-09-14 13:25:03",
            "lccn": [],
            "names": [
                "Burlington Fine Arts Club "
            ],
            "oclc": [
                "25259734"
            ],
            "pubDate": "1900",
            "pubPlace": "enk",
            "rightsAttributes": "pd",
            "schemaVersion": "1.3",
            "sourceInstitution": "CMALG",
            "sourceInstitutionRecordNumber": "9928077890001551",
            "title": "Exhibition of pictures by Dutch masters of the seventeenth century.",
            "typeOfResource": "text",
            "volumeIdentifier": "gri.ark:/13960/t0003qw46"
          }
        */

        // For JSON Solr format see:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

        //String title= ef_metadata.getString("title");

        JSONObject solr_doc_json = new JSONObject();
        solr_doc_json.put("id", id);

        for (String metaname : metadata_single) {
            // optString() returns null rather than throwing when the key is absent
            String metavalue = ef_metadata.optString(metaname, null);

            if (metavalue != null) {
                if (is_page_level) {
                    solr_doc_json.put("volume" + metaname + "_txt", metavalue);
                    solr_doc_json.put("volume" + metaname + "_htrcstring", metavalue);
                }
                else {
                    solr_doc_json.put(metaname + "_t", metavalue);
                    solr_doc_json.put(metaname + "_s", metavalue);
                }
            }
        }

        for (String metaname : metadata_multiple) {
            JSONArray metavalues = ef_metadata.optJSONArray(metaname);
            if (metavalues != null) {
                if (is_page_level) {
                    solr_doc_json.put("volume" + metaname + "_txt", metavalues);
                    solr_doc_json.put("volume" + metaname + "_htrcstrings", metavalues);
                }
                else {
                    solr_doc_json.put(metaname + "_t", metavalues);
                    solr_doc_json.put(metaname + "_ss", metavalues);
                }
            }
        }

        for (String metaname : metadata_hashmap_multiple) {
            JSONObject metakeys = ef_metadata.optJSONObject(metaname);

            if (metakeys != null) {

                Iterator<String> metakey_iter = metakeys.keys();
                while (metakey_iter.hasNext()) {
                    String metakey = metakey_iter.next();

                    JSONArray metavalues = metakeys.optJSONArray(metakey);
                    if (metavalues != null) {
                        String combined_metaname = metaname + "_" + metakey;
                        if (is_page_level) {
                            solr_doc_json.put("volume" + combined_metaname + "_txt", metavalues);
                            solr_doc_json.put("volume" + combined_metaname + "_htrcstrings", metavalues);
                        }
                        else {
                            solr_doc_json.put(combined_metaname + "_t", metavalues);
                            solr_doc_json.put(combined_metaname + "_ss", metavalues);
                        }
                    }
                }
            }
        }

        return solr_doc_json;
    }
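
    /*
      A sketch of how the field-naming scheme above plays out (values are
      hypothetical, drawn from the example metadata in the comment above):

        generateMetadataSolrDocJSON(page_id, ef_metadata, true) yields, e.g.:
          { "id": "<page_id>",
            "volumetitle_txt": "Exhibition of pictures by Dutch masters ...",
            "volumetitle_htrcstring": "Exhibition of pictures by Dutch masters ...",
            "volumeoclc_txt": ["25259734"],
            "volumeoclc_htrcstrings": ["25259734"],
            "volumeclassification_lcc_txt": ["ND646 .B8 1900"], ... }

        generateMetadataSolrDocJSON(volume_id, ef_metadata, false) yields, e.g.:
          { "id": "gri.ark:/13960/t0003qw46",
            "title_t": "Exhibition of pictures by Dutch masters ...",
            "title_s": "Exhibition of pictures by Dutch masters ...",
            "oclc_t": ["25259734"],
            "oclc_ss": ["25259734"],
            "classification_lcc_t": ["ND646 .B8 1900"],
            "classification_lcc_ss": ["ND646 .B8 1900"], ... }
    */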

    protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata)
    {
        JSONObject solr_update_json = null;

        if (ef_metadata != null) {

            // For JSON Solr format see:
            // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

            //String title= ef_metadata.getString("title");
            JSONObject solr_add_json = new JSONObject();

            JSONObject solr_doc_json = generateMetadataSolrDocJSON(volume_id, ef_metadata, false);

            solr_add_json.put("commitWithin", 60000); // used to be 5000
            solr_add_json.put("doc", solr_doc_json);

            solr_update_json = new JSONObject();
            solr_update_json.put("add", solr_add_json);
        }
        else {
            System.err.println("Warning: null metadata for '" + volume_id + "'");
        }

        return solr_update_json;
    }
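
    /*
      A minimal sketch of the update envelope this method builds (assuming the
      example metadata shown earlier); this is the unit that gets POSTed to a
      Solr /update handler by postSolrDoc() below:

        {
          "add": {
            "commitWithin": 60000,
            "doc": {
              "id": "gri.ark:/13960/t0003qw46",
              "title_t": "Exhibition of pictures by Dutch masters ...",
              ...
            }
          }
        }
    */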

    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at the keys:
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }
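
    /*
      Usage sketch (hypothetical input, not from the original source): given a
      tokenPosCount fragment {"Dutch": {"JJ": 2}, "masters": {"NNS": 1}},
      calling getTokenPosCountWords(tpc, page_id, true) pushes each key through
      ICUTokenizer + LowerCaseFilter and would return ["dutch", "masters"]
      (in whatever order the JSONObject iterates its keys), while
      icu_tokenize == false returns the keys verbatim: ["Dutch", "masters"].
    */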

    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }

    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

    protected static ArrayList<String> lowerCaseTerms(String word)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        Reader reader = new StringReader(word);

        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(reader);
        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

        TokenStream token_stream = new LowerCaseFilter(tokenizer);

        try {
            token_stream.reset();

            while (token_stream.incrementToken()) {
                String term = charTermAttribute.toString();

                words_out.add(term);
            }

            token_stream.end();
            token_stream.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }

        return words_out;
    }
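
    /*
      Usage sketch (hypothetical inputs): lowerCaseTerms("New-York") runs the
      string through StandardTokenizer, which breaks at the hyphen, and then
      LowerCaseFilter, so it would return ["new", "york"]; a single token such
      as "Exhibition" comes back as ["exhibition"].
    */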

    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> pos_words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            if (whitelist_bloomfilter.contains(word)) {

                ArrayList<String> word_terms = lowerCaseTerms(word);
                for (String term : word_terms) {
                    POSString pos_term = new POSString(term, pos_tags);

                    pos_words_out.add(pos_term);
                }

                // The old, direct way of adding the value in:
                //pos_words_out.add(pos_word);
            }
            else {
                // This else clause won't be taken very often
                // (a word has to be 'obscure' enough *not* to be in the whitelist to get here);
                // break the word down into terms, and see if any of them are in the whitelist instead
                ArrayList<String> word_terms = lowerCaseTerms(word);
                for (String term : word_terms) {

                    if (whitelist_bloomfilter.contains(term)) {
                        POSString pos_term = new POSString(term, pos_tags);

                        pos_words_out.add(pos_term);
                    }
                }
            }
        }

        return pos_words_out;
    }
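
    /*
      Sketch of the whitelist fallback (hypothetical words and whitelist):
      for ("Dutch-masters", ["NN"]) where the bloom filter holds "dutch" but
      not "Dutch-masters", the whole-word test fails, lowerCaseTerms() splits
      the word into ["dutch", "masters"], and only the terms that pass the
      whitelist ("dutch" here) are kept, each still carrying the original POS
      tags. Note that a bloom filter can return false positives but never
      false negatives, so this test is probabilistic.
    */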

    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }
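
    /*
      Sketch (hypothetical tokens and whitelist): with tokens
      ["the", "Dutch", "xqzzy"] and a whitelist containing "the" and "Dutch"
      but not "xqzzy", generateSolrText() returns "the Dutch"; with a null
      whitelist it returns all tokens joined by single spaces: "the Dutch xqzzy".
    */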

    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(cs_tokens, whitelist_bloomfilter);
            //tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
            tokens = lc_tokens;
        }

        return tokens;
    }

    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages": [{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.optJSONArray("languages");
        if ((ef_languages != null) && (ef_languages.length() > 0)) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            /*
            for (int li = 0; li < lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
            */
            HashMap<String, JSONArray> pos_lang_text_field_map = new HashMap<String, JSONArray>();

            for (int ti = 0; ti < text_len; ti++) {
                POSString pos_text_value = text_al.get(ti);
                String text_value = pos_text_value.getString();

                String[] pos_tags = pos_text_value.getPOSTags();
                int pos_tags_len = pos_tags.length;

                for (int pti = 0; pti < pos_tags_len; pti++) {
                    String opennlp_pos_key = pos_tags[pti];

                    Tuple2<String, String> lang_pos_pair = universal_langmap.getUniversalLanguagePOSPair(lang_list, opennlp_pos_key);
                    String selected_lang = lang_pos_pair._1;
                    String upos = lang_pos_pair._2;

                    String pos_lang_text_field = selected_lang;
                    if (upos != null) {
                        pos_lang_text_field += "_" + upos;
                    }
                    pos_lang_text_field += "_htrctokentext";

                    if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                        JSONArray empty_json_values = new JSONArray();
                        pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                    }
                    pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                }
            }

            // Now add each of the POS language fields into solr_doc_json
            Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
            for (String plf_key : pos_lang_field_keys) {
                String lang_text_field = plf_key;
                JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                solr_doc_json.put(lang_text_field, json_values);
            }
            /*
                }
                else {
                    String lang_text_field = lang_key + "_htrctokentext";

                    JSONArray json_values = new JSONArray();
                    for (int ti = 0; ti < text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
            */
        }
    }
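
    /*
      Field-naming sketch (hypothetical page): for an EF "languages" entry of
      [{"en": "1.00"}] and a token POS-tagged "VB" by OpenNLP, the universal
      POS mapping might select the pair ("en", "VERB"), so the token text is
      collected under the dynamic field "en_VERB_htrctokentext"; a token whose
      POS cannot be mapped (upos == null) falls back to "en_htrctokentext".
      The exact pairs returned depend on UniversalPOSLangMap, defined elsewhere.
    */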

    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id,
                                                    JSONObject ef_metadata, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id,
                                                                        whitelist_bloomfilter, universal_langmap,
                                                                        icu_tokenize);

                    //JSONObject solr_doc_json = new JSONObject();
                    JSONObject solr_doc_json = generateMetadataSolrDocJSON(page_id, ef_metadata, true);

                    //solr_doc_json.put("id", page_id); // now done in generateMetadataSolrDocJSON
                    solr_doc_json.put("volumeid_s", volume_id);

                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 60000); // used to be 5000
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        /*
          /update/json/docs
        */

        // For reference, example documentation on the Solr JSON update syntax:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {        use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]    can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,    commit this document within 5 seconds
                "overwrite": false,      don't check for existing documents with the same uniqueKey
                "boost": 3.45,           a document boost
                "doc": {
                    "f1": "v1",    can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher": false },

            "delete": { "id": "ID" },        delete by ID
            "delete": { "query": "QUERY" }   delete by query
        }'
        */

        return solr_update_json;
    }

    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.optJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
                                   String volume_id, String page_id)
    {
        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        // System.out.println("Post URL: " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response
            StringBuilder sb = new StringBuilder();
            InputStream is = httpcon.getInputStream();
            BufferedReader in = new BufferedReader(new InputStreamReader(is));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (IOException e) {
            System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
            e.printStackTrace();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
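
    /*
      End-to-end usage sketch (hypothetical ids and URL, mirroring the
      commented-out curl invocation above):

        JSONObject solr_update_json =
            generateSolrDocJSON(volume_id, page_id, ef_metadata, ef_page,
                                whitelist_bloomfilter, universal_langmap, false);
        if (solr_update_json != null) {
            postSolrDoc("http://localhost:8983/solr/htrc-pd-ef/update",
                        solr_update_json, volume_id, page_id);
        }
    */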
}