source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@31779

Last change on this file since 31779 was 31779, checked in by davidb, 7 years ago

Change in how POS words are checked against the whitelist. Previously, words were case-folded before being checked against the whitelist; this could cause words to be missed when they occur only in capitalized form in the text (as in Sherlock) and never in lowercase (sherlock). The change addresses this by mapping to lowercase only after the POS word -- left in its native form -- has been checked against the whitelist, which likewise operates on POS words in their native form.
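For illustration, a minimal before/after sketch of the check order this change describes (names such as whitelist and keep() are illustrative only; the actual implementation is in getTokenPosCountWordsMapWhitelist() and lowerCaseTerms() in the file below):

    // Before: case-fold first, then check -- a text containing only "Sherlock"
    // can never match a whitelist entry stored in its native form
    String folded = word.toLowerCase();
    if (whitelist.contains(folded)) { keep(folded); }

    // After: check the native form first, then map to lowercase for indexing
    if (whitelist.contains(word)) { keep(word.toLowerCase()); }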

  • Property svn:executable set to *
File size: 24.3 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import scala.Tuple2;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.LowerCaseFilter;

public class SolrDocJSON {

    // Builds the top-level (volume) Solr "add" update from the EF metadata record,
    // mapping single-valued fields to *_t/*_s and multi-valued fields to *_t/*_ss.
    protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata)
    {
        JSONObject solr_update_json = null;
        /*
          Example JSON for id: "gri.ark:/13960/t0003qw46"
          metadata: {
            "accessProfile": "open",
            "bibliographicFormat": "BK",
            "classification": {
              "lcc": [
                "ND646 .B8 1900"
              ]
            },
            "dateCreated": "2016-06-19T08:30:16.11199Z",
            "enumerationChronology": " ",
            "genre": [
              "not fiction"
            ],
            "governmentDocument": false,
            "handleUrl": "http://hdl.handle.net/2027/gri.ark:/13960/t0003qw46",
            "hathitrustRecordNumber": "100789562",
            "htBibUrl": "http://catalog.hathitrust.org/api/volumes/full/htid/gri.ark:/13960/t0003qw46.json",
            "imprint": "Burlington Fine Arts Club, 1900.",
            "isbn": [],
            "issn": [],
            "issuance": "monographic",
            "language": "eng",
            "lastUpdateDate": "2015-09-14 13:25:03",
            "lccn": [],
            "names": [
              "Burlington Fine Arts Club "
            ],
            "oclc": [
              "25259734"
            ],
            "pubDate": "1900",
            "pubPlace": "enk",
            "rightsAttributes": "pd",
            "schemaVersion": "1.3",
            "sourceInstitution": "CMALG",
            "sourceInstitutionRecordNumber": "9928077890001551",
            "title": "Exhibition of pictures by Dutch masters of the seventeenth century.",
            "typeOfResource": "text",
            "volumeIdentifier": "gri.ark:/13960/t0003qw46"
          }
        */

        String [] metadata_single = new String[] {
            "accessProfile",
            "bibliographicFormat",
            "dateCreated", // date
            //"enumerationChronology", // What is this?
            //"governmentDocument", // bool: true/false
            "handleUrl",
            "hathitrustRecordNumber", // int?
            "htBibUrl",
            "imprint",
            "issuance",
            "language",
            "lastUpdateDate",
            "pubDate",
            "pubPlace",
            "rightsAttributes",
            "schemaVersion",
            "sourceInstitution",
            "sourceInstitutionRecordNumber",
            "title",
            "typeOfResource",
            "volumeIdentifier"
        };

        String [] metadata_multiple = new String[] {
            "oclc",
            "isbn",
            "issn",
            "lccn",
            "genre",
            "names"
        };

        String [] metadata_hashmap_multiple = new String[] {
            "classification"
        };

        if (ef_metadata != null) {

            // For JSON Solr format see:
            // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers

            //String title= ef_metadata.getString("title");
            JSONObject solr_add_json = new JSONObject();

            JSONObject solr_doc_json = new JSONObject();
            solr_doc_json.put("id", volume_id);

            for (String metaname: metadata_single) {
                // optString() (rather than getString()) so a missing key yields
                // null here instead of throwing a JSONException
                String metavalue = ef_metadata.optString(metaname, null);
                if (metavalue != null) {
                    solr_doc_json.put(metaname+"_t", metavalue);
                    solr_doc_json.put(metaname+"_s", metavalue);
                }
            }

            for (String metaname: metadata_multiple) {
                JSONArray metavalues = ef_metadata.optJSONArray(metaname);
                if (metavalues != null) {
                    solr_doc_json.put(metaname+"_t", metavalues);
                    solr_doc_json.put(metaname+"_ss", metavalues);
                }
            }

            for (String metaname: metadata_hashmap_multiple) {
                JSONObject metakeys = ef_metadata.optJSONObject(metaname);

                if (metakeys != null) {
                    Iterator<String> metakey_iter = metakeys.keys();
                    while (metakey_iter.hasNext()) {
                        String metakey = metakey_iter.next();

                        JSONArray metavalues = metakeys.optJSONArray(metakey);
                        if (metavalues != null) {
                            String combined_metaname = metaname + "_" + metakey;
                            solr_doc_json.put(combined_metaname+"_t", metavalues);
                            solr_doc_json.put(combined_metaname+"_ss", metavalues);
                        }
                    }
                }
            }

            solr_add_json.put("commitWithin", 60000); // used to be 5000
            solr_add_json.put("doc", solr_doc_json);

            solr_update_json = new JSONObject();
            solr_update_json.put("add", solr_add_json);

        }
        else {
            System.err.println("Warning: null metadata for '" + volume_id + "'");
        }

        return solr_update_json;
    }
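    /*
      For illustration, a sketch of the kind of update the method above produces
      (field suffixes follow from the code; the values are drawn from the example
      metadata record shown earlier):

        {
          "add": {
            "commitWithin": 60000,
            "doc": {
              "id": "gri.ark:/13960/t0003qw46",
              "title_t": "Exhibition of pictures by Dutch masters of the seventeenth century.",
              "title_s": "Exhibition of pictures by Dutch masters of the seventeenth century.",
              "oclc_t": ["25259734"],
              "oclc_ss": ["25259734"],
              "classification_lcc_t": ["ND646 .B8 1900"],
              "classification_lcc_ss": ["ND646 .B8 1900"]
            }
          }
        }
    */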


    // Returns the word tokens of a page's tokenPosCount field as a flat list.
    // With icu_tokenize set, each stored token is re-tokenized (and lowercased)
    // using Lucene's ICUTokenizer.
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at keys
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }
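    /*
      For reference, tokenPosCount in an EF page body maps each word token to its
      POS-tag counts, e.g. (hypothetical values):

        "tokenPosCount": { "Sherlock": { "NNP": 3 }, "the": { "DT": 12 } }

      With icu_tokenize false, getTokenPosCountWords() returns the keys as-is,
      ["Sherlock", "the"]; with it true, each key is re-segmented by the
      ICUTokenizer, which helps for scripts (such as CJK) that need word
      segmentation beyond whitespace splitting.
    */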

    // As getTokenPosCountWords(), but pairs each word token with its POS tags,
    // returning POSString objects.
    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }

    // Re-tokenizes each POS word with StandardTokenizer + LowerCaseFilter,
    // carrying the original POS tags over to every resulting term.
    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word: words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }

        }

        return words_out;
    }

    // Lowercases a word with StandardTokenizer + LowerCaseFilter, returning the
    // resulting term(s) -- a single word may split into several terms.
    protected static ArrayList<String> lowerCaseTerms(String word)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        Reader reader = new StringReader(word);

        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(reader);
        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

        TokenStream token_stream = new LowerCaseFilter(tokenizer);

        try {
            token_stream.reset();

            while (token_stream.incrementToken()) {
                String term = charTermAttribute.toString();

                words_out.add(term);
            }

            token_stream.end();
            token_stream.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }

        return words_out;
    }
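    /*
      Example (hypothetical inputs): lowerCaseTerms("Sherlock") yields
      ["sherlock"], while lowerCaseTerms("Baker-Street") would likely yield
      ["baker", "street"], since StandardTokenizer (UAX#29 word breaking)
      splits at the hyphen before the LowerCaseFilter applies.
    */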

    // Keeps only POS words that pass the whitelist, checking each word in its
    // native (case-preserved) form first and lowercasing afterwards; words that
    // fail are broken into lowercased terms which are then checked individually.
    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> pos_words_out = new ArrayList<POSString>();

        for (POSString pos_word: words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            if (whitelist_bloomfilter.contains(word)) {

                ArrayList<String> word_terms = lowerCaseTerms(word);
                for (String term: word_terms) {
                    POSString pos_term = new POSString(term, pos_tags);

                    pos_words_out.add(pos_term);
                }

                // The old, direct way of adding the value in
                //pos_words_out.add(pos_word);
            }
            else {
                // This else clause is rarely taken: the word has to be an
                // 'obscure' one *not* in the whitelist to get here.
                // Break the word down into terms, and see if any of them are
                // in the whitelist instead.

                ArrayList<String> word_terms = lowerCaseTerms(word);
                for (String term: word_terms) {

                    if (whitelist_bloomfilter.contains(term)) {
                        POSString pos_term = new POSString(term, pos_tags);

                        pos_words_out.add(pos_term);
                    }
                }
            }
        }

        return pos_words_out;
    }
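    /*
      Worked example (hypothetical whitelist contents): if the whitelist holds
      the native form "Sherlock" but not "sherlock", a page token "Sherlock"
      passes the contains() test above and is emitted lowercased as "sherlock".
      Under the previous scheme described in the commit message (lowercase
      before the check), the same token would have been dropped.
    */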

    // Collects every POS label that occurs in a page's tokenPosCount field.
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }


    // Joins a page's word tokens into a single space-separated string,
    // optionally filtered through the whitelist.
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i=0; i<tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i=0; i<tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }

        }

        return sb.toString();
    }

    // Produces the filtered POS word list for a page: whitelist-filtered when a
    // whitelist is supplied, otherwise case-folded.
    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(cs_tokens, whitelist_bloomfilter);
            //tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
        }
        else {
            ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
            tokens = lc_tokens;
        }

        return tokens;
    }

    // Buckets each POS word into a per-language, per-universal-POS Solr field
    // (e.g. <lang>_<upos>_htrctoken) and adds those fields to solr_doc_json.
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.optJSONArray("languages");
        if ((ef_languages != null) && (ef_languages.length() > 0)) {

            int lang_len = ef_languages.length();
            String [] lang_list = new String[lang_len];

            for (int i=0; i<lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            /*
            for (int li=0; li<lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
            */
            HashMap<String,JSONArray> pos_lang_text_field_map = new HashMap<String,JSONArray>();

            for (int ti=0; ti<text_len; ti++) {
                POSString pos_text_value = text_al.get(ti);
                String text_value = pos_text_value.getString();

                String[] pos_tags = pos_text_value.getPOSTags();
                int pos_tags_len = pos_tags.length;

                for (int pti=0; pti<pos_tags_len; pti++) {
                    String opennlp_pos_key = pos_tags[pti];

                    Tuple2<String,String> lang_pos_pair = universal_langmap.getUniversalLanguagePOSPair(lang_list, opennlp_pos_key);
                    String selected_lang = lang_pos_pair._1;
                    String upos = lang_pos_pair._2;

                    String pos_lang_text_field = selected_lang;
                    if (upos != null) {
                        pos_lang_text_field += "_" + upos;
                    }
                    pos_lang_text_field += "_htrctoken";

                    if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                        JSONArray empty_json_values = new JSONArray();
                        pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                    }
                    pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                }
            }

            // Now add each of the POS language fields into solr_doc_json
            Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
            for (String plf_key : pos_lang_field_keys) {
                String lang_text_field = plf_key;
                JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                solr_doc_json.put(lang_text_field, json_values);
            }
            /*
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti=0; ti<text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);

                }

            }
            */
        }
    }
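    /*
      Example of the dynamic field names built above: for a page whose language
      list contains "en" and a token whose OpenNLP POS key maps to the universal
      tag "VERB", the token is appended to the multi-valued field
      "en_VERB_htrctoken"; when no universal tag is found (upos == null), the
      field is just "en_htrctoken".
    */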

    // Builds the page-level Solr "add" update: id, volumeid_s, and per-language
    // POS token fields (or efnotext_b when no tokens survive filtering).
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            // optJSONObject() so a missing field gives null (and the warning
            // below) rather than a JSONException
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, universal_langmap, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);

                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }

        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For Reference ...
        // Solr's JSON update handler is also reachable at /update/json/docs
        // Example documentation on Solr JSON syntax:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {          use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]          Can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,          commit this document within 5 seconds
                "overwrite": false,          don't check for existing documents with the same uniqueKey
                "boost": 3.45,          a document boost
                "doc": {
                    "f1": "v1",          Can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher":false },

            "delete": { "id":"ID" },          delete by ID
            "delete": { "query":"QUERY" }          delete by query
        }'
        */

        return solr_update_json;
    }

    // Returns the (optionally ICU-tokenized) word list for a page, for use in
    // whitelist generation.
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }

        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    // Returns all POS labels used on a page.
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.optJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }

        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    // Returns the language labels recorded for a page.
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.optJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i=0; i<lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }

        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

    // Writes the Solr update JSON to a bzip2-compressed file.
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (CompressorException e) {
            e.printStackTrace();
        }
    }

    // POSTs the Solr update JSON to the given update URL and checks the
    // responseHeader status in the reply.
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
                                   String volume_id, String page_id)
    {

        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        // System.out.println("Post URL: " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read response
            StringBuilder sb = new StringBuilder();
            InputStream is = httpcon.getInputStream();
            BufferedReader in = new BufferedReader(new InputStreamReader(is));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            // optJSONObject() so a reply without a responseHeader falls through
            // to the error message below instead of throwing
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }

        }
        catch (IOException e) {
            System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
            e.printStackTrace();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
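    /*
      Typical usage sketch (the URL and variable names here are hypothetical;
      the collection name echoes the commented-out curl_popen string above):

        JSONObject update = generateSolrDocJSON(volume_id, page_id, ef_page,
                                                whitelist, langmap, false);
        if (update != null) {
            postSolrDoc("http://localhost:8983/solr/htrc-pd-ef/update",
                        update, volume_id, page_id);
        }
    */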
}