source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31499

Last change on this file since 31499 was 31499, checked in by davidb, 7 years ago

Better exception handling

  • Property svn:executable set to *
File size: 18.1 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStream;
7import java.io.InputStreamReader;
8import java.io.OutputStream;
9import java.io.Reader;
10import java.io.StringReader;
11import java.net.HttpURLConnection;
12import java.net.URL;
13import java.util.ArrayList;
14import java.util.HashMap;
15import java.util.Iterator;
16import java.util.Set;
17
18import org.apache.commons.compress.compressors.CompressorException;
19import org.json.JSONArray;
20import org.json.JSONObject;
21import org.apache.lucene.analysis.TokenStream;
22import org.apache.lucene.analysis.Tokenizer;
23import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
24import org.apache.lucene.analysis.standard.StandardTokenizer;
25import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26import org.apache.lucene.analysis.core.LowerCaseFilter;
27
28public class SolrDocJSON {
29
30
31 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
32 boolean icu_tokenize)
33 {
34 boolean lowercase_filter = true;
35
36 ArrayList<String> words = new ArrayList<String>();
37
38 if (ef_token_pos_count != null) {
39
40 Iterator<String> word_token_iter = ef_token_pos_count.keys();
41 while (word_token_iter.hasNext()) {
42 String word_token = word_token_iter.next();
43
44 if (icu_tokenize) {
45 Reader reader = new StringReader(word_token);
46
47 ICUTokenizer icu_tokenizer = new ICUTokenizer();
48 icu_tokenizer.setReader(reader);
49
50 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
51
52 TokenStream token_stream = null;
53
54 if (lowercase_filter) {
55 token_stream = new LowerCaseFilter(icu_tokenizer);
56 }
57 else {
58 token_stream = icu_tokenizer;
59 }
60
61 try {
62 token_stream.reset();
63
64 while (token_stream.incrementToken()) {
65 String term = charTermAttribute.toString();
66 words.add(term);
67 }
68
69 token_stream.end();
70 token_stream.close();
71 }
72 catch (IOException e) {
73 e.printStackTrace();
74 }
75 }
76 else {
77 words.add(word_token);
78 }
79 }
80 }
81 else {
82 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
83 }
84
85 /* Alternative way to get at keys
86 Set<String> token_keys = ef_token_pos_count.keySet();
87 for (String token : token_keys) {
88 sb.append(token + " ");
89 }
90*/
91 return words;
92 }
93
94 protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
95 boolean icu_tokenize)
96 {
97 ArrayList<POSString> words = new ArrayList<POSString>();
98
99 if (ef_token_pos_count != null) {
100
101 Iterator<String> word_token_iter = ef_token_pos_count.keys();
102 while (word_token_iter.hasNext()) {
103 String word_token = word_token_iter.next();
104
105 JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);
106
107 Set<String> pos_keys = pos_json_object.keySet();
108 int pos_keys_len = pos_keys.size();
109 String[] pos_tags = (pos_keys_len>0) ? pos_keys.toArray(new String[pos_keys_len]) : null;
110
111 if (icu_tokenize == true) {
112 Reader reader = new StringReader(word_token);
113
114 ICUTokenizer icu_tokenizer = new ICUTokenizer();
115 icu_tokenizer.setReader(reader);
116
117 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
118
119 TokenStream token_stream = icu_tokenizer;
120
121 try {
122 token_stream.reset();
123
124 while (token_stream.incrementToken()) {
125 String term = charTermAttribute.toString();
126
127 POSString pos_string = new POSString(term,pos_tags);
128
129 words.add(pos_string);
130 }
131
132 token_stream.end();
133 token_stream.close();
134 }
135 catch (IOException e) {
136 e.printStackTrace();
137 }
138 }
139 else {
140 POSString pos_word_token = new POSString(word_token,pos_tags);
141
142 words.add(pos_word_token);
143 }
144 }
145 }
146 else {
147 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
148 }
149
150 return words;
151 }
152 protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
153 {
154 ArrayList<POSString> words_out = new ArrayList<POSString>();
155
156 for (POSString pos_word: words_in) {
157 String word = pos_word.getString();
158 String[] pos_tags = pos_word.getPOSTags();
159
160 Reader reader = new StringReader(word);
161
162 Tokenizer tokenizer = new StandardTokenizer();
163 tokenizer.setReader(reader);
164 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
165
166 TokenStream token_stream = new LowerCaseFilter(tokenizer);
167
168 try {
169 token_stream.reset();
170
171 while (token_stream.incrementToken()) {
172 String term = charTermAttribute.toString();
173
174 POSString pos_term = new POSString(term,pos_tags);
175 words_out.add(pos_term);
176 }
177
178 token_stream.end();
179 token_stream.close();
180 }
181 catch (IOException e) {
182 e.printStackTrace();
183 }
184
185 }
186
187 return words_out;
188 }
189
190 protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
191 WhitelistBloomFilter whitelist_bloomfilter)
192 {
193 ArrayList<POSString> words_out = new ArrayList<POSString>();
194
195 for (POSString pos_word: words_in) {
196 String word = pos_word.getString();
197 if (whitelist_bloomfilter.contains(word)) {
198 words_out.add(pos_word);
199 }
200 }
201
202 return words_out;
203 }
204
205 protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
206 {
207 ArrayList<String> pos_labels = new ArrayList<String>();
208
209 if (ef_token_pos_count != null) {
210
211 Iterator<String> word_token_iter = ef_token_pos_count.keys();
212 while (word_token_iter.hasNext()) {
213 String word_token = word_token_iter.next();
214
215 JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);
216
217 Iterator<String> pos_token_iter = word_pos_labels.keys();
218 while (pos_token_iter.hasNext()) {
219 String pos_token = pos_token_iter.next();
220
221 pos_labels.add(pos_token);
222 }
223 }
224 }
225 else {
226 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
227 }
228
229 return pos_labels;
230 }
231
232
233
234 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
235 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
236 {
237 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
238
239 StringBuilder sb = new StringBuilder();
240
241 if (whitelist_bloomfilter == null) {
242
243 boolean first_append = true;
244
245 for (int i=0; i<tokens.size(); i++) {
246 String token = tokens.get(i);
247
248 if (!first_append) {
249 sb.append(" ");
250 }
251 else {
252 first_append = false;
253 }
254 sb.append(token);
255 }
256 }
257 else {
258 boolean first_append = true;
259
260 for (int i=0; i<tokens.size(); i++) {
261 String token = tokens.get(i);
262
263 if (whitelist_bloomfilter.contains(token)) {
264 if (!first_append) {
265 sb.append(" ");
266 }
267 else {
268 first_append = false;
269 }
270 sb.append(token);
271 }
272 }
273
274 }
275
276
277 return sb.toString();
278 }
279
280 protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
281 WhitelistBloomFilter whitelist_bloomfilter,
282 UniversalPOSLangMap universal_langmap,
283 boolean icu_tokenize)
284 {
285 ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize);
286 ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);
287
288 ArrayList<POSString> tokens = null;
289 if (whitelist_bloomfilter != null) {
290 tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter);
291 }
292 else {
293 tokens = lc_tokens;
294 }
295
296 return tokens;
297 }
298
299 protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
300 UniversalPOSLangMap universal_langmap,
301 JSONObject solr_doc_json)
302 {
303 // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
304 JSONArray ef_languages = ef_page.getJSONArray("languages");
305 if (ef_languages != null) {
306
307 int lang_len = ef_languages.length();
308 String [] lang_list = new String[lang_len];
309
310 for (int i=0; i<lang_len; i++) {
311 JSONObject lang_rec = ef_languages.getJSONObject(i);
312
313 Iterator<String> lang_key_iter = lang_rec.keys();
314 while (lang_key_iter.hasNext()) {
315 String lang_label = lang_key_iter.next();
316
317 lang_list[i] = lang_label;
318 }
319 }
320
321 int text_len = text_al.size();
322
323 for (int li=0; li<lang_len; li++) {
324 String lang_key = lang_list[li];
325
326 if (universal_langmap.containsLanguage(lang_key))
327 {
328 HashMap<String,JSONArray> pos_lang_text_field_map = new HashMap<String,JSONArray>();
329
330 for (int ti=0; ti<text_len; ti++) {
331 POSString pos_text_value = text_al.get(ti);
332 String text_value = pos_text_value.getString();
333
334 String[] pos_tags = pos_text_value.getPOSTags();
335 int pos_tags_len = pos_tags.length;
336
337 for (int pti=0; pti<pos_tags_len; pti++) {
338 String opennlp_pos_key = pos_tags[pti];
339
340 String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key);
341 String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken";
342
343 if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
344 JSONArray empty_json_values = new JSONArray();
345 pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
346 }
347 pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
348 }
349 }
350
351 // Now add each of the POS language fields into solr_doc_json
352 Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
353 for (String plf_key : pos_lang_field_keys) {
354 String lang_text_field = plf_key;
355 JSONArray json_values = pos_lang_text_field_map.get(plf_key);
356
357 solr_doc_json.put(lang_text_field, json_values);
358 }
359 }
360 else {
361 String lang_text_field = lang_key + "_htrctoken";
362
363 JSONArray json_values = new JSONArray();
364 for (int ti=0; ti<text_len; ti++) {
365 POSString pos_text_value = text_al.get(ti);
366 String text_value = pos_text_value.getString();
367 json_values.put(text_value);
368 }
369 solr_doc_json.put(lang_text_field, json_values);
370
371 }
372
373
374 }
375
376 }
377 }
378 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
379 WhitelistBloomFilter whitelist_bloomfilter,
380 UniversalPOSLangMap universal_langmap,
381 boolean icu_tokenize)
382 {
383 JSONObject solr_update_json = null;
384
385 if (ef_page != null) {
386 JSONObject ef_body = ef_page.getJSONObject("body");
387 if (ef_body != null) {
388 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
389 if (ef_token_pos_count != null) {
390
391 JSONObject solr_add_json = new JSONObject();
392
393 ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,universal_langmap,icu_tokenize);
394
395 JSONObject solr_doc_json = new JSONObject();
396 solr_doc_json.put("id", page_id);
397 solr_doc_json.put("volumeid_s", volume_id);
398 if (text_al.size()>0) {
399 addSolrLanguageTextFields(ef_page,text_al, universal_langmap, solr_doc_json);
400 //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
401 }
402 else {
403 solr_doc_json.put("efnotext_b", true);
404 }
405 solr_add_json.put("commitWithin", 5000);
406 solr_add_json.put("doc", solr_doc_json);
407
408 solr_update_json = new JSONObject();
409 solr_update_json.put("add",solr_add_json);
410
411 }
412 else {
413 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
414 }
415 }
416 else {
417 System.err.println("Warning: empty body field for '" + page_id + "'");
418 }
419
420 }
421 else {
422 System.err.println("Warning: null page for '" + page_id + "'");
423 }
424
425
426 /*
427
428 /update/json/docs
429 */
430
431 // For Reference ...
432 // Example documentation on Solr JSON syntax:
433 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
434 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
435
436 /*
437 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
438 {
439 "add": {
440 "doc": {
441 "id": "DOC1",
442 "my_boosted_field": { use a map with boost/value for a boosted field
443 "boost": 2.3,
444 "value": "test"
445 },
446 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
447 }
448 },
449 "add": {
450 "commitWithin": 5000, commit this document within 5 seconds
451 "overwrite": false, don't check for existing documents with the same uniqueKey
452 "boost": 3.45, a document boost
453 "doc": {
454 "f1": "v1", Can use repeated keys for a multi-valued field
455 "f1": "v2"
456 }
457 },
458
459 "commit": {},
460 "optimize": { "waitSearcher":false },
461
462 "delete": { "id":"ID" }, delete by ID
463 "delete": { "query":"QUERY" } delete by query
464 }'
465 */
466
467 return solr_update_json;
468 }
469
470 public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
471 boolean icu_tokenize)
472 {
473 ArrayList<String> word_list = null;
474
475 if (ef_page != null) {
476 JSONObject ef_body = ef_page.getJSONObject("body");
477 if (ef_body != null) {
478 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
479 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
480 }
481 else {
482 System.err.println("Warning: empty body field for '" + page_id + "'");
483 }
484
485 }
486 else {
487 System.err.println("Warning: null page for '" + page_id + "'");
488 }
489
490 return word_list;
491 }
492
493 public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
494 {
495 ArrayList<String> word_list = null;
496
497 if (ef_page != null) {
498 JSONObject ef_body = ef_page.getJSONObject("body");
499 if (ef_body != null) {
500 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
501 word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id);
502 }
503 else {
504 System.err.println("Warning: empty body field for '" + page_id + "'");
505 }
506
507 }
508 else {
509 System.err.println("Warning: null page for '" + page_id + "'");
510 }
511
512 return word_list;
513 }
514
515 public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
516 {
517 ArrayList<String> lang_list = new ArrayList<String>();;
518
519 if (ef_page != null) {
520 JSONArray ef_languages = ef_page.getJSONArray("languages");
521 if (ef_languages != null) {
522
523 int lang_len = ef_languages.length();
524 for (int i=0; i<lang_len; i++) {
525 JSONObject lang_rec = ef_languages.getJSONObject(i);
526
527 Iterator<String> lang_key_iter = lang_rec.keys();
528 while (lang_key_iter.hasNext()) {
529 String lang_label = lang_key_iter.next();
530
531 lang_list.add(lang_label);
532 }
533 }
534 }
535 else {
536 System.err.println("Warning: empty languages field for '" + page_id + "'");
537 }
538
539 }
540 else {
541 System.err.println("Warning: null page for '" + page_id + "'");
542 }
543
544 return lang_list;
545 }
546
547 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
548 {
549 try {
550 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
551 bw.write(solr_add_doc_json.toString());
552 bw.close();
553 } catch (IOException e) {
554 e.printStackTrace();
555 } catch (CompressorException e) {
556 e.printStackTrace();
557 }
558 }
559
560 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json,
561 String volume_id, String page_id)
562 {
563
564 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
565 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
566 //curl_popen += " --data-binary '";
567 //curl_popen += "'"
568
569
570 // System.out.println("Post URL: " + post_url);
571
572 try {
573 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
574 httpcon.setDoOutput(true);
575 httpcon.setRequestProperty("Content-Type", "application/json");
576 httpcon.setRequestProperty("Accept", "application/json");
577 httpcon.setRequestMethod("POST");
578 httpcon.connect();
579
580 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
581 OutputStream os = httpcon.getOutputStream();
582 os.write(outputBytes);
583 os.close();
584
585
586 // Read response
587 StringBuilder sb = new StringBuilder();
588 InputStream is = httpcon.getInputStream();
589 BufferedReader in = new BufferedReader(new InputStreamReader(is));
590 String decodedString;
591 while ((decodedString = in.readLine()) != null) {
592 sb.append(decodedString);
593 }
594 in.close();
595
596 JSONObject solr_status_json = new JSONObject(sb.toString());
597 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
598 if (response_header_json != null) {
599 int status = response_header_json.getInt("status");
600 if (status != 0) {
601 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
602 System.err.println("Full response was: " + sb);
603 }
604 }
605 else {
606 System.err.println("Failed response to Solr POST: " + sb);
607 }
608
609
610
611 }
612 catch (IOException e) {
613 System.err.println("Solr core update failed when processing id: " + volume_id + "." + page_id);
614 e.printStackTrace();
615 }
616
617 catch (Exception e) {
618 e.printStackTrace();
619 }
620
621 }
622}
Note: See TracBrowser for help on using the repository browser.