source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java @ 31378

Last change on this file since 31378 was 31378, checked in by davidb, 7 years ago

Fixed loop limit test

  • Property svn:executable set to *
File size: 17.8 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.core.LowerCaseFilter;
public class SolrDocJSON {

    /**
     * Returns the token strings from 'ef_token_pos_count' (the Extracted
     * Features tokenPosCount map for a page).  When 'icu_tokenize' is set,
     * each key is re-tokenized with Lucene's ICUTokenizer (and lowercased);
     * otherwise the keys are returned unchanged.
     */
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at the keys:
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }
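
    // Illustrative sketch (not in the original source): the tokenPosCount input
    // is a map from token to {POS tag: count}, as in the made-up literal below.
    // With icu_tokenize=false the keys come back unchanged; with it set to true
    // each key is re-tokenized and lowercased first.
    protected static void exampleGetTokenPosCountWords()
    {
        JSONObject ef_token_pos_count
            = new JSONObject("{\"The\":{\"DT\":5},\"cat\":{\"NN\":3}}");

        ArrayList<String> words = getTokenPosCountWords(ef_token_pos_count, "example-page", false);
        System.out.println(words); // e.g. [The, cat] (key order is unspecified)
    }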

    /**
     * Like getTokenPosCountWords(), but pairs each token with the POS tags
     * recorded for its originating tokenPosCount key.  When ICU tokenization
     * splits a key into several terms, every term inherits the full POS tag
     * set of the original key; note that, unlike getTokenPosCountWords(),
     * no lowercase filter is applied here.
     */
    protected static ArrayList<POSString> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                         boolean icu_tokenize)
    {
        ArrayList<POSString> words = new ArrayList<POSString>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject pos_json_object = ef_token_pos_count.getJSONObject(word_token);

                Set<String> pos_keys = pos_json_object.keySet();
                int pos_keys_len = pos_keys.size();
                String[] pos_tags = (pos_keys_len > 0) ? pos_keys.toArray(new String[pos_keys_len]) : null;

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();

                            POSString pos_string = new POSString(term, pos_tags);

                            words.add(pos_string);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    POSString pos_word_token = new POSString(word_token, pos_tags);

                    words.add(pos_word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }

    /**
     * Re-tokenizes each word with StandardTokenizer and a LowerCaseFilter,
     * carrying the POS tags through to every resulting term.
     */
    protected static ArrayList<POSString> getTokenPosCountWordsMapCaseInsensitive(ArrayList<POSString> words_in)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            String[] pos_tags = pos_word.getPOSTags();

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();

                    POSString pos_term = new POSString(term, pos_tags);
                    words_out.add(pos_term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }
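
    // Illustrative sketch (not in the original source): the surface form is
    // lowercased while the POS tags travel through unchanged.
    protected static void exampleCaseInsensitiveMapping()
    {
        ArrayList<POSString> words_in = new ArrayList<POSString>();
        words_in.add(new POSString("The", new String[]{"DT"}));

        ArrayList<POSString> words_out = getTokenPosCountWordsMapCaseInsensitive(words_in);
        System.out.println(words_out.get(0).getString()); // prints: the
    }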

    /**
     * Keeps only the tokens whose surface form is present in the whitelist
     * Bloom filter.
     */
    protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in,
                                                                            WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<POSString> words_out = new ArrayList<POSString>();

        for (POSString pos_word : words_in) {
            String word = pos_word.getString();
            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(pos_word);
            }
        }

        return words_out;
    }
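
    // Usage note: construction of WhitelistBloomFilter is project-specific and
    // not shown here.  Conceptually, any token whose surface form is absent
    // from the whitelist is dropped, POS tags and all, so a dictionary-based
    // filter would keep "cat" but discard an OCR artefact like "c4t".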

    /**
     * Flattens the tokenPosCount map into a list of its POS labels.
     */
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }
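
    // Worked example (made-up data): for {"The":{"DT":5},"cat":{"NN":3}} this
    // returns [DT, NN] -- one entry per (token, POS) pair, so the same label
    // can appear multiple times across different tokens.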

    /**
     * Joins the page's tokens into a single space-separated string, optionally
     * restricted to tokens present in the whitelist.
     */
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }
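
    // Worked example (made-up data): with no whitelist,
    // {"cat":{"NN":3},"dog":{"NN":2}} becomes the single string "cat dog"
    // (in the JSONObject's unspecified key order); with a whitelist, only the
    // tokens it contains are joined.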

    /**
     * Runs the token pipeline: extract (token, POS) pairs, lowercase them,
     * then (if a whitelist is supplied) drop non-whitelisted tokens.
     * Note that 'universal_langmap' is accepted but not used by this method.
     */
    protected static ArrayList<POSString> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                               WhitelistBloomFilter whitelist_bloomfilter,
                                                               UniversalPOSLangMap universal_langmap,
                                                               boolean icu_tokenize)
    {
        ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<POSString> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }
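
    // Illustrative sketch (not in the original source): the pipeline with no
    // whitelist.  The tokenPosCount literal is made-up example data, and since
    // this method never touches universal_langmap, passing null for it is safe
    // here as the method is written.
    protected static void exampleFilterSolrTextFields()
    {
        JSONObject ef_token_pos_count
            = new JSONObject("{\"The\":{\"DT\":5},\"cat\":{\"NN\":3}}");

        ArrayList<POSString> tokens
            = filterSolrTextFields(ef_token_pos_count, "example-page", null, null, false);

        for (POSString t : tokens) {
            System.out.println(t.getString()); // prints "the" and "cat", in unspecified order
        }
    }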

    /**
     * For each language detected on the page, buckets the tokens into Solr
     * fields named "<lang>_<universal-POS>_htrctoken" (or a catch-all
     * "<lang>_htrctoken" when the language has no Universal POS mapping) and
     * adds those fields to 'solr_doc_json'.
     */
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<POSString> text_al,
                                                    UniversalPOSLangMap universal_langmap,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.getJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    lang_list[i] = lang_label;
                }
            }

            int text_len = text_al.size();

            for (int li = 0; li < lang_len; li++) {
                String lang_key = lang_list[li];

                if (universal_langmap.containsLanguage(lang_key))
                {
                    HashMap<String, JSONArray> pos_lang_text_field_map = new HashMap<String, JSONArray>();

                    for (int ti = 0; ti < text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();

                        String[] pos_tags = pos_text_value.getPOSTags();
                        int pos_tags_len = pos_tags.length;

                        for (int pti = 0; pti < pos_tags_len; pti++) {
                            String opennlp_pos_key = pos_tags[pti];

                            String upos = universal_langmap.getUniversalLanguagePOS(lang_key, opennlp_pos_key);
                            String pos_lang_text_field = lang_key + "_" + upos + "_htrctoken";

                            if (!pos_lang_text_field_map.containsKey(pos_lang_text_field)) {
                                JSONArray empty_json_values = new JSONArray();
                                pos_lang_text_field_map.put(pos_lang_text_field, empty_json_values);
                            }
                            pos_lang_text_field_map.get(pos_lang_text_field).put(text_value);
                        }
                    }

                    // Now add each of the POS language fields into solr_doc_json
                    Set<String> pos_lang_field_keys = pos_lang_text_field_map.keySet();
                    for (String plf_key : pos_lang_field_keys) {
                        String lang_text_field = plf_key;
                        JSONArray json_values = pos_lang_text_field_map.get(plf_key);

                        solr_doc_json.put(lang_text_field, json_values);
                    }
                }
                else {
                    String lang_text_field = lang_key + "_htrctoken";

                    JSONArray json_values = new JSONArray();
                    for (int ti = 0; ti < text_len; ti++) {
                        POSString pos_text_value = text_al.get(ti);
                        String text_value = pos_text_value.getString();
                        json_values.put(text_value);
                    }
                    solr_doc_json.put(lang_text_field, json_values);
                }
            }
        }
    }
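
    // Worked example (assumed mappings, for illustration only): given an
    // ef_page with "languages":[{"en":"1.00"}] and text_al holding
    // POSString("cat", ["NN"]), and assuming universal_langmap maps the
    // OpenNLP tag "NN" to the Universal POS tag "NOUN" for "en", the call
    // adds the field
    //     "en_NOUN_htrctoken": ["cat"]
    // to solr_doc_json.  For a language the map does not know, every token
    // instead falls into a single catch-all "<lang>_htrctoken" array.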

    /**
     * Builds the Solr "add" update JSON for one page, or returns null when the
     * page (or its body/tokenPosCount) is missing.
     */
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter,
                                                    UniversalPOSLangMap universal_langmap,
                                                    boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, universal_langmap, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, universal_langmap, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        /*
           /update/json/docs
        */

        // For reference, example documentation on Solr JSON syntax:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {         use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]    can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,             commit this document within 5 seconds
                "overwrite": false,               don't check for existing documents with the same uniqueKey
                "boost": 3.45,                    a document boost
                "doc": {
                    "f1": "v1",                   can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher": false },

            "delete": { "id": "ID" },             delete by ID
            "delete": { "query": "QUERY" }        delete by query
        }'
        */

        return solr_update_json;
    }
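
    // Shape of the update JSON produced above (values illustrative):
    // {
    //   "add": {
    //     "commitWithin": 5000,
    //     "doc": {
    //       "id": "<page_id>",
    //       "volumeid_s": "<volume_id>",
    //       "en_NOUN_htrctoken": ["cat", ...],  // when tokens survive filtering
    //       "efnotext_b": true                  // only when none do
    //     }
    //   }
    // }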

    /**
     * Returns the page's token list (see getTokenPosCountWords()), or null
     * when the page is missing.
     */
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    /**
     * Returns the page's POS labels (see getTokenPosCountPOSLabels()), or null
     * when the page is missing.
     */
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

    /**
     * Returns the language labels recorded for the page, e.g. [ko, ja].
     */
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.getJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

    /**
     * Writes the Solr update JSON to a bzip2-compressed file.
     */
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

    /**
     * POSTs the Solr update JSON to 'post_url' and checks the JSON response's
     * responseHeader status, warning on any non-zero value.
     */
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
    {
        // Equivalent to:
        //   curl -X POST -H 'Content-Type: application/json' '<post_url>' --data-binary '...'

        System.out.println("**** post_url = " + post_url);

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read response (decoded as UTF-8, rather than the platform default)
            StringBuilder sb = new StringBuilder();
            BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream(), "UTF-8"));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
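
    // Minimal usage sketch (not in the original source): the URL is
    // hypothetical, and a real call needs a running Solr instance whose schema
    // matches the field names used.
    public static void examplePostSolrDoc()
    {
        JSONObject doc = new JSONObject();
        doc.put("id", "example-page-000001");
        doc.put("volumeid_s", "example-volume");

        JSONObject add = new JSONObject();
        add.put("commitWithin", 5000);
        add.put("doc", doc);

        JSONObject update = new JSONObject();
        update.put("add", add);

        postSolrDoc("http://localhost:8983/solr/my-collection/update", update);
    }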
}