source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31273

Last change on this file since 31273 was 31273, checked in by davidb, 7 years ago

Code moved over to storing fields for multilingual use, using dynamic Solr fields *_htrctoken. Text is now also put in as separate tokens.

  • Property svn:executable set to *
File size: 15.3 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SolrDocJSON {

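    /**
     * Extracts the word tokens (the keys) from a page's 'tokenPosCount' JSON object.
     * When 'icu_tokenize' is set, each stored token is re-segmented with Lucene's
     * ICUTokenizer (plus a lower-case filter), which helps for scripts such as CJK
     * where one stored token can correspond to several searchable terms.
     */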
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at the keys:
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }

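    /**
     * Like getTokenPosCountWords(), but without the lower-casing filter in the
     * ICU-tokenized branch: tokens are returned case-sensitively, one per entry.
     */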
    protected static ArrayList<String> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                      boolean icu_tokenize)
    {
        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
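
    /**
     * Maps each input word through Lucene's StandardTokenizer and LowerCaseFilter,
     * producing a lower-cased (and possibly re-segmented) list of terms.
     */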
    protected static ArrayList<String> getTokenPosCountWordsMapCaseInsensitive(ArrayList<String> words_in)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        for (String word : words_in) {

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    words_out.add(term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

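    /**
     * Keeps only the words that (probably) appear in the whitelist. Note that a
     * Bloom filter can return false positives, so the occasional non-whitelisted
     * word may slip through; it never produces false negatives.
     */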
    protected static ArrayList<String> getTokenPosCountWordsMapWhitelist(ArrayList<String> words_in,
                                                                         WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        for (String word : words_in) {

            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(word);
            }
        }

        return words_out;
    }

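    /**
     * Collects the part-of-speech labels from a 'tokenPosCount' object, whose shape
     * is { word: { pos_label: count, ... }, ... }. Labels are added once per word
     * they occur under, so duplicates across words are preserved.
     */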
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

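    /**
     * Concatenates a page's tokens into a single space-separated string, suitable
     * for a Solr full-text field. If a whitelist Bloom filter is supplied, only
     * tokens it accepts are included.
     */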
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }

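    /**
     * Produces the token list that is actually indexed: case-sensitive extraction,
     * then lower-casing via StandardTokenizer, then (optionally) whitelist filtering.
     */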
    protected static ArrayList<String> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                            WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<String> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<String> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

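    /**
     * Adds the page's text tokens to the Solr document under one dynamic field per
     * detected language, e.g. "ko_htrctoken" and "ja_htrctoken" for
     * "languages":[{"ko":"0.71"},{"ja":"0.29"}].
     */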
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<String> text_al,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.getJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    String solr_field = lang_label + "_htrctoken";
                    lang_list[i] = solr_field;
                }
            }

            int text_len = text_al.size();
            for (int ti = 0; ti < text_len; ti++) {
                String text_value = text_al.get(ti);
                for (int li = 0; li < lang_len; li++) {
                    String lang_text_field = lang_list[li];

                    // accumulate() keeps every token by building a JSONArray on
                    // repeated keys; put() would overwrite, leaving only the last token
                    solr_doc_json.accumulate(lang_text_field, text_value);
                }
            }
        }
    }
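
    /**
     * Builds a complete Solr JSON update command for one page: an "add" wrapper
     * containing the document (id, volumeid_s, and per-language token fields) with
     * a commitWithin of 5000 ms. Returns null when the page, its body, or its
     * tokenPosCount field is missing.
     */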
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<String> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference, Solr's JSON update syntax (handler endpoint /update/json/docs)
        // is documented at:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {          use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]     can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,              commit this document within 5 seconds
                "overwrite": false,                don't check for existing documents with the same uniqueKey
                "boost": 3.45,                     a document boost
                "doc": {
                    "f1": "v1",                    can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher": false },

            "delete": { "id": "ID" },              delete by ID
            "delete": { "query": "QUERY" }         delete by query
        }'
        */

        return solr_update_json;
    }

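    /**
     * Returns the page's word tokens (see getTokenPosCountWords()), or null if the
     * page or its body is missing.
     */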
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

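    /**
     * Returns the page's part-of-speech labels (see getTokenPosCountPOSLabels()),
     * or null if the page or its body is missing.
     */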
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

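    /**
     * Returns the language labels detected for the page, e.g. ["ko", "ja"] for
     * "languages":[{"ko":"0.71"},{"ja":"0.29"}]. An empty list is returned when
     * nothing can be extracted.
     */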
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.getJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

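    /**
     * Writes the Solr update JSON to a bzip2-compressed file via ClusterFileIO.
     */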
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

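    /**
     * POSTs the update JSON to a Solr update handler and checks the response's
     * responseHeader.status (0 means success). The equivalent curl call, with a
     * hypothetical collection URL, would be:
     *
     *   curl -X POST -H 'Content-Type: application/json' \
     *        'http://localhost:8983/solr/my-collection/update' \
     *        --data-binary '{"add": {"doc": {...}}}'
     */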
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
    {
        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response
            StringBuilder sb = new StringBuilder();
            BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}