source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31260

Last change on this file since 31260 was 31260, checked in by davidb, 7 years ago

Language counting

  • Property svn:executable set to *
File size: 11.1 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import org.apache.commons.compress.compressors.CompressorException;
15import org.json.JSONArray;
16import org.json.JSONObject;
17import org.apache.lucene.analysis.TokenStream;
18import org.apache.lucene.analysis.Tokenizer;
19import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
20import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21import org.apache.lucene.analysis.core.LowerCaseFilter;
22
23public class SolrDocJSON {
24
25 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
26 boolean icu_tokenize)
27 {
28 boolean lowercase_filter = true;
29
30 ArrayList<String> words = new ArrayList<String>();
31
32 if (ef_token_pos_count != null) {
33
34 Iterator<String> word_token_iter = ef_token_pos_count.keys();
35 while (word_token_iter.hasNext()) {
36 String word_token = word_token_iter.next();
37
38 if (icu_tokenize == true) {
39 Reader reader = new StringReader(word_token);
40
41 ICUTokenizer icu_tokenizer = new ICUTokenizer();
42 icu_tokenizer.setReader(reader);
43
44 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
45
46 TokenStream token_stream = null;
47
48 if (lowercase_filter) {
49 token_stream = new LowerCaseFilter(icu_tokenizer);
50 }
51 else {
52 token_stream = icu_tokenizer;
53 }
54
55 try {
56 token_stream.reset();
57
58 while (token_stream.incrementToken()) {
59 String term = charTermAttribute.toString();
60 words.add(term);
61 }
62
63 token_stream.end();
64 token_stream.close();
65 }
66 catch (IOException e) {
67 e.printStackTrace();
68 }
69 }
70 else {
71 words.add(word_token);
72 }
73 }
74 }
75 else {
76 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
77 }
78
79 /* Alternative way to get at keys
80 Set<String> token_keys = ef_token_pos_count.keySet();
81 for (String token : token_keys) {
82 sb.append(token + " ");
83 }
84*/
85 return words;
86 }
87
88 protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
89 {
90 ArrayList<String> pos_labels = new ArrayList<String>();
91
92 if (ef_token_pos_count != null) {
93
94 Iterator<String> word_token_iter = ef_token_pos_count.keys();
95 while (word_token_iter.hasNext()) {
96 String word_token = word_token_iter.next();
97
98 JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);
99
100 Iterator<String> pos_token_iter = word_pos_labels.keys();
101 while (pos_token_iter.hasNext()) {
102 String pos_token = pos_token_iter.next();
103
104 pos_labels.add(pos_token);
105 }
106 }
107 }
108 else {
109 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
110 }
111
112 return pos_labels;
113 }
114
115
116
117 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
118 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
119 {
120 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
121
122 StringBuilder sb = new StringBuilder();
123
124 if (whitelist_bloomfilter == null) {
125
126 boolean first_append = true;
127
128 for (int i=0; i<tokens.size(); i++) {
129 String token = tokens.get(i);
130
131 if (!first_append) {
132 sb.append(" ");
133 }
134 else {
135 first_append = false;
136 }
137 sb.append(token);
138 }
139 }
140 else {
141 boolean first_append = true;
142
143 for (int i=0; i<tokens.size(); i++) {
144 String token = tokens.get(i);
145
146 if (whitelist_bloomfilter.contains(token)) {
147 if (!first_append) {
148 sb.append(" ");
149 }
150 else {
151 first_append = false;
152 }
153 sb.append(token);
154 }
155 }
156
157 }
158
159
160 return sb.toString();
161 }
162
163 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
164 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
165 {
166 JSONObject solr_update_json = null;
167
168 if (ef_page != null) {
169 JSONObject ef_body = ef_page.getJSONObject("body");
170 if (ef_body != null) {
171 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
172 if (ef_token_pos_count != null) {
173
174 JSONObject solr_add_json = new JSONObject();
175
176 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
177
178 JSONObject solr_doc_json = new JSONObject();
179 solr_doc_json.put("id", page_id);
180 solr_doc_json.put("volumeid_s", volume_id);
181 if (!text.equals("")) {
182 solr_doc_json.put("eftext_txt", text);
183 }
184 else {
185 solr_doc_json.put("efnotext_b", true);
186 }
187 solr_add_json.put("commitWithin", 5000);
188 solr_add_json.put("doc", solr_doc_json);
189
190 solr_update_json = new JSONObject();
191 solr_update_json.put("add",solr_add_json);
192
193 }
194 else {
195 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
196 }
197 }
198 else {
199 System.err.println("Warning: empty body field for '" + page_id + "'");
200 }
201
202 }
203 else {
204 System.err.println("Warning: null page for '" + page_id + "'");
205 }
206
207
208 /*
209
210 /update/json/docs
211 */
212
213 // For Reference ...
214 // Example documentation on Solr JSON syntax:
215 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
216 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
217
218 /*
219 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
220 {
221 "add": {
222 "doc": {
223 "id": "DOC1",
224 "my_boosted_field": { use a map with boost/value for a boosted field
225 "boost": 2.3,
226 "value": "test"
227 },
228 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
229 }
230 },
231 "add": {
232 "commitWithin": 5000, commit this document within 5 seconds
233 "overwrite": false, don't check for existing documents with the same uniqueKey
234 "boost": 3.45, a document boost
235 "doc": {
236 "f1": "v1", Can use repeated keys for a multi-valued field
237 "f1": "v2"
238 }
239 },
240
241 "commit": {},
242 "optimize": { "waitSearcher":false },
243
244 "delete": { "id":"ID" }, delete by ID
245 "delete": { "query":"QUERY" } delete by query
246 }'
247 */
248
249 return solr_update_json;
250 }
251
252 public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
253 boolean icu_tokenize)
254 {
255 ArrayList<String> word_list = null;
256
257 if (ef_page != null) {
258 JSONObject ef_body = ef_page.getJSONObject("body");
259 if (ef_body != null) {
260 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
261 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
262 }
263 else {
264 System.err.println("Warning: empty body field for '" + page_id + "'");
265 }
266
267 }
268 else {
269 System.err.println("Warning: null page for '" + page_id + "'");
270 }
271
272 return word_list;
273 }
274
275 public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
276 {
277 ArrayList<String> word_list = null;
278
279 if (ef_page != null) {
280 JSONObject ef_body = ef_page.getJSONObject("body");
281 if (ef_body != null) {
282 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
283 word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id);
284 }
285 else {
286 System.err.println("Warning: empty body field for '" + page_id + "'");
287 }
288
289 }
290 else {
291 System.err.println("Warning: null page for '" + page_id + "'");
292 }
293
294 return word_list;
295 }
296
297 public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
298 {
299 ArrayList<String> lang_list = new ArrayList<String>();;
300
301 if (ef_page != null) {
302 JSONArray ef_languages = ef_page.getJSONArray("languages");
303 if (ef_languages != null) {
304
305 int lang_len = ef_languages.length();
306 for (int i=0; i<lang_len; i++) {
307 JSONObject lang_rec = ef_languages.getJSONObject(i);
308
309 Iterator<String> lang_key_iter = lang_rec.keys();
310 while (lang_key_iter.hasNext()) {
311 String lang_label = lang_key_iter.next();
312
313 lang_list.add(lang_label);
314 }
315 }
316 }
317 else {
318 System.err.println("Warning: empty languages field for '" + page_id + "'");
319 }
320
321 }
322 else {
323 System.err.println("Warning: null page for '" + page_id + "'");
324 }
325
326 return lang_list;
327 }
328
329 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
330 {
331 try {
332 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
333 bw.write(solr_add_doc_json.toString());
334 bw.close();
335 } catch (IOException e) {
336 e.printStackTrace();
337 } catch (CompressorException e) {
338 e.printStackTrace();
339 }
340 }
341
342 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
343 {
344
345 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
346 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
347 //curl_popen += " --data-binary '";
348 //curl_popen += "'"
349
350
351 try {
352 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
353 httpcon.setDoOutput(true);
354 httpcon.setRequestProperty("Content-Type", "application/json");
355 httpcon.setRequestProperty("Accept", "application/json");
356 httpcon.setRequestMethod("POST");
357 httpcon.connect();
358
359 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
360 OutputStream os = httpcon.getOutputStream();
361 os.write(outputBytes);
362 os.close();
363
364
365 // Read response
366 StringBuilder sb = new StringBuilder();
367 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
368 String decodedString;
369 while ((decodedString = in.readLine()) != null) {
370 sb.append(decodedString);
371 }
372 in.close();
373
374 JSONObject solr_status_json = new JSONObject(sb.toString());
375 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
376 if (response_header_json != null) {
377 int status = response_header_json.getInt("status");
378 if (status != 0) {
379 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
380 System.err.println("Full response was: " + sb);
381 }
382 }
383 else {
384 System.err.println("Failed response to Solr POST: " + sb);
385 }
386
387
388
389 }
390 catch (Exception e) {
391 e.printStackTrace();
392 }
393
394 }
395}
Note: See TracBrowser for help on using the repository browser.