source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31258

Last change on this file since 31258 was 31258, checked in by davidb, 7 years ago

POS Label count, similar to Whitelist word count

  • Property svn:executable set to *
File size: 10.2 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import org.apache.commons.compress.compressors.CompressorException;
15import org.json.JSONObject;
16import org.apache.lucene.analysis.TokenStream;
17import org.apache.lucene.analysis.Tokenizer;
18import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
19import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
20import org.apache.lucene.analysis.core.LowerCaseFilter;
21
22public class SolrDocJSON {
23
24 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
25 boolean icu_tokenize)
26 {
27 boolean lowercase_filter = true;
28
29 ArrayList<String> words = new ArrayList<String>();
30
31 if (ef_token_pos_count != null) {
32
33 Iterator<String> word_token_iter = ef_token_pos_count.keys();
34 while (word_token_iter.hasNext()) {
35 String word_token = word_token_iter.next();
36
37 if (icu_tokenize == true) {
38 Reader reader = new StringReader(word_token);
39
40 ICUTokenizer icu_tokenizer = new ICUTokenizer();
41 icu_tokenizer.setReader(reader);
42
43 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
44
45 TokenStream token_stream = null;
46
47 if (lowercase_filter) {
48 token_stream = new LowerCaseFilter(icu_tokenizer);
49 }
50 else {
51 token_stream = icu_tokenizer;
52 }
53
54 try {
55 token_stream.reset();
56
57 while (token_stream.incrementToken()) {
58 String term = charTermAttribute.toString();
59 words.add(term);
60 }
61
62 token_stream.end();
63 token_stream.close();
64 }
65 catch (IOException e) {
66 e.printStackTrace();
67 }
68 }
69 else {
70 words.add(word_token);
71 }
72 }
73 }
74 else {
75 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
76 }
77
78 /* Alternative way to get at keys
79 Set<String> token_keys = ef_token_pos_count.keySet();
80 for (String token : token_keys) {
81 sb.append(token + " ");
82 }
83*/
84 return words;
85 }
86
87 protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
88 {
89 ArrayList<String> pos_labels = new ArrayList<String>();
90
91 if (ef_token_pos_count != null) {
92
93 Iterator<String> word_token_iter = ef_token_pos_count.keys();
94 while (word_token_iter.hasNext()) {
95 String word_token = word_token_iter.next();
96
97 JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);
98
99 Iterator<String> pos_token_iter = word_pos_labels.keys();
100 while (pos_token_iter.hasNext()) {
101 String pos_token = pos_token_iter.next();
102
103 pos_labels.add(pos_token);
104 }
105 }
106 }
107 else {
108 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
109 }
110
111 return pos_labels;
112 }
113
114
115
116 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
117 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
118 {
119 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
120
121 StringBuilder sb = new StringBuilder();
122
123 if (whitelist_bloomfilter == null) {
124
125 boolean first_append = true;
126
127 for (int i=0; i<tokens.size(); i++) {
128 String token = tokens.get(i);
129
130 if (!first_append) {
131 sb.append(" ");
132 }
133 else {
134 first_append = false;
135 }
136 sb.append(token);
137 }
138 }
139 else {
140 boolean first_append = true;
141
142 for (int i=0; i<tokens.size(); i++) {
143 String token = tokens.get(i);
144
145 if (whitelist_bloomfilter.contains(token)) {
146 if (!first_append) {
147 sb.append(" ");
148 }
149 else {
150 first_append = false;
151 }
152 sb.append(token);
153 }
154 }
155
156 }
157
158
159 return sb.toString();
160 }
161
162 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
163 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
164 {
165 JSONObject solr_update_json = null;
166
167 if (ef_page != null) {
168 JSONObject ef_body = ef_page.getJSONObject("body");
169 if (ef_body != null) {
170 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
171 if (ef_token_pos_count != null) {
172
173 JSONObject solr_add_json = new JSONObject();
174
175 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
176
177 JSONObject solr_doc_json = new JSONObject();
178 solr_doc_json.put("id", page_id);
179 solr_doc_json.put("volumeid_s", volume_id);
180 if (!text.equals("")) {
181 solr_doc_json.put("eftext_txt", text);
182 }
183 else {
184 solr_doc_json.put("efnotext_b", true);
185 }
186 solr_add_json.put("commitWithin", 5000);
187 solr_add_json.put("doc", solr_doc_json);
188
189 solr_update_json = new JSONObject();
190 solr_update_json.put("add",solr_add_json);
191
192 }
193 else {
194 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
195 }
196 }
197 else {
198 System.err.println("Warning: empty body field for '" + page_id + "'");
199 }
200
201 }
202 else {
203 System.err.println("Warning: null page for '" + page_id + "'");
204 }
205
206
207 /*
208
209 /update/json/docs
210 */
211
212 // For Reference ...
213 // Example documentation on Solr JSON syntax:
214 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
215 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
216
217 /*
218 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
219 {
220 "add": {
221 "doc": {
222 "id": "DOC1",
223 "my_boosted_field": { use a map with boost/value for a boosted field
224 "boost": 2.3,
225 "value": "test"
226 },
227 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
228 }
229 },
230 "add": {
231 "commitWithin": 5000, commit this document within 5 seconds
232 "overwrite": false, don't check for existing documents with the same uniqueKey
233 "boost": 3.45, a document boost
234 "doc": {
235 "f1": "v1", Can use repeated keys for a multi-valued field
236 "f1": "v2"
237 }
238 },
239
240 "commit": {},
241 "optimize": { "waitSearcher":false },
242
243 "delete": { "id":"ID" }, delete by ID
244 "delete": { "query":"QUERY" } delete by query
245 }'
246 */
247
248 return solr_update_json;
249 }
250
251 public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
252 boolean icu_tokenize)
253 {
254 ArrayList<String> word_list = null;
255
256 if (ef_page != null) {
257 JSONObject ef_body = ef_page.getJSONObject("body");
258 if (ef_body != null) {
259 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
260 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
261 }
262 else {
263 System.err.println("Warning: empty body field for '" + page_id + "'");
264 }
265
266 }
267 else {
268 System.err.println("Warning: null page for '" + page_id + "'");
269 }
270
271 return word_list;
272 }
273
274 public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
275 {
276 ArrayList<String> word_list = null;
277
278 if (ef_page != null) {
279 JSONObject ef_body = ef_page.getJSONObject("body");
280 if (ef_body != null) {
281 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
282 word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id);
283 }
284 else {
285 System.err.println("Warning: empty body field for '" + page_id + "'");
286 }
287
288 }
289 else {
290 System.err.println("Warning: null page for '" + page_id + "'");
291 }
292
293 return word_list;
294 }
295
296 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
297 {
298 try {
299 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
300 bw.write(solr_add_doc_json.toString());
301 bw.close();
302 } catch (IOException e) {
303 e.printStackTrace();
304 } catch (CompressorException e) {
305 e.printStackTrace();
306 }
307 }
308
309 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
310 {
311
312 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
313 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
314 //curl_popen += " --data-binary '";
315 //curl_popen += "'"
316
317
318 try {
319 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
320 httpcon.setDoOutput(true);
321 httpcon.setRequestProperty("Content-Type", "application/json");
322 httpcon.setRequestProperty("Accept", "application/json");
323 httpcon.setRequestMethod("POST");
324 httpcon.connect();
325
326 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
327 OutputStream os = httpcon.getOutputStream();
328 os.write(outputBytes);
329 os.close();
330
331
332 // Read response
333 StringBuilder sb = new StringBuilder();
334 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
335 String decodedString;
336 while ((decodedString = in.readLine()) != null) {
337 sb.append(decodedString);
338 }
339 in.close();
340
341 JSONObject solr_status_json = new JSONObject(sb.toString());
342 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
343 if (response_header_json != null) {
344 int status = response_header_json.getInt("status");
345 if (status != 0) {
346 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
347 System.err.println("Full response was: " + sb);
348 }
349 }
350 else {
351 System.err.println("Failed response to Solr POST: " + sb);
352 }
353
354
355
356 }
357 catch (Exception e) {
358 e.printStackTrace();
359 }
360
361 }
362}
Note: See TracBrowser for help on using the repository browser.