source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31252

Last change on this file since 31252 was 31252, checked in by davidb, 7 years ago

Support for icu-tokenize property added, plus relevant refactoring.

  • Property svn:executable set to *
File size: 8.5 KB
Line 
1package org.hathitrust.extractedfeatures;
2
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.json.JSONObject;

24public class SolrDocJSON {
25
26 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
27 boolean icu_tokenize)
28 {
29
30 ArrayList<String> words = new ArrayList<String>();
31
32 if (ef_token_pos_count != null) {
33
34 Iterator<String> token_iter = ef_token_pos_count.keys();
35 while (token_iter.hasNext()) {
36 String token = token_iter.next();
37
38 if (icu_tokenize == true) {
39 Reader reader = new StringReader(token);
40
41 Tokenizer tokenizer = new ICUTokenizer();
42 tokenizer.setReader(reader);
43
44 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
45
46 try {
47 tokenizer.reset();
48
49 while (tokenizer.incrementToken()) {
50 String term = charTermAttribute.toString();
51 words.add(term);
52 }
53
54 tokenizer.end();
55 tokenizer.close();
56 }
57 catch (IOException e) {
58 e.printStackTrace();
59 }
60 }
61 else {
62 words.add(token);
63 }
64 }
65 }
66 else {
67 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
68 }
69
70 /* Alternative way to get at keys
71 Set<String> token_keys = ef_token_pos_count.keySet();
72 for (String token : token_keys) {
73 sb.append(token + " ");
74 }
75*/
76 return words;
77 }
78
79
80 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
81 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
82 {
83 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
84
85 StringBuilder sb = new StringBuilder();
86
87 if (whitelist_bloomfilter == null) {
88
89 boolean first_append = true;
90
91 for (int i=0; i<tokens.size(); i++) {
92 String token = tokens.get(i);
93
94 if (!first_append) {
95 sb.append(" ");
96 }
97 else {
98 first_append = false;
99 }
100 sb.append(token);
101 }
102 }
103 else {
104 boolean first_append = true;
105
106 for (int i=0; i<tokens.size(); i++) {
107 String token = tokens.get(i);
108
109 if (whitelist_bloomfilter.contains(token)) {
110 if (!first_append) {
111 sb.append(" ");
112 }
113 else {
114 first_append = false;
115 }
116 sb.append(token);
117 }
118 }
119
120 }
121
122
123 return sb.toString();
124 }
125
126 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
127 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
128 {
129 JSONObject solr_update_json = null;
130
131 if (ef_page != null) {
132 JSONObject ef_body = ef_page.getJSONObject("body");
133 if (ef_body != null) {
134 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
135 if (ef_token_pos_count != null) {
136
137 JSONObject solr_add_json = new JSONObject();
138
139 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
140
141 JSONObject solr_doc_json = new JSONObject();
142 solr_doc_json.put("id", page_id);
143 solr_doc_json.put("volumeid_s", volume_id);
144 if (!text.equals("")) {
145 solr_doc_json.put("eftext_txt", text);
146 }
147 else {
148 solr_doc_json.put("efnotext_b", true);
149 }
150 solr_add_json.put("commitWithin", 5000);
151 solr_add_json.put("doc", solr_doc_json);
152
153 solr_update_json = new JSONObject();
154 solr_update_json.put("add",solr_add_json);
155
156 }
157 else {
158 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
159 }
160 }
161 else {
162 System.err.println("Warning: empty body field for '" + page_id + "'");
163 }
164
165 }
166 else {
167 System.err.println("Warning: null page for '" + page_id + "'");
168 }
169
170
171 /*
172
173 /update/json/docs
174 */
175
176 // For Reference ...
177 // Example documentation on Solr JSON syntax:
178 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
179 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
180
181 /*
182 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
183 {
184 "add": {
185 "doc": {
186 "id": "DOC1",
187 "my_boosted_field": { use a map with boost/value for a boosted field
188 "boost": 2.3,
189 "value": "test"
190 },
191 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
192 }
193 },
194 "add": {
195 "commitWithin": 5000, commit this document within 5 seconds
196 "overwrite": false, don't check for existing documents with the same uniqueKey
197 "boost": 3.45, a document boost
198 "doc": {
199 "f1": "v1", Can use repeated keys for a multi-valued field
200 "f1": "v2"
201 }
202 },
203
204 "commit": {},
205 "optimize": { "waitSearcher":false },
206
207 "delete": { "id":"ID" }, delete by ID
208 "delete": { "query":"QUERY" } delete by query
209 }'
210 */
211
212 return solr_update_json;
213 }
214
215 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
216 boolean icu_tokenize)
217 {
218 ArrayList<String> word_list = null;
219
220 if (ef_page != null) {
221 JSONObject ef_body = ef_page.getJSONObject("body");
222 if (ef_body != null) {
223 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
224 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
225 }
226 else {
227 System.err.println("Warning: empty body field for '" + page_id + "'");
228 }
229
230 }
231 else {
232 System.err.println("Warning: null page for '" + page_id + "'");
233 }
234
235 return word_list;
236 }
237
238 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
239 {
240 try {
241 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
242 bw.write(solr_add_doc_json.toString());
243 bw.close();
244 } catch (IOException e) {
245 e.printStackTrace();
246 } catch (CompressorException e) {
247 e.printStackTrace();
248 }
249 }
250
251 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
252 {
253
254 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
255 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
256 //curl_popen += " --data-binary '";
257 //curl_popen += "'"
258
259
260 try {
261 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
262 httpcon.setDoOutput(true);
263 httpcon.setRequestProperty("Content-Type", "application/json");
264 httpcon.setRequestProperty("Accept", "application/json");
265 httpcon.setRequestMethod("POST");
266 httpcon.connect();
267
268 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
269 OutputStream os = httpcon.getOutputStream();
270 os.write(outputBytes);
271 os.close();
272
273
274 // Read response
275 StringBuilder sb = new StringBuilder();
276 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
277 String decodedString;
278 while ((decodedString = in.readLine()) != null) {
279 sb.append(decodedString);
280 }
281 in.close();
282
283 JSONObject solr_status_json = new JSONObject(sb.toString());
284 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
285 if (response_header_json != null) {
286 int status = response_header_json.getInt("status");
287 if (status != 0) {
288 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
289 System.err.println("Full response was: " + sb);
290 }
291 }
292 else {
293 System.err.println("Failed response to Solr POST: " + sb);
294 }
295
296
297
298 }
299 catch (Exception e) {
300 e.printStackTrace();
301 }
302
303 }
304}
Note: See TracBrowser for help on using the repository browser.