source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31245

Last change on this file since 31245 was 31245, checked in by davidb, 7 years ago

Refactored so processing of words from TokenPosCount now done by the same sub-routine

  • Property svn:executable set to *
File size: 8.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import java.util.Set;
15
16import org.apache.commons.compress.compressors.CompressorException;
17import org.json.JSONObject;
18import org.apache.lucene.analysis.TokenStream;
19import org.apache.lucene.analysis.Tokenizer;
20import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
21import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
22import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23
24public class SolrDocJSON {
25
26 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id)
27 {
28 boolean solr_icu_tokenize = true;
29
30 ArrayList<String> words = new ArrayList<String>();
31
32 if (ef_token_pos_count != null) {
33
34 Iterator<String> token_iter = ef_token_pos_count.keys();
35 while (token_iter.hasNext()) {
36 String token = token_iter.next();
37
38 if (solr_icu_tokenize == true) {
39 Reader reader = new StringReader(token);
40
41 Tokenizer tokenizer = new ICUTokenizer();
42 tokenizer.setReader(reader);
43
44 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
45
46 try {
47 tokenizer.reset();
48
49 while (tokenizer.incrementToken()) {
50 String term = charTermAttribute.toString();
51 words.add(term);
52 }
53
54 tokenizer.end();
55 tokenizer.close();
56 }
57 catch (IOException e) {
58 e.printStackTrace();
59 }
60 }
61 else {
62 words.add(token);
63 }
64 }
65 }
66 else {
67 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
68 }
69
70 /* Alternative way to get at keys
71 Set<String> token_keys = ef_token_pos_count.keySet();
72 for (String token : token_keys) {
73 sb.append(token + " ");
74 }
75*/
76 return words;
77 }
78
79
80 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
81 WhitelistBloomFilter whitelist_bloomfilter)
82 {
83 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id);
84
85 StringBuilder sb = new StringBuilder();
86
87 if (whitelist_bloomfilter == null) {
88
89 boolean first_append = true;
90
91 for (int i=0; i<tokens.size(); i++) {
92 String token = tokens.get(i);
93
94 if (!first_append) {
95 sb.append(" ");
96 }
97 else {
98 first_append = false;
99 }
100 sb.append(token);
101 }
102 }
103 else {
104 boolean first_append = true;
105
106 for (int i=0; i<tokens.size(); i++) {
107 String token = tokens.get(i);
108
109 if (whitelist_bloomfilter.contains(token)) {
110 if (!first_append) {
111 sb.append(" ");
112 }
113 else {
114 first_append = false;
115 }
116 sb.append(token);
117 }
118 }
119
120 }
121
122
123 return sb.toString();
124 }
125
126 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
127 WhitelistBloomFilter whitelist_bloomfilter)
128 {
129 JSONObject solr_update_json = null;
130
131 if (ef_page != null) {
132 JSONObject ef_body = ef_page.getJSONObject("body");
133 if (ef_body != null) {
134 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
135 if (ef_token_pos_count != null) {
136
137 JSONObject solr_add_json = new JSONObject();
138
139 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter);
140
141 JSONObject solr_doc_json = new JSONObject();
142 solr_doc_json.put("id", page_id);
143 solr_doc_json.put("volumeid_s", volume_id);
144 if (!text.equals("")) {
145 solr_doc_json.put("eftext_txt", text);
146 }
147 else {
148 solr_doc_json.put("efnotext_b", true);
149 }
150 solr_add_json.put("commitWithin", 5000);
151 solr_add_json.put("doc", solr_doc_json);
152
153 solr_update_json = new JSONObject();
154 solr_update_json.put("add",solr_add_json);
155
156 }
157 else {
158 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
159 }
160 }
161 else {
162 System.err.println("Warning: empty body field for '" + page_id + "'");
163 }
164
165 }
166 else {
167 System.err.println("Warning: null page for '" + page_id + "'");
168 }
169
170
171 /*
172
173 /update/json/docs
174 */
175
176 // For Reference ...
177 // Example documentation on Solr JSON syntax:
178 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
179 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
180
181 /*
182 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
183 {
184 "add": {
185 "doc": {
186 "id": "DOC1",
187 "my_boosted_field": { use a map with boost/value for a boosted field
188 "boost": 2.3,
189 "value": "test"
190 },
191 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
192 }
193 },
194 "add": {
195 "commitWithin": 5000, commit this document within 5 seconds
196 "overwrite": false, don't check for existing documents with the same uniqueKey
197 "boost": 3.45, a document boost
198 "doc": {
199 "f1": "v1", Can use repeated keys for a multi-valued field
200 "f1": "v2"
201 }
202 },
203
204 "commit": {},
205 "optimize": { "waitSearcher":false },
206
207 "delete": { "id":"ID" }, delete by ID
208 "delete": { "query":"QUERY" } delete by query
209 }'
210 */
211
212 return solr_update_json;
213 }
214
215 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page)
216 {
217 ArrayList<String> word_list = null;
218
219 if (ef_page != null) {
220 JSONObject ef_body = ef_page.getJSONObject("body");
221 if (ef_body != null) {
222 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
223 word_list = getTokenPosCountWords(ef_token_pos_count,page_id);
224 }
225 else {
226 System.err.println("Warning: empty body field for '" + page_id + "'");
227 }
228
229 }
230 else {
231 System.err.println("Warning: null page for '" + page_id + "'");
232 }
233
234 return word_list;
235 }
236
237 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
238 {
239 try {
240 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
241 bw.write(solr_add_doc_json.toString());
242 bw.close();
243 } catch (IOException e) {
244 e.printStackTrace();
245 } catch (CompressorException e) {
246 e.printStackTrace();
247 }
248 }
249
250 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
251 {
252
253 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
254 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
255 //curl_popen += " --data-binary '";
256 //curl_popen += "'"
257
258
259 try {
260 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
261 httpcon.setDoOutput(true);
262 httpcon.setRequestProperty("Content-Type", "application/json");
263 httpcon.setRequestProperty("Accept", "application/json");
264 httpcon.setRequestMethod("POST");
265 httpcon.connect();
266
267 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
268 OutputStream os = httpcon.getOutputStream();
269 os.write(outputBytes);
270 os.close();
271
272
273 // Read response
274 StringBuilder sb = new StringBuilder();
275 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
276 String decodedString;
277 while ((decodedString = in.readLine()) != null) {
278 sb.append(decodedString);
279 }
280 in.close();
281
282 JSONObject solr_status_json = new JSONObject(sb.toString());
283 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
284 if (response_header_json != null) {
285 int status = response_header_json.getInt("status");
286 if (status != 0) {
287 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
288 System.err.println("Full response was: " + sb);
289 }
290 }
291 else {
292 System.err.println("Failed response to Solr POST: " + sb);
293 }
294
295
296
297 }
298 catch (Exception e) {
299 e.printStackTrace();
300 }
301
302 }
303}
Note: See TracBrowser for help on using the repository browser.