source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31244

Last change on this file since 31244 was 31244, checked in by davidb, 7 years ago

Tidy up

  • Property svn:executable set to *
File size: 8.3 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import java.util.Set;
15
16import org.apache.commons.compress.compressors.CompressorException;
17import org.json.JSONObject;
18import org.apache.lucene.analysis.TokenStream;
19import org.apache.lucene.analysis.Tokenizer;
20import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
21import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
22import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
23
24public class SolrDocJSON {
25
26 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter)
27 {
28 boolean solr_icu_tokenize = true;
29
30 ArrayList<String> tokens = new ArrayList<String>();
31
32 Iterator<String> token_iter = ef_token_pos_count.keys();
33
34 while (token_iter.hasNext()) {
35 String token = token_iter.next();
36
37 if (solr_icu_tokenize == true) {
38 Reader reader = new StringReader(token);
39
40 Tokenizer tokenizer = new ICUTokenizer();
41 tokenizer.setReader(reader);
42
43 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
44
45 try {
46 tokenizer.reset();
47
48 while (tokenizer.incrementToken()) {
49 String term = charTermAttribute.toString();
50 tokens.add(term);
51 }
52
53 tokenizer.end();
54 tokenizer.close();
55 }
56 catch (IOException e) {
57 e.printStackTrace();
58 }
59 }
60 else {
61 tokens.add(token);
62 }
63 }
64
65 StringBuilder sb = new StringBuilder();
66
67 if (whitelist_bloomfilter == null) {
68
69 boolean first_append = true;
70
71 for (int i=0; i<tokens.size(); i++) {
72 String token = tokens.get(i);
73
74 if (!first_append) {
75 sb.append(" ");
76 }
77 else {
78 first_append = false;
79 }
80 sb.append(token);
81 }
82 }
83 else {
84 boolean first_append = true;
85
86 for (int i=0; i<tokens.size(); i++) {
87 String token = tokens.get(i);
88
89 if (whitelist_bloomfilter.contains(token)) {
90 if (!first_append) {
91 sb.append(" ");
92 }
93 else {
94 first_append = false;
95 }
96 sb.append(token);
97 }
98 }
99
100 }
101 /*
102 Set<String> token_keys = ef_token_pos_count.keySet();
103 for (String token : token_keys) {
104 sb.append(token + " ");
105 }
106 */
107
108 return sb.toString();
109 }
110
111 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
112 WhitelistBloomFilter whitelist_bloomfilter)
113 {
114 JSONObject solr_update_json = null;
115
116 if (ef_page != null) {
117 JSONObject ef_body = ef_page.getJSONObject("body");
118 if (ef_body != null) {
119 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
120 if (ef_token_pos_count != null) {
121
122 JSONObject solr_add_json = new JSONObject();
123
124 String text = generateSolrText(ef_token_pos_count,whitelist_bloomfilter);
125
126 JSONObject solr_doc_json = new JSONObject();
127 solr_doc_json.put("id", page_id);
128 solr_doc_json.put("volumeid_s", volume_id);
129 if (!text.equals("")) {
130 solr_doc_json.put("eftext_txt", text);
131 }
132 else {
133 solr_doc_json.put("efnotext_b", true);
134 }
135 solr_add_json.put("commitWithin", 5000);
136 solr_add_json.put("doc", solr_doc_json);
137
138 solr_update_json = new JSONObject();
139 solr_update_json.put("add",solr_add_json);
140
141 }
142 else {
143 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
144 }
145 }
146 else {
147 System.err.println("Warning: empty body field for '" + page_id + "'");
148 }
149
150 }
151 else {
152 System.err.println("Warning: null page for '" + page_id + "'");
153 }
154
155
156 /*
157
158 /update/json/docs
159 */
160
161 // For Reference ...
162 // Example documentation on Solr JSON syntax:
163 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
164 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
165
166 /*
167 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
168 {
169 "add": {
170 "doc": {
171 "id": "DOC1",
172 "my_boosted_field": { use a map with boost/value for a boosted field
173 "boost": 2.3,
174 "value": "test"
175 },
176 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
177 }
178 },
179 "add": {
180 "commitWithin": 5000, commit this document within 5 seconds
181 "overwrite": false, don't check for existing documents with the same uniqueKey
182 "boost": 3.45, a document boost
183 "doc": {
184 "f1": "v1", Can use repeated keys for a multi-valued field
185 "f1": "v2"
186 }
187 },
188
189 "commit": {},
190 "optimize": { "waitSearcher":false },
191
192 "delete": { "id":"ID" }, delete by ID
193 "delete": { "query":"QUERY" } delete by query
194 }'
195 */
196
197 return solr_update_json;
198 }
199
200 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page)
201 {
202 ArrayList<String> word_list = new ArrayList<String>();
203
204 if (ef_page != null) {
205 JSONObject ef_body = ef_page.getJSONObject("body");
206 if (ef_body != null) {
207 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
208 if (ef_token_pos_count != null) {
209
210 Iterator<String> token_iter = ef_token_pos_count.keys();
211 while (token_iter.hasNext()) {
212 String token = token_iter.next();
213 word_list.add(token);
214 }
215 }
216 else {
217 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
218 }
219 }
220 else {
221 System.err.println("Warning: empty body field for '" + page_id + "'");
222 }
223
224 }
225 else {
226 System.err.println("Warning: null page for '" + page_id + "'");
227 }
228
229 return word_list;
230 }
231
232 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
233 {
234 try {
235 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
236 bw.write(solr_add_doc_json.toString());
237 bw.close();
238 } catch (IOException e) {
239 e.printStackTrace();
240 } catch (CompressorException e) {
241 e.printStackTrace();
242 }
243 }
244
245 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
246 {
247
248 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
249 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
250 //curl_popen += " --data-binary '";
251 //curl_popen += "'"
252
253
254 try {
255 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
256 httpcon.setDoOutput(true);
257 httpcon.setRequestProperty("Content-Type", "application/json");
258 httpcon.setRequestProperty("Accept", "application/json");
259 httpcon.setRequestMethod("POST");
260 httpcon.connect();
261
262 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
263 OutputStream os = httpcon.getOutputStream();
264 os.write(outputBytes);
265 os.close();
266
267
268 // Read response
269 StringBuilder sb = new StringBuilder();
270 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
271 String decodedString;
272 while ((decodedString = in.readLine()) != null) {
273 sb.append(decodedString);
274 }
275 in.close();
276
277 JSONObject solr_status_json = new JSONObject(sb.toString());
278 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
279 if (response_header_json != null) {
280 int status = response_header_json.getInt("status");
281 if (status != 0) {
282 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
283 System.err.println("Full response was: " + sb);
284 }
285 }
286 else {
287 System.err.println("Failed response to Solr POST: " + sb);
288 }
289
290
291
292 }
293 catch (Exception e) {
294 e.printStackTrace();
295 }
296
297 }
298}
Note: See TracBrowser for help on using the repository browser.