source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30980

Last change on this file since 30980 was 30980, checked in by davidb, 8 years ago

Code added to read response

  • Property svn:executable set to *
File size: 9.6 KB
Line 
1package org.hathitrust;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.ArrayList;
11import java.util.Iterator;
12import java.util.Set;
13
14import org.apache.commons.compress.compressors.CompressorException;
15import org.apache.spark.api.java.function.FlatMapFunction;
16import org.json.JSONArray;
17import org.json.JSONObject;
18
19/*
20class PagedJSON implements Function<String, Boolean> {
21
22 private static final long serialVersionUID = 1L;
23
24 public Boolean call(String s) { return s.contains("a"); }
25}
26 */
27
28
29class PagedJSON implements FlatMapFunction<String, String>
30{
31 private static final long serialVersionUID = 1L;
32
33 protected String _input_dir;
34 protected String _solr_url;
35 protected String _output_dir;
36 protected int _verbosity;
37
38 public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity)
39 {
40 _input_dir = input_dir;
41 _solr_url = solr_url;
42 _output_dir = output_dir;
43 _verbosity = verbosity;
44 }
45
46 protected JSONObject readJSONFile(String filename)
47 {
48 StringBuilder sb = new StringBuilder();
49
50 try {
51
52 String str;
53 BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
54 while ((str = br.readLine()) != null) {
55 sb.append(str);
56 }
57
58 br.close();
59 }
60 catch (Exception e) {
61 e.printStackTrace();
62 }
63
64 JSONObject json_obj = new JSONObject(sb.toString());
65
66
67 return json_obj;
68 }
69
70 protected String generateSolrText(JSONObject ef_token_pos_count)
71 {
72 StringBuilder sb = new StringBuilder();
73
74 Iterator<String> token_iter = ef_token_pos_count.keys();
75 while (token_iter.hasNext()) {
76 String token = token_iter.next();
77
78 sb.append(token);
79 if (token_iter.hasNext()) {
80 sb.append(" ");
81 }
82 }
83
84 /*
85 Set<String> token_keys = ef_token_pos_count.keySet();
86 for (String token : token_keys) {
87 sb.append(token + " ");
88 }
89*/
90
91 return sb.toString();
92 }
93
94 protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
95 {
96 JSONObject solr_update_json = null;
97
98 if (ef_page != null) {
99 JSONObject ef_body = ef_page.getJSONObject("body");
100 if (ef_body != null) {
101 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
102 if (ef_token_pos_count != null) {
103
104 JSONObject solr_add_json = new JSONObject();
105
106 String text = generateSolrText(ef_token_pos_count);
107
108 JSONObject solr_doc_json = new JSONObject();
109 solr_doc_json.put("id", page_id);
110 solr_doc_json.put("volumeid_s", volume_id);
111 solr_doc_json.put("_text_", text);
112
113 solr_add_json.put("commitWithin", 5000);
114 solr_add_json.put("doc", solr_doc_json);
115
116 solr_update_json = new JSONObject();
117 solr_update_json.put("add",solr_add_json);
118
119 }
120 else {
121 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
122 }
123 }
124 else {
125 System.err.println("Warning: empty body field for '" + page_id + "'");
126 }
127
128 }
129 else {
130 System.err.println("Warning: null page for '" + page_id + "'");
131 }
132
133
134 /*
135
136 /update/json/docs
137 */
138
139 // For Reference ...
140 // Example documentation on Solr JSON syntax:
141 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
142 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
143
144 /*
145 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
146 {
147 "add": {
148 "doc": {
149 "id": "DOC1",
150 "my_boosted_field": { use a map with boost/value for a boosted field
151 "boost": 2.3,
152 "value": "test"
153 },
154 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
155 }
156 },
157 "add": {
158 "commitWithin": 5000, commit this document within 5 seconds
159 "overwrite": false, don't check for existing documents with the same uniqueKey
160 "boost": 3.45, a document boost
161 "doc": {
162 "f1": "v1", Can use repeated keys for a multi-valued field
163 "f1": "v2"
164 }
165 },
166
167 "commit": {},
168 "optimize": { "waitSearcher":false },
169
170 "delete": { "id":"ID" }, delete by ID
171 "delete": { "query":"QUERY" } delete by query
172 }'
173 */
174
175 return solr_update_json;
176 }
177
178 protected void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
179 {
180 try {
181 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_file_json_bz2);
182 bw.write(solr_add_doc_json.toString());
183 bw.close();
184 } catch (IOException e) {
185 e.printStackTrace();
186 } catch (CompressorException e) {
187 e.printStackTrace();
188 }
189 }
190
191 protected void postSolrDoc(JSONObject solr_add_doc_json)
192 {
193 String post_url = _solr_url;
194
195 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
196 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
197 //curl_popen += " --data-binary '";
198 //curl_popen += "'"
199
200
201 try {
202 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
203 httpcon.setDoOutput(true);
204 httpcon.setRequestProperty("Content-Type", "application/json");
205 httpcon.setRequestProperty("Accept", "application/json");
206 httpcon.setRequestMethod("POST");
207 httpcon.connect();
208
209 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
210 OutputStream os = httpcon.getOutputStream();
211 os.write(outputBytes);
212 os.close();
213
214
215 // Read response
216 BufferedReader in = new BufferedReader(new InputStreamReader(
217 httpcon.getInputStream()));
218 String decodedString;
219 while ((decodedString = in.readLine()) != null) {
220 System.out.println(decodedString);
221 }
222 in.close();
223
224 }
225 catch (Exception e) {
226 e.printStackTrace();
227 }
228
229 }
230 public Iterator<String> call(String json_file_in)
231 {
232 JSONObject extracted_feature_record = readJSONFile(json_file_in);
233
234 String volume_id = extracted_feature_record.getString("id");
235
236 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
237 //String title= ef_metadata.getString("title");
238
239 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
240
241
242 int ef_page_count = ef_features.getInt("pageCount");
243
244 if (_verbosity >= 1) {
245 System.out.println("Processing: " + json_file_in);
246 System.out.println(" pageCount = " + ef_page_count);
247 }
248
249 JSONArray ef_pages = ef_features.getJSONArray("pages");
250 int ef_num_pages = ef_pages.length();
251
252 // Make directory for page-level JSON output
253 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
254 String page_json_dir = json_dir + "/pages";
255 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
256
257 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
258 for (int i = 0; i < ef_page_count; i++) {
259 String formatted_i = String.format("page-%06d", i);
260 String page_id = volume_id + "." + formatted_i;
261
262 if (_verbosity >= 2) {
263 System.out.println(" Page: " + page_id);
264 }
265
266 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
267 ids.add(output_json_bz2);
268
269 if (i==0) {
270 System.out.println("Sample output JSON page file: " + output_json_bz2);
271 }
272
273 JSONObject ef_page = ef_pages.getJSONObject(i);
274
275 if (ef_page != null) {
276 // Convert to Solr add form
277 JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page);
278
279 if (i==20) {
280 System.out.println("==================");
281 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
282 System.out.println("==================");
283 //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
284 }
285
286 // create JSON obj of just the page (for now), and write it out
287 // write out the JSONOBject as a bz2 compressed file
288 /*
289 try {
290 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
291 bw.write(ef_page.toString());
292 bw.close();
293 } catch (IOException e) {
294 e.printStackTrace();
295 } catch (CompressorException e) {
296 e.printStackTrace();
297 }
298 */
299
300 if (_solr_url != null) {
301 if (i==20) {
302 System.out.println("==================");
303 System.out.println("Posting to: " + _solr_url);
304 System.out.println("==================");
305 }
306 postSolrDoc(solr_add_doc_json);
307 }
308
309 if (_output_dir != null) {
310 if (i==20) {
311 System.out.println("==================");
312 System.out.println("Saving to: " + _output_dir);
313 System.out.println("==================");
314 }
315 saveSolrDoc(solr_add_doc_json,output_json_bz2);
316 }
317 }
318 else {
319 System.err.println("Skipping: " + page_id);
320 }
321
322 }
323
324 /*
325 for (int i = 0; i < ef_num_pages; i++)
326 {
327 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
328 //......
329 }
330 */
331 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
332/*
333 JSONArray arr = obj.getJSONArray("posts");
334 for (int i = 0; i < arr.length(); i++)
335 {
336 String post_id = arr.getJSONObject(i).getString("post_id");
337 ......
338 }
339*/
340
341
342 ids.add(volume_id);
343
344 return ids.iterator();
345 }
346}
347
Note: See TracBrowser for help on using the repository browser.