source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@30979

Last change on this file since 30979 was 30979, checked in by davidb, 7 years ago

_solr_url needs to be stored in class!

  • Property svn:executable set to *
File size: 9.2 KB
package org.hathitrust;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.json.JSONArray;
import org.json.JSONObject;

/*
class PagedJSON implements Function<String, Boolean> {

    private static final long serialVersionUID = 1L;

    public Boolean call(String s) { return s.contains("a"); }
}
*/

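/**
 * Spark FlatMapFunction that takes the filename of one HathiTrust Extracted
 * Features volume (bzip2-compressed JSON), builds a Solr "add" document for
 * each of its pages, and, depending on whether _solr_url and/or _output_dir
 * are set, POSTs each document to Solr and/or saves it as a compressed JSON
 * file. Emits the per-page output filenames plus the volume id.
 */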
class PagedJSON implements FlatMapFunction<String, String>
{
    private static final long serialVersionUID = 1L;

    protected String _input_dir;
    protected String _solr_url;
    protected String _output_dir;
    protected int    _verbosity;

    public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity)
    {
        _input_dir  = input_dir;
        _solr_url   = solr_url;
        _output_dir = output_dir;
        _verbosity  = verbosity;
    }

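    /**
     * Reads a (bzip2-compressed) JSON file below _input_dir and parses it into
     * a JSONObject. Read errors are only printed; a failed read can leave the
     * buffer empty, in which case the JSONObject constructor below throws.
     */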
    protected JSONObject readJSONFile(String filename)
    {
        StringBuilder sb = new StringBuilder();

        try {
            String str;
            BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
            while ((str = br.readLine()) != null) {
                sb.append(str);
            }

            br.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }

        JSONObject json_obj = new JSONObject(sb.toString());

        return json_obj;
    }

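    /**
     * Flattens a page's tokenPosCount object into a single space-separated
     * string of its tokens, for use as the Solr _text_ field. Counts and
     * part-of-speech information are discarded; only the page vocabulary is kept.
     */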
    protected String generateSolrText(JSONObject ef_token_pos_count)
    {
        StringBuilder sb = new StringBuilder();

        Iterator<String> token_iter = ef_token_pos_count.keys();
        while (token_iter.hasNext()) {
            String token = token_iter.next();

            sb.append(token);
            if (token_iter.hasNext()) {
                sb.append(" ");
            }
        }

        /*
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */

        return sb.toString();
    }

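    /**
     * Builds the Solr JSON update command for a single page, of the form
     * { "add": { "commitWithin": 5000, "doc": { "id", "volumeid_s", "_text_" } } }.
     * Returns null (after printing a warning) if the page, its body, or its
     * tokenPosCount field is missing.
     */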
    protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    String text = generateSolrText(ef_token_pos_count);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    solr_doc_json.put("_text_", text);

                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        /*
          /update/json/docs
        */

        // For Reference ...
        // Example documentation on Solr JSON syntax:
        //   https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        //     #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
          "add": {
            "doc": {
              "id": "DOC1",
              "my_boosted_field": {          use a map with boost/value for a boosted field
                "boost": 2.3,
                "value": "test"
              },
              "my_multivalued_field": [ "aaa", "bbb" ]   Can use an array for a multi-valued field
            }
          },
          "add": {
            "commitWithin": 5000,            commit this document within 5 seconds
            "overwrite": false,              don't check for existing documents with the same uniqueKey
            "boost": 3.45,                   a document boost
            "doc": {
              "f1": "v1",                    Can use repeated keys for a multi-valued field
              "f1": "v2"
            }
          },

          "commit": {},
          "optimize": { "waitSearcher":false },

          "delete": { "id":"ID" },           delete by ID
          "delete": { "query":"QUERY" }      delete by query
        }'
        */

        return solr_update_json;
    }

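    /**
     * Writes the Solr add-document JSON out as a compressed file (e.g. .json.bz2)
     * below _output_dir.
     */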
    protected void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

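    /**
     * POSTs the Solr add-document JSON to the update handler at _solr_url
     * using a plain HttpURLConnection.
     */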
    protected void postSolrDoc(JSONObject solr_add_doc_json)
    {
        // "http://10.11.0.53:8983/solr/"
        //String post_url = "http://10.11.0.53:8983/solr/htrc-pd-ef/update";
        String post_url = _solr_url;

        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response so the request is fully dispatched and an
            // HTTP error status is reported rather than silently dropped
            int status = httpcon.getResponseCode();
            if (status < 200 || status >= 300) {
                System.err.println("Warning: Solr update POST returned HTTP status " + status);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

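    /**
     * Spark entry point. For one volume's Extracted Features JSON file this
     * creates a per-volume "pages" output directory, then for every page builds
     * the Solr add document and posts and/or saves it. Returns an iterator over
     * the per-page output filenames, with the volume id appended at the end.
     */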
    public Iterator<String> call(String json_file_in)
    {
        JSONObject extracted_feature_record = readJSONFile(json_file_in);

        String volume_id = extracted_feature_record.getString("id");

        //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
        //String title = ef_metadata.getString("title");

        JSONObject ef_features = extracted_feature_record.getJSONObject("features");

        int ef_page_count = ef_features.getInt("pageCount");

        if (_verbosity >= 1) {
            System.out.println("Processing: " + json_file_in);
            System.out.println("  pageCount = " + ef_page_count);
        }

        JSONArray ef_pages = ef_features.getJSONArray("pages");
        int ef_num_pages = ef_pages.length();

        // Make directory for page-level JSON output
        String json_dir = ClusterFileIO.removeSuffix(json_file_in, ".json.bz2");
        String page_json_dir = json_dir + "/pages";
        ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);

        ArrayList<String> ids = new ArrayList<String>(ef_num_pages);

        // Assumes the record's pageCount matches the length of its "pages" array
        for (int i = 0; i < ef_page_count; i++) {
            String formatted_i = String.format("page-%06d", i);
            String page_id = volume_id + "." + formatted_i;

            if (_verbosity >= 2) {
                System.out.println("  Page: " + page_id);
            }

            String output_json_bz2 = page_json_dir + "/" + formatted_i + ".json.bz2";
            ids.add(output_json_bz2);

            if (i == 0) {
                System.out.println("Sample output JSON page file: " + output_json_bz2);
            }

            JSONObject ef_page = ef_pages.getJSONObject(i);

            if (ef_page != null) {
                // Convert to Solr add form
                JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page);

                // generateSolrDocJSON() returns null if the page has no usable
                // body/tokenPosCount, so guard against that before posting/saving
                if (solr_add_doc_json != null) {

                    if (i == 20) {
                        System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                        System.out.println("==================");
                        //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
                    }

                    // create JSON obj of just the page (for now), and write it out
                    // write out the JSONObject as a bz2 compressed file
                    /*
                    try {
                        BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
                        bw.write(ef_page.toString());
                        bw.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    } catch (CompressorException e) {
                        e.printStackTrace();
                    }
                    */

                    if (_solr_url != null) {
                        if (i == 20) {
                            System.out.println("Posting to: " + _solr_url);
                        }
                        postSolrDoc(solr_add_doc_json);
                    }

                    if (_output_dir != null) {
                        if (i == 20) {
                            System.out.println("Saving to: " + _output_dir);
                        }
                        saveSolrDoc(solr_add_doc_json, output_json_bz2);
                    }
                }
            }
            else {
                System.err.println("Skipping: " + page_id);
            }
        }

        /*
        for (int i = 0; i < ef_num_pages; i++)
        {
            //String post_id = ef_pages.getJSONObject(i).getString("post_id");
            //......
        }
        */
        //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
        /*
        JSONArray arr = obj.getJSONArray("posts");
        for (int i = 0; i < arr.length(); i++)
        {
            String post_id = arr.getJSONObject(i).getString("post_id");
            ......
        }
        */

        ids.add(volume_id);

        return ids.iterator();
    }
}
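For context, a minimal sketch of how PagedJSON might be driven from a Spark job. This is not part of the file above: the driver class name, input/output paths, and Solr URL are illustrative only, and it assumes the Spark 2.x Java API (where FlatMapFunction.call returns an Iterator, as here) plus a text file listing one per-volume Extracted Features JSON filename per line.

// Hypothetical driver sketch -- not part of PagedJSON.java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PagedJSONDriverSketch
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("HathiTrust EF to Solr (sketch)");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // One Extracted Features JSON filename (relative to the input dir) per line
        JavaRDD<String> json_file_list = jsc.textFile("ef-json-filelist.txt");

        PagedJSON paged_json = new PagedJSON("ef-json",                                        // _input_dir
                                             "http://localhost:8983/solr/htrc-pd-ef/update",   // _solr_url
                                             "ef-solr-out",                                    // _output_dir
                                             1);                                               // _verbosity

        // Each input filename fans out to its per-page output filenames (plus the volume id)
        JavaRDD<String> ids = json_file_list.flatMap(paged_json);
        System.out.println("Generated " + ids.count() + " page/volume ids");

        jsc.close();
    }
}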