source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30974

Last change on this file since 30974 was 30974, checked in by davidb, 7 years ago

update/add/doc JSON structure needed

  • Property svn:executable set to *
File size: 9.0 KB
Line 
1package org.hathitrust;
2
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.json.JSONArray;
import org.json.JSONObject;
17
18/*
19class PagedJSON implements Function<String, Boolean> {
20
21 private static final long serialVersionUID = 1L;
22
23 public Boolean call(String s) { return s.contains("a"); }
24}
25 */
26
27
28class PagedJSON implements FlatMapFunction<String, String>
29{
30 private static final long serialVersionUID = 1L;
31
32 protected String _input_dir;
33 protected String _output_dir;
34 protected int _verbosity;
35
36 public PagedJSON(String input_dir, String output_dir, int verbosity)
37 {
38 _input_dir = input_dir;
39 _output_dir = output_dir;
40 _verbosity = verbosity;
41 }
42
43 protected JSONObject readJSONFile(String filename)
44 {
45 StringBuilder sb = new StringBuilder();
46
47 try {
48
49 String str;
50 BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
51 while ((str = br.readLine()) != null) {
52 sb.append(str);
53 }
54
55 br.close();
56 }
57 catch (Exception e) {
58 e.printStackTrace();
59 }
60
61 JSONObject json_obj = new JSONObject(sb.toString());
62
63
64 return json_obj;
65 }
66
67 protected String generateSolrText(JSONObject ef_token_pos_count)
68 {
69 StringBuilder sb = new StringBuilder();
70
71 Iterator<String> token_iter = ef_token_pos_count.keys();
72 while (token_iter.hasNext()) {
73 String token = token_iter.next();
74
75 sb.append(token);
76 if (token_iter.hasNext()) {
77 sb.append(" ");
78 }
79 }
80
81 /*
82 Set<String> token_keys = ef_token_pos_count.keySet();
83 for (String token : token_keys) {
84 sb.append(token + " ");
85 }
86*/
87
88 return sb.toString();
89 }
90
91 protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
92 {
93 JSONObject solr_update_json = null;
94
95 if (ef_page != null) {
96 JSONObject ef_body = ef_page.getJSONObject("body");
97 if (ef_body != null) {
98 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
99 if (ef_token_pos_count != null) {
100
101 JSONObject solr_add_json = new JSONObject();
102
103 String text = generateSolrText(ef_token_pos_count);
104
105 JSONObject solr_doc_json = new JSONObject();
106 solr_doc_json.put("id", page_id);
107 solr_doc_json.put("volumeid_s", volume_id);
108 solr_doc_json.put("_text_", text);
109
110 solr_add_json.put("commitWithin", 5000);
111 solr_add_json.put("doc", solr_doc_json);
112
113 solr_update_json = new JSONObject();
114 solr_update_json.put("add",solr_add_json);
115
116 }
117 else {
118 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
119 }
120 }
121 else {
122 System.err.println("Warning: empty body field for '" + page_id + "'");
123 }
124
125 }
126 else {
127 System.err.println("Warning: null page for '" + page_id + "'");
128 }
129
130
131 /*
132
133 /update/json/docs
134 */
135
136 // For Reference ...
137 // Example documentation on Solr JSON syntax:
138 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
139 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
140
141 /*
142 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
143 {
144 "add": {
145 "doc": {
146 "id": "DOC1",
147 "my_boosted_field": { use a map with boost/value for a boosted field
148 "boost": 2.3,
149 "value": "test"
150 },
151 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
152 }
153 },
154 "add": {
155 "commitWithin": 5000, commit this document within 5 seconds
156 "overwrite": false, don't check for existing documents with the same uniqueKey
157 "boost": 3.45, a document boost
158 "doc": {
159 "f1": "v1", Can use repeated keys for a multi-valued field
160 "f1": "v2"
161 }
162 },
163
164 "commit": {},
165 "optimize": { "waitSearcher":false },
166
167 "delete": { "id":"ID" }, delete by ID
168 "delete": { "query":"QUERY" } delete by query
169 }'
170 */
171
172 //return solr_doc_json;
173 return solr_update_json;
174 }
175
176 protected void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
177 {
178 try {
179 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_file_json_bz2);
180 bw.write(solr_add_doc_json.toString());
181 bw.close();
182 } catch (IOException e) {
183 e.printStackTrace();
184 } catch (CompressorException e) {
185 e.printStackTrace();
186 }
187 }
188
189 protected void postSolrDoc(JSONObject solr_add_doc_json)
190 {
191 // "http://10.11.0.53:8983/solr/"
192 String post_url = "http://10.11.0.53:8983/solr/htrc-pd-ef/update";
193
194 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
195 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
196 //curl_popen += " --data-binary '";
197 //curl_popen += "'"
198
199
200 try {
201 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
202 httpcon.setDoOutput(true);
203 httpcon.setRequestProperty("Content-Type", "application/json");
204 httpcon.setRequestProperty("Accept", "application/json");
205 httpcon.setRequestMethod("POST");
206 httpcon.connect();
207
208 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
209 OutputStream os = httpcon.getOutputStream();
210 os.write(outputBytes);
211 os.close();
212 }
213 catch (Exception e) {
214 e.printStackTrace();
215 }
216
217 }
218 public Iterator<String> call(String json_file_in)
219 {
220 JSONObject extracted_feature_record = readJSONFile(json_file_in);
221
222 // Check output directory for volume exists, and create it if not
223
224
225 String volume_id = extracted_feature_record.getString("id");
226
227 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
228 //String title= ef_metadata.getString("title");
229
230 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
231
232
233 int ef_page_count = ef_features.getInt("pageCount");
234
235 if (_verbosity >= 1) {
236 System.out.println("Processing: " + json_file_in);
237 System.out.println(" pageCount = " + ef_page_count);
238 }
239
240 JSONArray ef_pages = ef_features.getJSONArray("pages");
241 int ef_num_pages = ef_pages.length();
242
243 // Make directory for page-level JSON output
244 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
245 String page_json_dir = json_dir + "/pages";
246 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
247
248 ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
249 for (int i = 0; i < ef_page_count; i++) {
250 String formatted_i = String.format("page-%06d", i);
251 String page_id = volume_id + "." + formatted_i;
252
253 if (_verbosity >= 2) {
254 System.out.println(" Page: " + page_id);
255 }
256
257 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
258 ids.add(output_json_bz2);
259
260 if (i==0) {
261 System.out.println("Sample output JSON page file: " + output_json_bz2);
262 }
263
264 JSONObject ef_page = ef_pages.getJSONObject(i);
265
266 if (ef_page != null) {
267 // Convert to Solr add form
268 JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page);
269
270 if (i==20) {
271 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
272 System.out.println("==================");
273 //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
274 }
275
276 // create JSON obj of just the page (for now), and write it out
277 // write out the JSONOBject as a bz2 compressed file
278 /*
279 try {
280 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
281 bw.write(ef_page.toString());
282 bw.close();
283 } catch (IOException e) {
284 e.printStackTrace();
285 } catch (CompressorException e) {
286 e.printStackTrace();
287 }
288 */
289
290 saveSolrDoc(solr_add_doc_json,output_json_bz2);
291 //postSolrDoc(solr_add_doc_json);
292
293 }
294 else {
295 System.err.println("Skipping: " + page_id);
296 }
297
298 }
299
300 /*
301 for (int i = 0; i < ef_num_pages; i++)
302 {
303 //String post_id = ef_pages.getJSONObject(i).getString("post_id");
304 //......
305 }
306 */
307 //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
308/*
309 JSONArray arr = obj.getJSONArray("posts");
310 for (int i = 0; i < arr.length(); i++)
311 {
312 String post_id = arr.getJSONObject(i).getString("post_id");
313 ......
314 }
315*/
316
317
318 ids.add(volume_id);
319
320 return ids.iterator();
321 }
322}
323
Note: See TracBrowser for help on using the repository browser.