source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/JSONSolrTransform.java@ 31001

Last change on this file since 31001 was 31001, checked in by davidb, 7 years ago

Code to work per-volume and per-page

  • Property svn:executable set to *
File size: 5.5 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.Iterator;
11
12import org.apache.commons.compress.compressors.CompressorException;
13import org.json.JSONObject;
14
15public class JSONSolrTransform {
16
17 protected static String generateSolrText(JSONObject ef_token_pos_count)
18 {
19 StringBuilder sb = new StringBuilder();
20
21 Iterator<String> token_iter = ef_token_pos_count.keys();
22 while (token_iter.hasNext()) {
23 String token = token_iter.next();
24
25 sb.append(token);
26 if (token_iter.hasNext()) {
27 sb.append(" ");
28 }
29 }
30
31 /*
32 Set<String> token_keys = ef_token_pos_count.keySet();
33 for (String token : token_keys) {
34 sb.append(token + " ");
35 }
36 */
37
38 return sb.toString();
39 }
40
41 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
42 {
43 JSONObject solr_update_json = null;
44
45 if (ef_page != null) {
46 JSONObject ef_body = ef_page.getJSONObject("body");
47 if (ef_body != null) {
48 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
49 if (ef_token_pos_count != null) {
50
51 JSONObject solr_add_json = new JSONObject();
52
53 String text = generateSolrText(ef_token_pos_count);
54
55 JSONObject solr_doc_json = new JSONObject();
56 solr_doc_json.put("id", page_id);
57 solr_doc_json.put("volumeid_s", volume_id);
58 solr_doc_json.put("eftext_txt", text);
59
60 solr_add_json.put("commitWithin", 5000);
61 solr_add_json.put("doc", solr_doc_json);
62
63 solr_update_json = new JSONObject();
64 solr_update_json.put("add",solr_add_json);
65
66 }
67 else {
68 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
69 }
70 }
71 else {
72 System.err.println("Warning: empty body field for '" + page_id + "'");
73 }
74
75 }
76 else {
77 System.err.println("Warning: null page for '" + page_id + "'");
78 }
79
80
81 /*
82
83 /update/json/docs
84 */
85
86 // For Reference ...
87 // Example documentation on Solr JSON syntax:
88 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
89 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
90
91 /*
92 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
93 {
94 "add": {
95 "doc": {
96 "id": "DOC1",
97 "my_boosted_field": { use a map with boost/value for a boosted field
98 "boost": 2.3,
99 "value": "test"
100 },
101 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
102 }
103 },
104 "add": {
105 "commitWithin": 5000, commit this document within 5 seconds
106 "overwrite": false, don't check for existing documents with the same uniqueKey
107 "boost": 3.45, a document boost
108 "doc": {
109 "f1": "v1", Can use repeated keys for a multi-valued field
110 "f1": "v2"
111 }
112 },
113
114 "commit": {},
115 "optimize": { "waitSearcher":false },
116
117 "delete": { "id":"ID" }, delete by ID
118 "delete": { "query":"QUERY" } delete by query
119 }'
120 */
121
122 return solr_update_json;
123 }
124
125 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
126 {
127 try {
128 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
129 bw.write(solr_add_doc_json.toString());
130 bw.close();
131 } catch (IOException e) {
132 e.printStackTrace();
133 } catch (CompressorException e) {
134 e.printStackTrace();
135 }
136 }
137
138 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
139 {
140
141 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
142 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
143 //curl_popen += " --data-binary '";
144 //curl_popen += "'"
145
146
147 try {
148 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
149 httpcon.setDoOutput(true);
150 httpcon.setRequestProperty("Content-Type", "application/json");
151 httpcon.setRequestProperty("Accept", "application/json");
152 httpcon.setRequestMethod("POST");
153 httpcon.connect();
154
155 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
156 OutputStream os = httpcon.getOutputStream();
157 os.write(outputBytes);
158 os.close();
159
160
161 // Read response
162 StringBuilder sb = new StringBuilder();
163 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
164 String decodedString;
165 while ((decodedString = in.readLine()) != null) {
166 sb.append(decodedString);
167 }
168 in.close();
169
170 JSONObject solr_status_json = new JSONObject(sb.toString());
171 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
172 if (response_header_json != null) {
173 int status = response_header_json.getInt("status");
174 if (status != 0) {
175 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
176 System.err.println("Full response was: " + sb);
177 }
178 }
179 else {
180 System.err.println("Failed response to Solr POST: " + sb);
181 }
182
183
184
185 }
186 catch (Exception e) {
187 e.printStackTrace();
188 }
189
190 }
191}
Note: See TracBrowser for help on using the repository browser.