source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31176

Last change on this file since 31176 was 31176, checked in by davidb, 7 years ago

Support added for producing whitelist word count

  • Property svn:executable set to *
File size: 6.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.ArrayList;
11import java.util.Iterator;
12
13import org.apache.commons.compress.compressors.CompressorException;
14import org.json.JSONObject;
15
16public class SolrDocJSON {
17
18 protected static String generateSolrText(JSONObject ef_token_pos_count)
19 {
20 StringBuilder sb = new StringBuilder();
21
22 Iterator<String> token_iter = ef_token_pos_count.keys();
23 while (token_iter.hasNext()) {
24 String token = token_iter.next();
25
26 sb.append(token);
27 if (token_iter.hasNext()) {
28 sb.append(" ");
29 }
30 }
31
32 /*
33 Set<String> token_keys = ef_token_pos_count.keySet();
34 for (String token : token_keys) {
35 sb.append(token + " ");
36 }
37 */
38
39 return sb.toString();
40 }
41
42 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
43 {
44 JSONObject solr_update_json = null;
45
46 if (ef_page != null) {
47 JSONObject ef_body = ef_page.getJSONObject("body");
48 if (ef_body != null) {
49 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
50 if (ef_token_pos_count != null) {
51
52 JSONObject solr_add_json = new JSONObject();
53
54 String text = generateSolrText(ef_token_pos_count);
55
56 JSONObject solr_doc_json = new JSONObject();
57 solr_doc_json.put("id", page_id);
58 solr_doc_json.put("volumeid_s", volume_id);
59 solr_doc_json.put("eftext_txt", text);
60
61 solr_add_json.put("commitWithin", 5000);
62 solr_add_json.put("doc", solr_doc_json);
63
64 solr_update_json = new JSONObject();
65 solr_update_json.put("add",solr_add_json);
66
67 }
68 else {
69 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
70 }
71 }
72 else {
73 System.err.println("Warning: empty body field for '" + page_id + "'");
74 }
75
76 }
77 else {
78 System.err.println("Warning: null page for '" + page_id + "'");
79 }
80
81
82 /*
83
84 /update/json/docs
85 */
86
87 // For Reference ...
88 // Example documentation on Solr JSON syntax:
89 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
90 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
91
92 /*
93 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
94 {
95 "add": {
96 "doc": {
97 "id": "DOC1",
98 "my_boosted_field": { use a map with boost/value for a boosted field
99 "boost": 2.3,
100 "value": "test"
101 },
102 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
103 }
104 },
105 "add": {
106 "commitWithin": 5000, commit this document within 5 seconds
107 "overwrite": false, don't check for existing documents with the same uniqueKey
108 "boost": 3.45, a document boost
109 "doc": {
110 "f1": "v1", Can use repeated keys for a multi-valued field
111 "f1": "v2"
112 }
113 },
114
115 "commit": {},
116 "optimize": { "waitSearcher":false },
117
118 "delete": { "id":"ID" }, delete by ID
119 "delete": { "query":"QUERY" } delete by query
120 }'
121 */
122
123 return solr_update_json;
124 }
125
126 protected static ArrayList<String> generateTokenPostCountText(String volume_id, String page_id, JSONObject ef_page)
127 {
128 ArrayList<String> word_list = new ArrayList<String>();
129
130 if (ef_page != null) {
131 JSONObject ef_body = ef_page.getJSONObject("body");
132 if (ef_body != null) {
133 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
134 if (ef_token_pos_count != null) {
135
136 Iterator<String> token_iter = ef_token_pos_count.keys();
137 while (token_iter.hasNext()) {
138 String token = token_iter.next();
139 word_list.add(token);
140 }
141 }
142 else {
143 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
144 }
145 }
146 else {
147 System.err.println("Warning: empty body field for '" + page_id + "'");
148 }
149
150 }
151 else {
152 System.err.println("Warning: null page for '" + page_id + "'");
153 }
154
155 return word_list;
156 }
157
158 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
159 {
160 try {
161 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
162 bw.write(solr_add_doc_json.toString());
163 bw.close();
164 } catch (IOException e) {
165 e.printStackTrace();
166 } catch (CompressorException e) {
167 e.printStackTrace();
168 }
169 }
170
171 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
172 {
173
174 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
175 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
176 //curl_popen += " --data-binary '";
177 //curl_popen += "'"
178
179
180 try {
181 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
182 httpcon.setDoOutput(true);
183 httpcon.setRequestProperty("Content-Type", "application/json");
184 httpcon.setRequestProperty("Accept", "application/json");
185 httpcon.setRequestMethod("POST");
186 httpcon.connect();
187
188 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
189 OutputStream os = httpcon.getOutputStream();
190 os.write(outputBytes);
191 os.close();
192
193
194 // Read response
195 StringBuilder sb = new StringBuilder();
196 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
197 String decodedString;
198 while ((decodedString = in.readLine()) != null) {
199 sb.append(decodedString);
200 }
201 in.close();
202
203 JSONObject solr_status_json = new JSONObject(sb.toString());
204 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
205 if (response_header_json != null) {
206 int status = response_header_json.getInt("status");
207 if (status != 0) {
208 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
209 System.err.println("Full response was: " + sb);
210 }
211 }
212 else {
213 System.err.println("Failed response to Solr POST: " + sb);
214 }
215
216
217
218 }
219 catch (Exception e) {
220 e.printStackTrace();
221 }
222
223 }
224}
Note: See TracBrowser for help on using the repository browser.