source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31220

Last change on this file since 31220 was 31220, checked in by davidb, 7 years ago

Use of whitelist Bloom filter added to words going into Solr index

  • Property svn:executable set to *
File size: 7.0 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.ArrayList;
11import java.util.Iterator;
12
13import org.apache.commons.compress.compressors.CompressorException;
14import org.json.JSONObject;
15
16public class SolrDocJSON {
17
18 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter)
19 {
20 StringBuilder sb = new StringBuilder();
21
22 Iterator<String> token_iter = ef_token_pos_count.keys();
23
24 if (whitelist_bloomfilter == null) {
25
26 while (token_iter.hasNext()) {
27 String token = token_iter.next();
28 sb.append(token);
29 if (token_iter.hasNext()) {
30 sb.append(" ");
31 }
32 }
33 }
34 else {
35 while (token_iter.hasNext()) {
36 String token = token_iter.next();
37 if (whitelist_bloomfilter.contains(token)) {
38 sb.append(token);
39 if (token_iter.hasNext()) {
40 sb.append(" ");
41 }
42 }
43 }
44
45 }
46 /*
47 Set<String> token_keys = ef_token_pos_count.keySet();
48 for (String token : token_keys) {
49 sb.append(token + " ");
50 }
51 */
52
53 return sb.toString();
54 }
55
56 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
57 WhitelistBloomFilter whitelist_bloomfilter)
58 {
59 JSONObject solr_update_json = null;
60
61 if (ef_page != null) {
62 JSONObject ef_body = ef_page.getJSONObject("body");
63 if (ef_body != null) {
64 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
65 if (ef_token_pos_count != null) {
66
67 JSONObject solr_add_json = new JSONObject();
68
69 String text = generateSolrText(ef_token_pos_count,whitelist_bloomfilter);
70
71 JSONObject solr_doc_json = new JSONObject();
72 solr_doc_json.put("id", page_id);
73 solr_doc_json.put("volumeid_s", volume_id);
74 if (!text.equals("")) {
75 solr_doc_json.put("eftext_txt", text);
76 }
77 else {
78 solr_doc_json.put("efnotext_b", true);
79 }
80 solr_add_json.put("commitWithin", 5000);
81 solr_add_json.put("doc", solr_doc_json);
82
83 solr_update_json = new JSONObject();
84 solr_update_json.put("add",solr_add_json);
85
86 }
87 else {
88 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
89 }
90 }
91 else {
92 System.err.println("Warning: empty body field for '" + page_id + "'");
93 }
94
95 }
96 else {
97 System.err.println("Warning: null page for '" + page_id + "'");
98 }
99
100
101 /*
102
103 /update/json/docs
104 */
105
106 // For Reference ...
107 // Example documentation on Solr JSON syntax:
108 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
109 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
110
111 /*
112 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
113 {
114 "add": {
115 "doc": {
116 "id": "DOC1",
117 "my_boosted_field": { use a map with boost/value for a boosted field
118 "boost": 2.3,
119 "value": "test"
120 },
121 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
122 }
123 },
124 "add": {
125 "commitWithin": 5000, commit this document within 5 seconds
126 "overwrite": false, don't check for existing documents with the same uniqueKey
127 "boost": 3.45, a document boost
128 "doc": {
129 "f1": "v1", Can use repeated keys for a multi-valued field
130 "f1": "v2"
131 }
132 },
133
134 "commit": {},
135 "optimize": { "waitSearcher":false },
136
137 "delete": { "id":"ID" }, delete by ID
138 "delete": { "query":"QUERY" } delete by query
139 }'
140 */
141
142 return solr_update_json;
143 }
144
145 protected static ArrayList<String> generateTokenPostCountText(String volume_id, String page_id, JSONObject ef_page)
146 {
147 ArrayList<String> word_list = new ArrayList<String>();
148
149 if (ef_page != null) {
150 JSONObject ef_body = ef_page.getJSONObject("body");
151 if (ef_body != null) {
152 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
153 if (ef_token_pos_count != null) {
154
155 Iterator<String> token_iter = ef_token_pos_count.keys();
156 while (token_iter.hasNext()) {
157 String token = token_iter.next();
158 word_list.add(token);
159 }
160 }
161 else {
162 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
163 }
164 }
165 else {
166 System.err.println("Warning: empty body field for '" + page_id + "'");
167 }
168
169 }
170 else {
171 System.err.println("Warning: null page for '" + page_id + "'");
172 }
173
174 return word_list;
175 }
176
177 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
178 {
179 try {
180 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
181 bw.write(solr_add_doc_json.toString());
182 bw.close();
183 } catch (IOException e) {
184 e.printStackTrace();
185 } catch (CompressorException e) {
186 e.printStackTrace();
187 }
188 }
189
190 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
191 {
192
193 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
194 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
195 //curl_popen += " --data-binary '";
196 //curl_popen += "'"
197
198
199 try {
200 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
201 httpcon.setDoOutput(true);
202 httpcon.setRequestProperty("Content-Type", "application/json");
203 httpcon.setRequestProperty("Accept", "application/json");
204 httpcon.setRequestMethod("POST");
205 httpcon.connect();
206
207 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
208 OutputStream os = httpcon.getOutputStream();
209 os.write(outputBytes);
210 os.close();
211
212
213 // Read response
214 StringBuilder sb = new StringBuilder();
215 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
216 String decodedString;
217 while ((decodedString = in.readLine()) != null) {
218 sb.append(decodedString);
219 }
220 in.close();
221
222 JSONObject solr_status_json = new JSONObject(sb.toString());
223 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
224 if (response_header_json != null) {
225 int status = response_header_json.getInt("status");
226 if (status != 0) {
227 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
228 System.err.println("Full response was: " + sb);
229 }
230 }
231 else {
232 System.err.println("Failed response to Solr POST: " + sb);
233 }
234
235
236
237 }
238 catch (Exception e) {
239 e.printStackTrace();
240 }
241
242 }
243}
Note: See TracBrowser for help on using the repository browser.