source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31254

Last change on this file since 31254 was 31254, checked in by davidb, 7 years ago

Experimenting with Lucene lowercase filter

  • Property svn:executable set to *
File size: 8.8 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import org.apache.commons.compress.compressors.CompressorException;
15import org.json.JSONObject;
16import org.apache.lucene.analysis.TokenStream;
17import org.apache.lucene.analysis.Tokenizer;
18import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
19import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
20import org.apache.lucene.analysis.core.LowerCaseFilter;
21
22public class SolrDocJSON {
23
24 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
25 boolean icu_tokenize)
26 {
27 boolean lowercase_filter = true;
28
29 ArrayList<String> words = new ArrayList<String>();
30
31 if (ef_token_pos_count != null) {
32
33 Iterator<String> token_iter = ef_token_pos_count.keys();
34 while (token_iter.hasNext()) {
35 String token = token_iter.next();
36
37 if (icu_tokenize == true) {
38 Reader reader = new StringReader(token);
39
40 ICUTokenizer icu_tokenizer = new ICUTokenizer();
41 icu_tokenizer.setReader(reader);
42
43 CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
44
45 TokenStream token_stream = null;
46
47 if (lowercase_filter) {
48 token_stream = new LowerCaseFilter(icu_tokenizer);
49 }
50 else {
51 token_stream = icu_tokenizer;
52 }
53
54 try {
55 token_stream.reset();
56
57 while (token_stream.incrementToken()) {
58 String term = charTermAttribute.toString();
59 words.add(term);
60 }
61
62 token_stream.end();
63 token_stream.close();
64 }
65 catch (IOException e) {
66 e.printStackTrace();
67 }
68 }
69 else {
70 words.add(token);
71 }
72 }
73 }
74 else {
75 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
76 }
77
78 /* Alternative way to get at keys
79 Set<String> token_keys = ef_token_pos_count.keySet();
80 for (String token : token_keys) {
81 sb.append(token + " ");
82 }
83*/
84 return words;
85 }
86
87
88 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
89 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
90 {
91 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
92
93 StringBuilder sb = new StringBuilder();
94
95 if (whitelist_bloomfilter == null) {
96
97 boolean first_append = true;
98
99 for (int i=0; i<tokens.size(); i++) {
100 String token = tokens.get(i);
101
102 if (!first_append) {
103 sb.append(" ");
104 }
105 else {
106 first_append = false;
107 }
108 sb.append(token);
109 }
110 }
111 else {
112 boolean first_append = true;
113
114 for (int i=0; i<tokens.size(); i++) {
115 String token = tokens.get(i);
116
117 if (whitelist_bloomfilter.contains(token)) {
118 if (!first_append) {
119 sb.append(" ");
120 }
121 else {
122 first_append = false;
123 }
124 sb.append(token);
125 }
126 }
127
128 }
129
130
131 return sb.toString();
132 }
133
134 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
135 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
136 {
137 JSONObject solr_update_json = null;
138
139 if (ef_page != null) {
140 JSONObject ef_body = ef_page.getJSONObject("body");
141 if (ef_body != null) {
142 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
143 if (ef_token_pos_count != null) {
144
145 JSONObject solr_add_json = new JSONObject();
146
147 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
148
149 JSONObject solr_doc_json = new JSONObject();
150 solr_doc_json.put("id", page_id);
151 solr_doc_json.put("volumeid_s", volume_id);
152 if (!text.equals("")) {
153 solr_doc_json.put("eftext_txt", text);
154 }
155 else {
156 solr_doc_json.put("efnotext_b", true);
157 }
158 solr_add_json.put("commitWithin", 5000);
159 solr_add_json.put("doc", solr_doc_json);
160
161 solr_update_json = new JSONObject();
162 solr_update_json.put("add",solr_add_json);
163
164 }
165 else {
166 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
167 }
168 }
169 else {
170 System.err.println("Warning: empty body field for '" + page_id + "'");
171 }
172
173 }
174 else {
175 System.err.println("Warning: null page for '" + page_id + "'");
176 }
177
178
179 /*
180
181 /update/json/docs
182 */
183
184 // For Reference ...
185 // Example documentation on Solr JSON syntax:
186 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
187 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
188
189 /*
190 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
191 {
192 "add": {
193 "doc": {
194 "id": "DOC1",
195 "my_boosted_field": { use a map with boost/value for a boosted field
196 "boost": 2.3,
197 "value": "test"
198 },
199 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
200 }
201 },
202 "add": {
203 "commitWithin": 5000, commit this document within 5 seconds
204 "overwrite": false, don't check for existing documents with the same uniqueKey
205 "boost": 3.45, a document boost
206 "doc": {
207 "f1": "v1", Can use repeated keys for a multi-valued field
208 "f1": "v2"
209 }
210 },
211
212 "commit": {},
213 "optimize": { "waitSearcher":false },
214
215 "delete": { "id":"ID" }, delete by ID
216 "delete": { "query":"QUERY" } delete by query
217 }'
218 */
219
220 return solr_update_json;
221 }
222
223 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
224 boolean icu_tokenize)
225 {
226 ArrayList<String> word_list = null;
227
228 if (ef_page != null) {
229 JSONObject ef_body = ef_page.getJSONObject("body");
230 if (ef_body != null) {
231 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
232 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
233 }
234 else {
235 System.err.println("Warning: empty body field for '" + page_id + "'");
236 }
237
238 }
239 else {
240 System.err.println("Warning: null page for '" + page_id + "'");
241 }
242
243 return word_list;
244 }
245
246 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
247 {
248 try {
249 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
250 bw.write(solr_add_doc_json.toString());
251 bw.close();
252 } catch (IOException e) {
253 e.printStackTrace();
254 } catch (CompressorException e) {
255 e.printStackTrace();
256 }
257 }
258
259 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
260 {
261
262 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
263 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
264 //curl_popen += " --data-binary '";
265 //curl_popen += "'"
266
267
268 try {
269 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
270 httpcon.setDoOutput(true);
271 httpcon.setRequestProperty("Content-Type", "application/json");
272 httpcon.setRequestProperty("Accept", "application/json");
273 httpcon.setRequestMethod("POST");
274 httpcon.connect();
275
276 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
277 OutputStream os = httpcon.getOutputStream();
278 os.write(outputBytes);
279 os.close();
280
281
282 // Read response
283 StringBuilder sb = new StringBuilder();
284 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
285 String decodedString;
286 while ((decodedString = in.readLine()) != null) {
287 sb.append(decodedString);
288 }
289 in.close();
290
291 JSONObject solr_status_json = new JSONObject(sb.toString());
292 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
293 if (response_header_json != null) {
294 int status = response_header_json.getInt("status");
295 if (status != 0) {
296 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
297 System.err.println("Full response was: " + sb);
298 }
299 }
300 else {
301 System.err.println("Failed response to Solr POST: " + sb);
302 }
303
304
305
306 }
307 catch (Exception e) {
308 e.printStackTrace();
309 }
310
311 }
312}
Note: See TracBrowser for help on using the repository browser.