source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31243

Last change on this file since 31243 was 31243, checked in by davidb, 7 years ago

Experimenting with Lucene/Solr's ICU tokenizer

  • Property svn:executable set to *
File size: 8.6 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.io.OutputStream;
8import java.io.Reader;
9import java.io.StringReader;
10import java.net.HttpURLConnection;
11import java.net.URL;
12import java.util.ArrayList;
13import java.util.Iterator;
14import java.util.Set;
15
16import org.apache.commons.compress.compressors.CompressorException;
17import org.json.JSONObject;
18import org.apache.lucene.analysis.Tokenizer;
19import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
20import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
22
23public class SolrDocJSON {
24
25 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter)
26 {
27 boolean solr_icu_tokenize = true;
28
29 ArrayList<String> tokens = new ArrayList<String>();
30
31 Iterator<String> token_iter = ef_token_pos_count.keys();
32
33 while (token_iter.hasNext()) {
34 String token = token_iter.next();
35
36 if (solr_icu_tokenize == true) {
37 Reader reader = new StringReader(token);
38
39 Tokenizer tokenizer = new ICUTokenizer();
40 tokenizer.setReader(reader);
41
42 //TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
43 //OffsetAttribute offsetAttribute = tokenizer.addAttribute(OffsetAttribute.class);
44 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
45
46 try {
47 tokenizer.reset();
48
49 while (tokenizer.incrementToken()) {
50 //int startOffset = offsetAttribute.startOffset();
51 //int endOffset = offsetAttribute.endOffset();
52 String term = charTermAttribute.toString();
53 tokens.add(term);
54 }
55
56 tokenizer.close();
57 }
58 catch (IOException e) {
59 e.printStackTrace();
60 }
61 }
62 else {
63 tokens.add(token);
64 }
65 }
66
67 StringBuilder sb = new StringBuilder();
68
69 if (whitelist_bloomfilter == null) {
70
71 boolean first_append = true;
72
73 for (int i=0; i<tokens.size(); i++) {
74 String token = tokens.get(i);
75
76 if (!first_append) {
77 sb.append(" ");
78 }
79 else {
80 first_append = false;
81 }
82 sb.append(token);
83 }
84 }
85 else {
86 boolean first_append = true;
87
88 for (int i=0; i<tokens.size(); i++) {
89 String token = tokens.get(i);
90
91 if (whitelist_bloomfilter.contains(token)) {
92 if (!first_append) {
93 sb.append(" ");
94 }
95 else {
96 first_append = false;
97 }
98 sb.append(token);
99 }
100 }
101
102 }
103 /*
104 Set<String> token_keys = ef_token_pos_count.keySet();
105 for (String token : token_keys) {
106 sb.append(token + " ");
107 }
108 */
109
110
111
112 return sb.toString();
113 }
114
115 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
116 WhitelistBloomFilter whitelist_bloomfilter)
117 {
118 JSONObject solr_update_json = null;
119
120 if (ef_page != null) {
121 JSONObject ef_body = ef_page.getJSONObject("body");
122 if (ef_body != null) {
123 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
124 if (ef_token_pos_count != null) {
125
126 JSONObject solr_add_json = new JSONObject();
127
128 String text = generateSolrText(ef_token_pos_count,whitelist_bloomfilter);
129
130 JSONObject solr_doc_json = new JSONObject();
131 solr_doc_json.put("id", page_id);
132 solr_doc_json.put("volumeid_s", volume_id);
133 if (!text.equals("")) {
134 solr_doc_json.put("eftext_txt", text);
135 }
136 else {
137 solr_doc_json.put("efnotext_b", true);
138 }
139 solr_add_json.put("commitWithin", 5000);
140 solr_add_json.put("doc", solr_doc_json);
141
142 solr_update_json = new JSONObject();
143 solr_update_json.put("add",solr_add_json);
144
145 }
146 else {
147 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
148 }
149 }
150 else {
151 System.err.println("Warning: empty body field for '" + page_id + "'");
152 }
153
154 }
155 else {
156 System.err.println("Warning: null page for '" + page_id + "'");
157 }
158
159
160 /*
161
162 /update/json/docs
163 */
164
165 // For Reference ...
166 // Example documentation on Solr JSON syntax:
167 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
168 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
169
170 /*
171 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
172 {
173 "add": {
174 "doc": {
175 "id": "DOC1",
176 "my_boosted_field": { use a map with boost/value for a boosted field
177 "boost": 2.3,
178 "value": "test"
179 },
180 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
181 }
182 },
183 "add": {
184 "commitWithin": 5000, commit this document within 5 seconds
185 "overwrite": false, don't check for existing documents with the same uniqueKey
186 "boost": 3.45, a document boost
187 "doc": {
188 "f1": "v1", Can use repeated keys for a multi-valued field
189 "f1": "v2"
190 }
191 },
192
193 "commit": {},
194 "optimize": { "waitSearcher":false },
195
196 "delete": { "id":"ID" }, delete by ID
197 "delete": { "query":"QUERY" } delete by query
198 }'
199 */
200
201 return solr_update_json;
202 }
203
204 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page)
205 {
206 ArrayList<String> word_list = new ArrayList<String>();
207
208 if (ef_page != null) {
209 JSONObject ef_body = ef_page.getJSONObject("body");
210 if (ef_body != null) {
211 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
212 if (ef_token_pos_count != null) {
213
214 Iterator<String> token_iter = ef_token_pos_count.keys();
215 while (token_iter.hasNext()) {
216 String token = token_iter.next();
217 word_list.add(token);
218 }
219 }
220 else {
221 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
222 }
223 }
224 else {
225 System.err.println("Warning: empty body field for '" + page_id + "'");
226 }
227
228 }
229 else {
230 System.err.println("Warning: null page for '" + page_id + "'");
231 }
232
233 return word_list;
234 }
235
236 public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
237 {
238 try {
239 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
240 bw.write(solr_add_doc_json.toString());
241 bw.close();
242 } catch (IOException e) {
243 e.printStackTrace();
244 } catch (CompressorException e) {
245 e.printStackTrace();
246 }
247 }
248
249 public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
250 {
251
252 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
253 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
254 //curl_popen += " --data-binary '";
255 //curl_popen += "'"
256
257
258 try {
259 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
260 httpcon.setDoOutput(true);
261 httpcon.setRequestProperty("Content-Type", "application/json");
262 httpcon.setRequestProperty("Accept", "application/json");
263 httpcon.setRequestMethod("POST");
264 httpcon.connect();
265
266 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
267 OutputStream os = httpcon.getOutputStream();
268 os.write(outputBytes);
269 os.close();
270
271
272 // Read response
273 StringBuilder sb = new StringBuilder();
274 BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
275 String decodedString;
276 while ((decodedString = in.readLine()) != null) {
277 sb.append(decodedString);
278 }
279 in.close();
280
281 JSONObject solr_status_json = new JSONObject(sb.toString());
282 JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
283 if (response_header_json != null) {
284 int status = response_header_json.getInt("status");
285 if (status != 0) {
286 System.err.println("Warning: POST request to " + post_url + " returned status " + status);
287 System.err.println("Full response was: " + sb);
288 }
289 }
290 else {
291 System.err.println("Failed response to Solr POST: " + sb);
292 }
293
294
295
296 }
297 catch (Exception e) {
298 e.printStackTrace();
299 }
300
301 }
302}
Note: See TracBrowser for help on using the repository browser.