Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31254

Last change on this file since 31254 was 31254, checked in by davidb, 7 years ago
Experimenting with Lucene lowercase filter
Property svn:executable set to *
File size: 8.8 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.BufferedReader;
4	import java.io.BufferedWriter;
5	import java.io.IOException;
6	import java.io.InputStreamReader;
7	import java.io.OutputStream;
8	import java.io.Reader;
9	import java.io.StringReader;
10	import java.net.HttpURLConnection;
11	import java.net.URL;
12	import java.util.ArrayList;
13	import java.util.Iterator;
14	import org.apache.commons.compress.compressors.CompressorException;
15	import org.json.JSONObject;
16	import org.apache.lucene.analysis.TokenStream;
17	import org.apache.lucene.analysis.Tokenizer;
18	import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
19	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
20	import org.apache.lucene.analysis.core.LowerCaseFilter;
21
22	public class SolrDocJSON {
23
24	protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
25	boolean icu_tokenize)
26	{
27	boolean lowercase_filter = true;
28
29	ArrayList<String> words = new ArrayList<String>();
30
31	if (ef_token_pos_count != null) {
32
33	Iterator<String> token_iter = ef_token_pos_count.keys();
34	while (token_iter.hasNext()) {
35	String token = token_iter.next();
36
37	if (icu_tokenize == true) {
38	Reader reader = new StringReader(token);
39
40	ICUTokenizer icu_tokenizer = new ICUTokenizer();
41	icu_tokenizer.setReader(reader);
42
43	CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
44
45	TokenStream token_stream = null;
46
47	if (lowercase_filter) {
48	token_stream = new LowerCaseFilter(icu_tokenizer);
49	}
50	else {
51	token_stream = icu_tokenizer;
52	}
53
54	try {
55	token_stream.reset();
56
57	while (token_stream.incrementToken()) {
58	String term = charTermAttribute.toString();
59	words.add(term);
60	}
61
62	token_stream.end();
63	token_stream.close();
64	}
65	catch (IOException e) {
66	e.printStackTrace();
67	}
68	}
69	else {
70	words.add(token);
71	}
72	}
73	}
74	else {
75	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
76	}
77
78	/* Alternative way to get at keys
79	Set<String> token_keys = ef_token_pos_count.keySet();
80	for (String token : token_keys) {
81	sb.append(token + " ");
82	}
83	*/
84	return words;
85	}
86
87
88	protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
89	WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
90	{
91	ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
92
93	StringBuilder sb = new StringBuilder();
94
95	if (whitelist_bloomfilter == null) {
96
97	boolean first_append = true;
98
99	for (int i=0; i<tokens.size(); i++) {
100	String token = tokens.get(i);
101
102	if (!first_append) {
103	sb.append(" ");
104	}
105	else {
106	first_append = false;
107	}
108	sb.append(token);
109	}
110	}
111	else {
112	boolean first_append = true;
113
114	for (int i=0; i<tokens.size(); i++) {
115	String token = tokens.get(i);
116
117	if (whitelist_bloomfilter.contains(token)) {
118	if (!first_append) {
119	sb.append(" ");
120	}
121	else {
122	first_append = false;
123	}
124	sb.append(token);
125	}
126	}
127
128	}
129
130
131	return sb.toString();
132	}
133
134	protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
135	WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
136	{
137	JSONObject solr_update_json = null;
138
139	if (ef_page != null) {
140	JSONObject ef_body = ef_page.getJSONObject("body");
141	if (ef_body != null) {
142	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
143	if (ef_token_pos_count != null) {
144
145	JSONObject solr_add_json = new JSONObject();
146
147	String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
148
149	JSONObject solr_doc_json = new JSONObject();
150	solr_doc_json.put("id", page_id);
151	solr_doc_json.put("volumeid_s", volume_id);
152	if (!text.equals("")) {
153	solr_doc_json.put("eftext_txt", text);
154	}
155	else {
156	solr_doc_json.put("efnotext_b", true);
157	}
158	solr_add_json.put("commitWithin", 5000);
159	solr_add_json.put("doc", solr_doc_json);
160
161	solr_update_json = new JSONObject();
162	solr_update_json.put("add",solr_add_json);
163
164	}
165	else {
166	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
167	}
168	}
169	else {
170	System.err.println("Warning: empty body field for '" + page_id + "'");
171	}
172
173	}
174	else {
175	System.err.println("Warning: null page for '" + page_id + "'");
176	}
177
178
179	/*
180
181	/update/json/docs
182	*/
183
184	// For Reference ...
185	// Example documentation on Solr JSON syntax:
186	// https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
187	// #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
188
189	/*
190	curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
191	{
192	"add": {
193	"doc": {
194	"id": "DOC1",
195	"my_boosted_field": { use a map with boost/value for a boosted field
196	"boost": 2.3,
197	"value": "test"
198	},
199	"my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
200	}
201	},
202	"add": {
203	"commitWithin": 5000, commit this document within 5 seconds
204	"overwrite": false, don't check for existing documents with the same uniqueKey
205	"boost": 3.45, a document boost
206	"doc": {
207	"f1": "v1", Can use repeated keys for a multi-valued field
208	"f1": "v2"
209	}
210	},
211
212	"commit": {},
213	"optimize": { "waitSearcher":false },
214
215	"delete": { "id":"ID" }, delete by ID
216	"delete": { "query":"QUERY" } delete by query
217	}'
218	*/
219
220	return solr_update_json;
221	}
222
223	protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
224	boolean icu_tokenize)
225	{
226	ArrayList<String> word_list = null;
227
228	if (ef_page != null) {
229	JSONObject ef_body = ef_page.getJSONObject("body");
230	if (ef_body != null) {
231	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
232	word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
233	}
234	else {
235	System.err.println("Warning: empty body field for '" + page_id + "'");
236	}
237
238	}
239	else {
240	System.err.println("Warning: null page for '" + page_id + "'");
241	}
242
243	return word_list;
244	}
245
246	public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
247	{
248	try {
249	BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
250	bw.write(solr_add_doc_json.toString());
251	bw.close();
252	} catch (IOException e) {
253	e.printStackTrace();
254	} catch (CompressorException e) {
255	e.printStackTrace();
256	}
257	}
258
259	public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
260	{
261
262	//String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
263	//curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
264	//curl_popen += " --data-binary '";
265	//curl_popen += "'"
266
267
268	try {
269	HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
270	httpcon.setDoOutput(true);
271	httpcon.setRequestProperty("Content-Type", "application/json");
272	httpcon.setRequestProperty("Accept", "application/json");
273	httpcon.setRequestMethod("POST");
274	httpcon.connect();
275
276	byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
277	OutputStream os = httpcon.getOutputStream();
278	os.write(outputBytes);
279	os.close();
280
281
282	// Read response
283	StringBuilder sb = new StringBuilder();
284	BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
285	String decodedString;
286	while ((decodedString = in.readLine()) != null) {
287	sb.append(decodedString);
288	}
289	in.close();
290
291	JSONObject solr_status_json = new JSONObject(sb.toString());
292	JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
293	if (response_header_json != null) {
294	int status = response_header_json.getInt("status");
295	if (status != 0) {
296	System.err.println("Warning: POST request to " + post_url + " returned status " + status);
297	System.err.println("Full response was: " + sb);
298	}
299	}
300	else {
301	System.err.println("Failed response to Solr POST: " + sb);
302	}
303
304
305
306	}
307	catch (Exception e) {
308	e.printStackTrace();
309	}
310
311	}
312	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: