source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31273

Last change on this file since 31273 was 31273, checked in by davidb, 7 years ago

Code moved over to storing fields for multilingual use, using dynamic Solr fields *_htrctoken. Text is now also put in as separate tokens.

  • Property svn:executable set to *
File size: 15.3 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONArray;
import org.json.JSONObject;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SolrDocJSON {

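    /**
     * Extracts the word tokens (the keys) from a page's 'tokenPosCount' JSON object.
     * When 'icu_tokenize' is set, each stored token is re-segmented with Lucene's
     * ICUTokenizer (plus a lower-case filter), which helps for scripts such as CJK
     * where one stored token can correspond to several searchable terms.
     */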
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
                                                             boolean icu_tokenize)
    {
        boolean lowercase_filter = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = null;

                    if (lowercase_filter) {
                        token_stream = new LowerCaseFilter(icu_tokenizer);
                    }
                    else {
                        token_stream = icu_tokenizer;
                    }

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        /* Alternative way to get at the keys:
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */
        return words;
    }

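    /**
     * Like getTokenPosCountWords(), but without the lower-casing filter in the
     * ICU-tokenized branch: tokens are returned case-sensitively, one per entry.
     */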
    protected static ArrayList<String> getTokenPosCountWordsArrayList(JSONObject ef_token_pos_count, String page_id,
                                                                      boolean icu_tokenize)
    {
        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                if (icu_tokenize) {
                    Reader reader = new StringReader(word_token);

                    ICUTokenizer icu_tokenizer = new ICUTokenizer();
                    icu_tokenizer.setReader(reader);

                    CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);

                    TokenStream token_stream = icu_tokenizer;

                    try {
                        token_stream.reset();

                        while (token_stream.incrementToken()) {
                            String term = charTermAttribute.toString();
                            words.add(term);
                        }

                        token_stream.end();
                        token_stream.close();
                    }
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                else {
                    words.add(word_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return words;
    }
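
    /**
     * Maps each input word through Lucene's StandardTokenizer and LowerCaseFilter,
     * producing a lower-cased (and possibly re-segmented) list of terms.
     */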
    protected static ArrayList<String> getTokenPosCountWordsMapCaseInsensitive(ArrayList<String> words_in)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        for (String word : words_in) {

            Reader reader = new StringReader(word);

            Tokenizer tokenizer = new StandardTokenizer();
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            TokenStream token_stream = new LowerCaseFilter(tokenizer);

            try {
                token_stream.reset();

                while (token_stream.incrementToken()) {
                    String term = charTermAttribute.toString();
                    words_out.add(term);
                }

                token_stream.end();
                token_stream.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        return words_out;
    }

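    /**
     * Keeps only the words that (probably) appear in the whitelist. Note that a
     * Bloom filter can return false positives, so the occasional non-whitelisted
     * word may slip through; it never produces false negatives.
     */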
    protected static ArrayList<String> getTokenPosCountWordsMapWhitelist(ArrayList<String> words_in,
                                                                         WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<String> words_out = new ArrayList<String>();

        for (String word : words_in) {

            if (whitelist_bloomfilter.contains(word)) {
                words_out.add(word);
            }
        }

        return words_out;
    }

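    /**
     * Collects the part-of-speech labels from a 'tokenPosCount' object, whose shape
     * is { word: { pos_label: count, ... }, ... }. Labels are added once per word
     * they occur under, so duplicates across words are preserved.
     */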
    protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
    {
        ArrayList<String> pos_labels = new ArrayList<String>();

        if (ef_token_pos_count != null) {

            Iterator<String> word_token_iter = ef_token_pos_count.keys();
            while (word_token_iter.hasNext()) {
                String word_token = word_token_iter.next();

                JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);

                Iterator<String> pos_token_iter = word_pos_labels.keys();
                while (pos_token_iter.hasNext()) {
                    String pos_token = pos_token_iter.next();

                    pos_labels.add(pos_token);
                }
            }
        }
        else {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
        }

        return pos_labels;
    }

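    /**
     * Concatenates a page's tokens into a single space-separated string, suitable
     * for a Solr full-text field. If a whitelist Bloom filter is supplied, only
     * tokens it accepts are included.
     */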
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);

        StringBuilder sb = new StringBuilder();

        if (whitelist_bloomfilter == null) {

            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (!first_append) {
                    sb.append(" ");
                }
                else {
                    first_append = false;
                }
                sb.append(token);
            }
        }
        else {
            boolean first_append = true;

            for (int i = 0; i < tokens.size(); i++) {
                String token = tokens.get(i);

                if (whitelist_bloomfilter.contains(token)) {
                    if (!first_append) {
                        sb.append(" ");
                    }
                    else {
                        first_append = false;
                    }
                    sb.append(token);
                }
            }
        }

        return sb.toString();
    }

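    /**
     * Produces the token list that is actually indexed: case-sensitive extraction,
     * then lower-casing via StandardTokenizer, then (optionally) whitelist filtering.
     */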
    protected static ArrayList<String> filterSolrTextFields(JSONObject ef_token_pos_count, String page_id,
                                                            WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        ArrayList<String> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id, icu_tokenize);
        ArrayList<String> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);

        ArrayList<String> tokens = null;
        if (whitelist_bloomfilter != null) {
            tokens = getTokenPosCountWordsMapWhitelist(lc_tokens, whitelist_bloomfilter);
        }
        else {
            tokens = lc_tokens;
        }

        return tokens;
    }

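    /**
     * Adds the page's text tokens to the Solr document under one dynamic field per
     * detected language, e.g. "ko_htrctoken" and "ja_htrctoken" for
     * "languages":[{"ko":"0.71"},{"ja":"0.29"}].
     */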
    protected static void addSolrLanguageTextFields(JSONObject ef_page, ArrayList<String> text_al,
                                                    JSONObject solr_doc_json)
    {
        // e.g. ... "languages":[{"ko":"0.71"},{"ja":"0.29"}]
        JSONArray ef_languages = ef_page.getJSONArray("languages");
        if (ef_languages != null) {

            int lang_len = ef_languages.length();
            String[] lang_list = new String[lang_len];

            for (int i = 0; i < lang_len; i++) {
                JSONObject lang_rec = ef_languages.getJSONObject(i);

                Iterator<String> lang_key_iter = lang_rec.keys();
                while (lang_key_iter.hasNext()) {
                    String lang_label = lang_key_iter.next();

                    String solr_field = lang_label + "_htrctoken";
                    lang_list[i] = solr_field;
                }
            }

            int text_len = text_al.size();
            for (int ti = 0; ti < text_len; ti++) {
                String text_value = text_al.get(ti);
                for (int li = 0; li < lang_len; li++) {
                    String lang_text_field = lang_list[li];

                    // accumulate() keeps every token by building a JSONArray on
                    // repeated keys; put() would overwrite, leaving only the last token
                    solr_doc_json.accumulate(lang_text_field, text_value);
                }
            }
        }
    }
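
    /**
     * Builds a complete Solr JSON update command for one page: an "add" wrapper
     * containing the document (id, volumeid_s, and per-language token fields) with
     * a commitWithin of 5000 ms. Returns null when the page, its body, or its
     * tokenPosCount field is missing.
     */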
    protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                    WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    ArrayList<String> text_al = filterSolrTextFields(ef_token_pos_count, page_id, whitelist_bloomfilter, icu_tokenize);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    if (text_al.size() > 0) {
                        addSolrLanguageTextFields(ef_page, text_al, solr_doc_json);
                        //solr_doc_json.put("eftext_txt", text_al.toString()); // ****
                    }
                    else {
                        solr_doc_json.put("efnotext_b", true);
                    }
                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        // For reference, Solr's JSON update syntax (handler endpoint /update/json/docs)
        // is documented at:
        // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
            "add": {
                "doc": {
                    "id": "DOC1",
                    "my_boosted_field": {          use a map with boost/value for a boosted field
                        "boost": 2.3,
                        "value": "test"
                    },
                    "my_multivalued_field": [ "aaa", "bbb" ]     can use an array for a multi-valued field
                }
            },
            "add": {
                "commitWithin": 5000,              commit this document within 5 seconds
                "overwrite": false,                don't check for existing documents with the same uniqueKey
                "boost": 3.45,                     a document boost
                "doc": {
                    "f1": "v1",                    can use repeated keys for a multi-valued field
                    "f1": "v2"
                }
            },

            "commit": {},
            "optimize": { "waitSearcher": false },

            "delete": { "id": "ID" },              delete by ID
            "delete": { "query": "QUERY" }         delete by query
        }'
        */

        return solr_update_json;
    }

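    /**
     * Returns the page's word tokens (see getTokenPosCountWords()), or null if the
     * page or its body is missing.
     */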
    public static ArrayList<String> generateTokenPosCountWhitelistText(String volume_id, String page_id, JSONObject ef_page,
                                                                       boolean icu_tokenize)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountWords(ef_token_pos_count, page_id, icu_tokenize);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

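    /**
     * Returns the page's part-of-speech labels (see getTokenPosCountPOSLabels()),
     * or null if the page or its body is missing.
     */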
    public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> word_list = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                word_list = getTokenPosCountPOSLabels(ef_token_pos_count, page_id);
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return word_list;
    }

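    /**
     * Returns the language labels detected for the page, e.g. ["ko", "ja"] for
     * "languages":[{"ko":"0.71"},{"ja":"0.29"}]. An empty list is returned when
     * nothing can be extracted.
     */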
    public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
    {
        ArrayList<String> lang_list = new ArrayList<String>();

        if (ef_page != null) {
            JSONArray ef_languages = ef_page.getJSONArray("languages");
            if (ef_languages != null) {

                int lang_len = ef_languages.length();
                for (int i = 0; i < lang_len; i++) {
                    JSONObject lang_rec = ef_languages.getJSONObject(i);

                    Iterator<String> lang_key_iter = lang_rec.keys();
                    while (lang_key_iter.hasNext()) {
                        String lang_label = lang_key_iter.next();

                        lang_list.add(lang_label);
                    }
                }
            }
            else {
                System.err.println("Warning: empty languages field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        return lang_list;
    }

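    /**
     * Writes the Solr update JSON to a bzip2-compressed file via ClusterFileIO.
     */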
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

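    /**
     * POSTs the update JSON to a Solr update handler and checks the response's
     * responseHeader.status (0 means success). The equivalent curl call, with a
     * hypothetical collection URL, would be:
     *
     *   curl -X POST -H 'Content-Type: application/json' \
     *        'http://localhost:8983/solr/my-collection/update' \
     *        --data-binary '{"add": {"doc": {...}}}'
     */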
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
    {
        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response
            StringBuilder sb = new StringBuilder();
            BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
            String decodedString;
            while ((decodedString = in.readLine()) != null) {
                sb.append(decodedString);
            }
            in.close();

            JSONObject solr_status_json = new JSONObject(sb.toString());
            JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}