Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java@ 31260

Last change on this file since 31260 was 31260, checked in by davidb, 7 years ago
Language counting
Property svn:executable set to `*`
File size: 11.1 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.BufferedReader;
4	import java.io.BufferedWriter;
5	import java.io.IOException;
6	import java.io.InputStreamReader;
7	import java.io.OutputStream;
8	import java.io.Reader;
9	import java.io.StringReader;
10	import java.net.HttpURLConnection;
11	import java.net.URL;
12	import java.util.ArrayList;
13	import java.util.Iterator;
14	import org.apache.commons.compress.compressors.CompressorException;
15	import org.json.JSONArray;
16	import org.json.JSONObject;
17	import org.apache.lucene.analysis.TokenStream;
18	import org.apache.lucene.analysis.Tokenizer;
19	import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
20	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21	import org.apache.lucene.analysis.core.LowerCaseFilter;
22
23	public class SolrDocJSON {
24
25	protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
26	boolean icu_tokenize)
27	{
28	boolean lowercase_filter = true;
29
30	ArrayList<String> words = new ArrayList<String>();
31
32	if (ef_token_pos_count != null) {
33
34	Iterator<String> word_token_iter = ef_token_pos_count.keys();
35	while (word_token_iter.hasNext()) {
36	String word_token = word_token_iter.next();
37
38	if (icu_tokenize == true) {
39	Reader reader = new StringReader(word_token);
40
41	ICUTokenizer icu_tokenizer = new ICUTokenizer();
42	icu_tokenizer.setReader(reader);
43
44	CharTermAttribute charTermAttribute = icu_tokenizer.addAttribute(CharTermAttribute.class);
45
46	TokenStream token_stream = null;
47
48	if (lowercase_filter) {
49	token_stream = new LowerCaseFilter(icu_tokenizer);
50	}
51	else {
52	token_stream = icu_tokenizer;
53	}
54
55	try {
56	token_stream.reset();
57
58	while (token_stream.incrementToken()) {
59	String term = charTermAttribute.toString();
60	words.add(term);
61	}
62
63	token_stream.end();
64	token_stream.close();
65	}
66	catch (IOException e) {
67	e.printStackTrace();
68	}
69	}
70	else {
71	words.add(word_token);
72	}
73	}
74	}
75	else {
76	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
77	}
78
79	/* Alternative way to get at keys
80	Set<String> token_keys = ef_token_pos_count.keySet();
81	for (String token : token_keys) {
82	sb.append(token + " ");
83	}
84	*/
85	return words;
86	}
87
88	protected static ArrayList<String> getTokenPosCountPOSLabels(JSONObject ef_token_pos_count, String page_id)
89	{
90	ArrayList<String> pos_labels = new ArrayList<String>();
91
92	if (ef_token_pos_count != null) {
93
94	Iterator<String> word_token_iter = ef_token_pos_count.keys();
95	while (word_token_iter.hasNext()) {
96	String word_token = word_token_iter.next();
97
98	JSONObject word_pos_labels = ef_token_pos_count.getJSONObject(word_token);
99
100	Iterator<String> pos_token_iter = word_pos_labels.keys();
101	while (pos_token_iter.hasNext()) {
102	String pos_token = pos_token_iter.next();
103
104	pos_labels.add(pos_token);
105	}
106	}
107	}
108	else {
109	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
110	}
111
112	return pos_labels;
113	}
114
115
116
117	protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
118	WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
119	{
120	ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
121
122	StringBuilder sb = new StringBuilder();
123
124	if (whitelist_bloomfilter == null) {
125
126	boolean first_append = true;
127
128	for (int i=0; i<tokens.size(); i++) {
129	String token = tokens.get(i);
130
131	if (!first_append) {
132	sb.append(" ");
133	}
134	else {
135	first_append = false;
136	}
137	sb.append(token);
138	}
139	}
140	else {
141	boolean first_append = true;
142
143	for (int i=0; i<tokens.size(); i++) {
144	String token = tokens.get(i);
145
146	if (whitelist_bloomfilter.contains(token)) {
147	if (!first_append) {
148	sb.append(" ");
149	}
150	else {
151	first_append = false;
152	}
153	sb.append(token);
154	}
155	}
156
157	}
158
159
160	return sb.toString();
161	}
162
163	protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
164	WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
165	{
166	JSONObject solr_update_json = null;
167
168	if (ef_page != null) {
169	JSONObject ef_body = ef_page.getJSONObject("body");
170	if (ef_body != null) {
171	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
172	if (ef_token_pos_count != null) {
173
174	JSONObject solr_add_json = new JSONObject();
175
176	String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
177
178	JSONObject solr_doc_json = new JSONObject();
179	solr_doc_json.put("id", page_id);
180	solr_doc_json.put("volumeid_s", volume_id);
181	if (!text.equals("")) {
182	solr_doc_json.put("eftext_txt", text);
183	}
184	else {
185	solr_doc_json.put("efnotext_b", true);
186	}
187	solr_add_json.put("commitWithin", 5000);
188	solr_add_json.put("doc", solr_doc_json);
189
190	solr_update_json = new JSONObject();
191	solr_update_json.put("add",solr_add_json);
192
193	}
194	else {
195	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
196	}
197	}
198	else {
199	System.err.println("Warning: empty body field for '" + page_id + "'");
200	}
201
202	}
203	else {
204	System.err.println("Warning: null page for '" + page_id + "'");
205	}
206
207
208	/*
209
210	/update/json/docs
211	*/
212
213	// For Reference ...
214	// Example documentation on Solr JSON syntax:
215	// https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
216	// #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
217
218	/*
219	curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
220	{
221	"add": {
222	"doc": {
223	"id": "DOC1",
224	"my_boosted_field": { use a map with boost/value for a boosted field
225	"boost": 2.3,
226	"value": "test"
227	},
228	"my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
229	}
230	},
231	"add": {
232	"commitWithin": 5000, commit this document within 5 seconds
233	"overwrite": false, don't check for existing documents with the same uniqueKey
234	"boost": 3.45, a document boost
235	"doc": {
236	"f1": "v1", Can use repeated keys for a multi-valued field
237	"f1": "v2"
238	}
239	},
240
241	"commit": {},
242	"optimize": { "waitSearcher":false },
243
244	"delete": { "id":"ID" }, delete by ID
245	"delete": { "query":"QUERY" } delete by query
246	}'
247	*/
248
249	return solr_update_json;
250	}
251
252	public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
253	boolean icu_tokenize)
254	{
255	ArrayList<String> word_list = null;
256
257	if (ef_page != null) {
258	JSONObject ef_body = ef_page.getJSONObject("body");
259	if (ef_body != null) {
260	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
261	word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
262	}
263	else {
264	System.err.println("Warning: empty body field for '" + page_id + "'");
265	}
266
267	}
268	else {
269	System.err.println("Warning: null page for '" + page_id + "'");
270	}
271
272	return word_list;
273	}
274
275	public static ArrayList<String> generateTokenPosCountPOSLabels(String volume_id, String page_id, JSONObject ef_page)
276	{
277	ArrayList<String> word_list = null;
278
279	if (ef_page != null) {
280	JSONObject ef_body = ef_page.getJSONObject("body");
281	if (ef_body != null) {
282	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
283	word_list = getTokenPosCountPOSLabels(ef_token_pos_count,page_id);
284	}
285	else {
286	System.err.println("Warning: empty body field for '" + page_id + "'");
287	}
288
289	}
290	else {
291	System.err.println("Warning: null page for '" + page_id + "'");
292	}
293
294	return word_list;
295	}
296
297	public static ArrayList<String> generateTokenPosCountLangLabels(String volume_id, String page_id, JSONObject ef_page)
298	{
299	ArrayList<String> lang_list = new ArrayList<String>();;
300
301	if (ef_page != null) {
302	JSONArray ef_languages = ef_page.getJSONArray("languages");
303	if (ef_languages != null) {
304
305	int lang_len = ef_languages.length();
306	for (int i=0; i<lang_len; i++) {
307	JSONObject lang_rec = ef_languages.getJSONObject(i);
308
309	Iterator<String> lang_key_iter = lang_rec.keys();
310	while (lang_key_iter.hasNext()) {
311	String lang_label = lang_key_iter.next();
312
313	lang_list.add(lang_label);
314	}
315	}
316	}
317	else {
318	System.err.println("Warning: empty languages field for '" + page_id + "'");
319	}
320
321	}
322	else {
323	System.err.println("Warning: null page for '" + page_id + "'");
324	}
325
326	return lang_list;
327	}
328
329	public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
330	{
331	try {
332	BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2);
333	bw.write(solr_add_doc_json.toString());
334	bw.close();
335	} catch (IOException e) {
336	e.printStackTrace();
337	} catch (CompressorException e) {
338	e.printStackTrace();
339	}
340	}
341
342	public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
343	{
344
345	//String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
346	//curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
347	//curl_popen += " --data-binary '";
348	//curl_popen += "'"
349
350
351	try {
352	HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
353	httpcon.setDoOutput(true);
354	httpcon.setRequestProperty("Content-Type", "application/json");
355	httpcon.setRequestProperty("Accept", "application/json");
356	httpcon.setRequestMethod("POST");
357	httpcon.connect();
358
359	byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
360	OutputStream os = httpcon.getOutputStream();
361	os.write(outputBytes);
362	os.close();
363
364
365	// Read response
366	StringBuilder sb = new StringBuilder();
367	BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream()));
368	String decodedString;
369	while ((decodedString = in.readLine()) != null) {
370	sb.append(decodedString);
371	}
372	in.close();
373
374	JSONObject solr_status_json = new JSONObject(sb.toString());
375	JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader");
376	if (response_header_json != null) {
377	int status = response_header_json.getInt("status");
378	if (status != 0) {
379	System.err.println("Warning: POST request to " + post_url + " returned status " + status);
380	System.err.println("Full response was: " + sb);
381	}
382	}
383	else {
384	System.err.println("Failed response to Solr POST: " + sb);
385	}
386
387
388
389	}
390	catch (Exception e) {
391	e.printStackTrace();
392	}
393
394	}
395	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: