Context Navigation

source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@ 30980

Last change on this file since 30980 was 30980, checked in by davidb, 8 years ago
Code added to read response
Property svn:executable set to *
File size: 9.6 KB

Line
1	package org.hathitrust;
2
3	import java.io.BufferedReader;
4	import java.io.BufferedWriter;
5	import java.io.IOException;
6	import java.io.InputStreamReader;
7	import java.io.OutputStream;
8	import java.net.HttpURLConnection;
9	import java.net.URL;
10	import java.util.ArrayList;
11	import java.util.Iterator;
12	import java.util.Set;
13
14	import org.apache.commons.compress.compressors.CompressorException;
15	import org.apache.spark.api.java.function.FlatMapFunction;
16	import org.json.JSONArray;
17	import org.json.JSONObject;
18
19	/*
20	class PagedJSON implements Function<String, Boolean> {
21
22	private static final long serialVersionUID = 1L;
23
24	public Boolean call(String s) { return s.contains("a"); }
25	}
26	*/
27
28
29	class PagedJSON implements FlatMapFunction<String, String>
30	{
31	private static final long serialVersionUID = 1L;
32
33	protected String _input_dir;
34	protected String _solr_url;
35	protected String _output_dir;
36	protected int _verbosity;
37
38	public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity)
39	{
40	_input_dir = input_dir;
41	_solr_url = solr_url;
42	_output_dir = output_dir;
43	_verbosity = verbosity;
44	}
45
46	protected JSONObject readJSONFile(String filename)
47	{
48	StringBuilder sb = new StringBuilder();
49
50	try {
51
52	String str;
53	BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
54	while ((str = br.readLine()) != null) {
55	sb.append(str);
56	}
57
58	br.close();
59	}
60	catch (Exception e) {
61	e.printStackTrace();
62	}
63
64	JSONObject json_obj = new JSONObject(sb.toString());
65
66
67	return json_obj;
68	}
69
70	protected String generateSolrText(JSONObject ef_token_pos_count)
71	{
72	StringBuilder sb = new StringBuilder();
73
74	Iterator<String> token_iter = ef_token_pos_count.keys();
75	while (token_iter.hasNext()) {
76	String token = token_iter.next();
77
78	sb.append(token);
79	if (token_iter.hasNext()) {
80	sb.append(" ");
81	}
82	}
83
84	/*
85	Set<String> token_keys = ef_token_pos_count.keySet();
86	for (String token : token_keys) {
87	sb.append(token + " ");
88	}
89	*/
90
91	return sb.toString();
92	}
93
94	protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
95	{
96	JSONObject solr_update_json = null;
97
98	if (ef_page != null) {
99	JSONObject ef_body = ef_page.getJSONObject("body");
100	if (ef_body != null) {
101	JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
102	if (ef_token_pos_count != null) {
103
104	JSONObject solr_add_json = new JSONObject();
105
106	String text = generateSolrText(ef_token_pos_count);
107
108	JSONObject solr_doc_json = new JSONObject();
109	solr_doc_json.put("id", page_id);
110	solr_doc_json.put("volumeid_s", volume_id);
111	solr_doc_json.put("_text_", text);
112
113	solr_add_json.put("commitWithin", 5000);
114	solr_add_json.put("doc", solr_doc_json);
115
116	solr_update_json = new JSONObject();
117	solr_update_json.put("add",solr_add_json);
118
119	}
120	else {
121	System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
122	}
123	}
124	else {
125	System.err.println("Warning: empty body field for '" + page_id + "'");
126	}
127
128	}
129	else {
130	System.err.println("Warning: null page for '" + page_id + "'");
131	}
132
133
134	/*
135
136	/update/json/docs
137	*/
138
139	// For Reference ...
140	// Example documentation on Solr JSON syntax:
141	// https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
142	// #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
143
144	/*
145	curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
146	{
147	"add": {
148	"doc": {
149	"id": "DOC1",
150	"my_boosted_field": { use a map with boost/value for a boosted field
151	"boost": 2.3,
152	"value": "test"
153	},
154	"my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
155	}
156	},
157	"add": {
158	"commitWithin": 5000, commit this document within 5 seconds
159	"overwrite": false, don't check for existing documents with the same uniqueKey
160	"boost": 3.45, a document boost
161	"doc": {
162	"f1": "v1", Can use repeated keys for a multi-valued field
163	"f1": "v2"
164	}
165	},
166
167	"commit": {},
168	"optimize": { "waitSearcher":false },
169
170	"delete": { "id":"ID" }, delete by ID
171	"delete": { "query":"QUERY" } delete by query
172	}'
173	*/
174
175	return solr_update_json;
176	}
177
178	protected void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
179	{
180	try {
181	BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_file_json_bz2);
182	bw.write(solr_add_doc_json.toString());
183	bw.close();
184	} catch (IOException e) {
185	e.printStackTrace();
186	} catch (CompressorException e) {
187	e.printStackTrace();
188	}
189	}
190
191	protected void postSolrDoc(JSONObject solr_add_doc_json)
192	{
193	String post_url = _solr_url;
194
195	//String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
196	//curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
197	//curl_popen += " --data-binary '";
198	//curl_popen += "'"
199
200
201	try {
202	HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
203	httpcon.setDoOutput(true);
204	httpcon.setRequestProperty("Content-Type", "application/json");
205	httpcon.setRequestProperty("Accept", "application/json");
206	httpcon.setRequestMethod("POST");
207	httpcon.connect();
208
209	byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
210	OutputStream os = httpcon.getOutputStream();
211	os.write(outputBytes);
212	os.close();
213
214
215	// Read response
216	BufferedReader in = new BufferedReader(new InputStreamReader(
217	httpcon.getInputStream()));
218	String decodedString;
219	while ((decodedString = in.readLine()) != null) {
220	System.out.println(decodedString);
221	}
222	in.close();
223
224	}
225	catch (Exception e) {
226	e.printStackTrace();
227	}
228
229	}
230	public Iterator<String> call(String json_file_in)
231	{
232	JSONObject extracted_feature_record = readJSONFile(json_file_in);
233
234	String volume_id = extracted_feature_record.getString("id");
235
236	//JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
237	//String title= ef_metadata.getString("title");
238
239	JSONObject ef_features = extracted_feature_record.getJSONObject("features");
240
241
242	int ef_page_count = ef_features.getInt("pageCount");
243
244	if (_verbosity >= 1) {
245	System.out.println("Processing: " + json_file_in);
246	System.out.println(" pageCount = " + ef_page_count);
247	}
248
249	JSONArray ef_pages = ef_features.getJSONArray("pages");
250	int ef_num_pages = ef_pages.length();
251
252	// Make directory for page-level JSON output
253	String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
254	String page_json_dir = json_dir + "/pages";
255	ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
256
257	ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
258	for (int i = 0; i < ef_page_count; i++) {
259	String formatted_i = String.format("page-%06d", i);
260	String page_id = volume_id + "." + formatted_i;
261
262	if (_verbosity >= 2) {
263	System.out.println(" Page: " + page_id);
264	}
265
266	String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
267	ids.add(output_json_bz2);
268
269	if (i==0) {
270	System.out.println("Sample output JSON page file: " + output_json_bz2);
271	}
272
273	JSONObject ef_page = ef_pages.getJSONObject(i);
274
275	if (ef_page != null) {
276	// Convert to Solr add form
277	JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page);
278
279	if (i==20) {
280	System.out.println("==================");
281	System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
282	System.out.println("==================");
283	//System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
284	}
285
286	// create JSON obj of just the page (for now), and write it out
287	// write out the JSONOBject as a bz2 compressed file
288	/*
289	try {
290	BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
291	bw.write(ef_page.toString());
292	bw.close();
293	} catch (IOException e) {
294	e.printStackTrace();
295	} catch (CompressorException e) {
296	e.printStackTrace();
297	}
298	*/
299
300	if (_solr_url != null) {
301	if (i==20) {
302	System.out.println("==================");
303	System.out.println("Posting to: " + _solr_url);
304	System.out.println("==================");
305	}
306	postSolrDoc(solr_add_doc_json);
307	}
308
309	if (_output_dir != null) {
310	if (i==20) {
311	System.out.println("==================");
312	System.out.println("Saving to: " + _output_dir);
313	System.out.println("==================");
314	}
315	saveSolrDoc(solr_add_doc_json,output_json_bz2);
316	}
317	}
318	else {
319	System.err.println("Skipping: " + page_id);
320	}
321
322	}
323
324	/*
325	for (int i = 0; i < ef_num_pages; i++)
326	{
327	//String post_id = ef_pages.getJSONObject(i).getString("post_id");
328	//......
329	}
330	*/
331	//String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
332	/*
333	JSONArray arr = obj.getJSONArray("posts");
334	for (int i = 0; i < arr.length(); i++)
335	{
336	String post_id = arr.getJSONObject(i).getString("post_id");
337	......
338	}
339	*/
340
341
342	ids.add(volume_id);
343
344	return ids.iterator();
345	}
346	}
347

Note: See TracBrowser for help on using the repository browser.

Download in other formats: