source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java@30979

Last change on this file since 30979 was 30979, checked in by davidb, 7 years ago

_solr_url needs to be stored in class!

  • Property svn:executable set to *
File size: 9.2 KB
package org.hathitrust;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.json.JSONArray;
import org.json.JSONObject;

/*
class PagedJSON implements Function<String, Boolean> {

    private static final long serialVersionUID = 1L;

    public Boolean call(String s) { return s.contains("a"); }
}
*/

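/**
 * Spark FlatMapFunction that takes the filename of one HathiTrust Extracted
 * Features volume (bzip2-compressed JSON), builds a Solr "add" document for
 * each of its pages, and, depending on whether _solr_url and/or _output_dir
 * are set, POSTs each document to Solr and/or saves it as a compressed JSON
 * file. Emits the per-page output filenames plus the volume id.
 */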
class PagedJSON implements FlatMapFunction<String, String>
{
    private static final long serialVersionUID = 1L;

    protected String _input_dir;
    protected String _solr_url;
    protected String _output_dir;
    protected int    _verbosity;

    public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity)
    {
        _input_dir  = input_dir;
        _solr_url   = solr_url;
        _output_dir = output_dir;
        _verbosity  = verbosity;
    }

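    /**
     * Reads a (bzip2-compressed) JSON file below _input_dir and parses it into
     * a JSONObject. Read errors are only printed; a failed read can leave the
     * buffer empty, in which case the JSONObject constructor below throws.
     */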
    protected JSONObject readJSONFile(String filename)
    {
        StringBuilder sb = new StringBuilder();

        try {
            String str;
            BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(_input_dir + "/" + filename);
            while ((str = br.readLine()) != null) {
                sb.append(str);
            }

            br.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }

        JSONObject json_obj = new JSONObject(sb.toString());

        return json_obj;
    }

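    /**
     * Flattens a page's tokenPosCount object into a single space-separated
     * string of its tokens, for use as the Solr _text_ field. Counts and
     * part-of-speech information are discarded; only the page vocabulary is kept.
     */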
    protected String generateSolrText(JSONObject ef_token_pos_count)
    {
        StringBuilder sb = new StringBuilder();

        Iterator<String> token_iter = ef_token_pos_count.keys();
        while (token_iter.hasNext()) {
            String token = token_iter.next();

            sb.append(token);
            if (token_iter.hasNext()) {
                sb.append(" ");
            }
        }

        /*
        Set<String> token_keys = ef_token_pos_count.keySet();
        for (String token : token_keys) {
            sb.append(token + " ");
        }
        */

        return sb.toString();
    }

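    /**
     * Builds the Solr JSON update command for a single page, of the form
     * { "add": { "commitWithin": 5000, "doc": { "id", "volumeid_s", "_text_" } } }.
     * Returns null (after printing a warning) if the page, its body, or its
     * tokenPosCount field is missing.
     */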
    protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
    {
        JSONObject solr_update_json = null;

        if (ef_page != null) {
            JSONObject ef_body = ef_page.getJSONObject("body");
            if (ef_body != null) {
                JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                if (ef_token_pos_count != null) {

                    JSONObject solr_add_json = new JSONObject();

                    String text = generateSolrText(ef_token_pos_count);

                    JSONObject solr_doc_json = new JSONObject();
                    solr_doc_json.put("id", page_id);
                    solr_doc_json.put("volumeid_s", volume_id);
                    solr_doc_json.put("_text_", text);

                    solr_add_json.put("commitWithin", 5000);
                    solr_add_json.put("doc", solr_doc_json);

                    solr_update_json = new JSONObject();
                    solr_update_json.put("add", solr_add_json);
                }
                else {
                    System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
                }
            }
            else {
                System.err.println("Warning: empty body field for '" + page_id + "'");
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }

        /*
          /update/json/docs
        */

        // For Reference ...
        // Example documentation on Solr JSON syntax:
        //   https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
        //     #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates

        /*
        curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
        {
          "add": {
            "doc": {
              "id": "DOC1",
              "my_boosted_field": {          use a map with boost/value for a boosted field
                "boost": 2.3,
                "value": "test"
              },
              "my_multivalued_field": [ "aaa", "bbb" ]   Can use an array for a multi-valued field
            }
          },
          "add": {
            "commitWithin": 5000,            commit this document within 5 seconds
            "overwrite": false,              don't check for existing documents with the same uniqueKey
            "boost": 3.45,                   a document boost
            "doc": {
              "f1": "v1",                    Can use repeated keys for a multi-valued field
              "f1": "v2"
            }
          },

          "commit": {},
          "optimize": { "waitSearcher":false },

          "delete": { "id":"ID" },           delete by ID
          "delete": { "query":"QUERY" }      delete by query
        }'
        */

        return solr_update_json;
    }

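    /**
     * Writes the Solr add-document JSON out as a compressed file (e.g. .json.bz2)
     * below _output_dir.
     */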
    protected void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        try {
            BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_file_json_bz2);
            bw.write(solr_add_doc_json.toString());
            bw.close();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

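    /**
     * POSTs the Solr add-document JSON to the update handler at _solr_url
     * using a plain HttpURLConnection.
     */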
    protected void postSolrDoc(JSONObject solr_add_doc_json)
    {
        // "http://10.11.0.53:8983/solr/"
        //String post_url = "http://10.11.0.53:8983/solr/htrc-pd-ef/update";
        String post_url = _solr_url;

        //String curl_popen = "curl -X POST -H 'Content-Type: application/json'";
        //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'";
        //curl_popen += " --data-binary '";
        //curl_popen += "'"

        try {
            HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection()));
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            OutputStream os = httpcon.getOutputStream();
            os.write(outputBytes);
            os.close();

            // Read the response so the request is fully dispatched and an
            // HTTP error status is reported rather than silently dropped
            int status = httpcon.getResponseCode();
            if (status < 200 || status >= 300) {
                System.err.println("Warning: Solr update POST returned HTTP status " + status);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

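    /**
     * Spark entry point. For one volume's Extracted Features JSON file this
     * creates a per-volume "pages" output directory, then for every page builds
     * the Solr add document and posts and/or saves it. Returns an iterator over
     * the per-page output filenames, with the volume id appended at the end.
     */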
    public Iterator<String> call(String json_file_in)
    {
        JSONObject extracted_feature_record = readJSONFile(json_file_in);

        String volume_id = extracted_feature_record.getString("id");

        //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
        //String title = ef_metadata.getString("title");

        JSONObject ef_features = extracted_feature_record.getJSONObject("features");

        int ef_page_count = ef_features.getInt("pageCount");

        if (_verbosity >= 1) {
            System.out.println("Processing: " + json_file_in);
            System.out.println("  pageCount = " + ef_page_count);
        }

        JSONArray ef_pages = ef_features.getJSONArray("pages");
        int ef_num_pages = ef_pages.length();

        // Make directory for page-level JSON output
        String json_dir = ClusterFileIO.removeSuffix(json_file_in, ".json.bz2");
        String page_json_dir = json_dir + "/pages";
        ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);

        ArrayList<String> ids = new ArrayList<String>(ef_num_pages);

        // Assumes the record's pageCount matches the length of its "pages" array
        for (int i = 0; i < ef_page_count; i++) {
            String formatted_i = String.format("page-%06d", i);
            String page_id = volume_id + "." + formatted_i;

            if (_verbosity >= 2) {
                System.out.println("  Page: " + page_id);
            }

            String output_json_bz2 = page_json_dir + "/" + formatted_i + ".json.bz2";
            ids.add(output_json_bz2);

            if (i == 0) {
                System.out.println("Sample output JSON page file: " + output_json_bz2);
            }

            JSONObject ef_page = ef_pages.getJSONObject(i);

            if (ef_page != null) {
                // Convert to Solr add form
                JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page);

                // generateSolrDocJSON() returns null if the page has no usable
                // body/tokenPosCount, so guard against that before posting/saving
                if (solr_add_doc_json != null) {

                    if (i == 20) {
                        System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                        System.out.println("==================");
                        //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
                    }

                    // create JSON obj of just the page (for now), and write it out
                    // write out the JSONObject as a bz2 compressed file
                    /*
                    try {
                        BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
                        bw.write(ef_page.toString());
                        bw.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    } catch (CompressorException e) {
                        e.printStackTrace();
                    }
                    */

                    if (_solr_url != null) {
                        if (i == 20) {
                            System.out.println("Posting to: " + _solr_url);
                        }
                        postSolrDoc(solr_add_doc_json);
                    }

                    if (_output_dir != null) {
                        if (i == 20) {
                            System.out.println("Saving to: " + _output_dir);
                        }
                        saveSolrDoc(solr_add_doc_json, output_json_bz2);
                    }
                }
            }
            else {
                System.err.println("Skipping: " + page_id);
            }
        }

        /*
        for (int i = 0; i < ef_num_pages; i++)
        {
            //String post_id = ef_pages.getJSONObject(i).getString("post_id");
            //......
        }
        */
        //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
        /*
        JSONArray arr = obj.getJSONArray("posts");
        for (int i = 0; i < arr.length(); i++)
        {
            String post_id = arr.getJSONObject(i).getString("post_id");
            ......
        }
        */

        ids.add(volume_id);

        return ids.iterator();
    }
}
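For context, a minimal sketch of how PagedJSON might be driven from a Spark job. This is not part of the file above: the driver class name, input/output paths, and Solr URL are illustrative only, and it assumes the Spark 2.x Java API (where FlatMapFunction.call returns an Iterator, as here) plus a text file listing one per-volume Extracted Features JSON filename per line.

// Hypothetical driver sketch -- not part of PagedJSON.java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class PagedJSONDriverSketch
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("HathiTrust EF to Solr (sketch)");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // One Extracted Features JSON filename (relative to the input dir) per line
        JavaRDD<String> json_file_list = jsc.textFile("ef-json-filelist.txt");

        PagedJSON paged_json = new PagedJSON("ef-json",                                        // _input_dir
                                             "http://localhost:8983/solr/htrc-pd-ef/update",   // _solr_url
                                             "ef-solr-out",                                    // _output_dir
                                             1);                                               // _verbosity

        // Each input filename fans out to its per-page output filenames (plus the volume id)
        JavaRDD<String> ids = json_file_list.flatMap(paged_json);
        System.out.println("Generated " + ids.count() + " page/volume ids");

        jsc.close();
    }
}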