package org.hathitrust.extractedfeatures; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import org.apache.commons.compress.compressors.CompressorException; import org.json.JSONObject; public class SolrDocJSON { protected static String generateSolrText(JSONObject ef_token_pos_count) { StringBuilder sb = new StringBuilder(); Iterator token_iter = ef_token_pos_count.keys(); while (token_iter.hasNext()) { String token = token_iter.next(); sb.append(token); if (token_iter.hasNext()) { sb.append(" "); } } /* Set token_keys = ef_token_pos_count.keySet(); for (String token : token_keys) { sb.append(token + " "); } */ return sb.toString(); } protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page) { JSONObject solr_update_json = null; if (ef_page != null) { JSONObject ef_body = ef_page.getJSONObject("body"); if (ef_body != null) { JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); if (ef_token_pos_count != null) { JSONObject solr_add_json = new JSONObject(); String text = generateSolrText(ef_token_pos_count); JSONObject solr_doc_json = new JSONObject(); solr_doc_json.put("id", page_id); solr_doc_json.put("volumeid_s", volume_id); solr_doc_json.put("eftext_txt", text); solr_add_json.put("commitWithin", 5000); solr_add_json.put("doc", solr_doc_json); solr_update_json = new JSONObject(); solr_update_json.put("add",solr_add_json); } else { System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); } } else { System.err.println("Warning: empty body field for '" + page_id + "'"); } } else { System.err.println("Warning: null page for '" + page_id + "'"); } /* /update/json/docs */ // For Reference ... // Example documentation on Solr JSON syntax: // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates /* curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary ' { "add": { "doc": { "id": "DOC1", "my_boosted_field": { use a map with boost/value for a boosted field "boost": 2.3, "value": "test" }, "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field } }, "add": { "commitWithin": 5000, commit this document within 5 seconds "overwrite": false, don't check for existing documents with the same uniqueKey "boost": 3.45, a document boost "doc": { "f1": "v1", Can use repeated keys for a multi-valued field "f1": "v2" } }, "commit": {}, "optimize": { "waitSearcher":false }, "delete": { "id":"ID" }, delete by ID "delete": { "query":"QUERY" } delete by query }' */ return solr_update_json; } protected static ArrayList generateTokenPostCountText(String volume_id, String page_id, JSONObject ef_page) { ArrayList word_list = new ArrayList(); if (ef_page != null) { JSONObject ef_body = ef_page.getJSONObject("body"); if (ef_body != null) { JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); if (ef_token_pos_count != null) { Iterator token_iter = ef_token_pos_count.keys(); while (token_iter.hasNext()) { String token = token_iter.next(); word_list.add(token); } } else { System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); } } else { System.err.println("Warning: empty body field for '" + page_id + "'"); } } else { System.err.println("Warning: null page for '" + page_id + "'"); } return word_list; } public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2) { try { BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2); bw.write(solr_add_doc_json.toString()); bw.close(); } catch (IOException e) { e.printStackTrace(); } catch (CompressorException e) { e.printStackTrace(); } } public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json) { //String curl_popen = "curl -X POST -H 'Content-Type: application/json'"; //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'"; //curl_popen += " --data-binary '"; //curl_popen += "'" try { HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection())); httpcon.setDoOutput(true); httpcon.setRequestProperty("Content-Type", "application/json"); httpcon.setRequestProperty("Accept", "application/json"); httpcon.setRequestMethod("POST"); httpcon.connect(); byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8"); OutputStream os = httpcon.getOutputStream(); os.write(outputBytes); os.close(); // Read response StringBuilder sb = new StringBuilder(); BufferedReader in = new BufferedReader(new InputStreamReader(httpcon.getInputStream())); String decodedString; while ((decodedString = in.readLine()) != null) { sb.append(decodedString); } in.close(); JSONObject solr_status_json = new JSONObject(sb.toString()); JSONObject response_header_json = solr_status_json.getJSONObject("responseHeader"); if (response_header_json != null) { int status = response_header_json.getInt("status"); if (status != 0) { System.err.println("Warning: POST request to " + post_url + " returned status " + status); System.err.println("Full response was: " + sb); } } else { System.err.println("Failed response to Solr POST: " + sb); } } catch (Exception e) { e.printStackTrace(); } } }