package org.hathitrust.extractedfeatures;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorException;
import org.json.JSONObject;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Static helpers that turn the {@code tokenPosCount} section of an Extracted
 * Features page into text for Solr, and that save or POST a Solr update
 * document.
 *
 * <p>NOTE(review): the copy of this file this revision was prepared from had
 * lost generic type arguments and a span of text between the first loop header
 * of {@link #generateSolrText} and the name of
 * {@link #generateTokenPosCountText}. The tail of {@code generateSolrText} and
 * the modifiers of {@code generateTokenPosCountText} were reconstructed; any
 * member that lived entirely inside the lost span is absent here and must be
 * restored from version control.
 */
public class SolrDocJSON {

    /**
     * Collects the words found in the keys of a {@code tokenPosCount} object.
     *
     * <p>Each key is run through Lucene's {@link ICUTokenizer}; every term the
     * tokenizer emits is appended to the result, so one key may contribute
     * zero, one or several words.
     *
     * @param ef_token_pos_count the {@code tokenPosCount} object, may be {@code null}
     * @param page_id            page identifier, used only in the warning message
     * @return the words in key-iteration order; empty (never {@code null}) when
     *         {@code ef_token_pos_count} is {@code null}
     */
    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id)
    {
        // Compile-time toggle kept from the original: when false the raw keys
        // are used as-is instead of being ICU-tokenized.
        boolean solr_icu_tokenize = true;

        ArrayList<String> words = new ArrayList<String>();

        if (ef_token_pos_count == null) {
            System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
            return words;
        }

        Iterator<String> token_iter = ef_token_pos_count.keys();
        while (token_iter.hasNext()) {
            String token = token_iter.next();

            if (solr_icu_tokenize) {
                appendICUTokens(token, words);
            }
            else {
                words.add(token);
            }
        }

        return words;
    }

    /**
     * Runs {@code token} through an {@link ICUTokenizer} and appends every
     * emitted term to {@code words}. An {@link IOException} is reported on
     * stderr; terms appended before the failure are kept.
     */
    private static void appendICUTokens(String token, ArrayList<String> words)
    {
        Reader reader = new StringReader(token);

        // try-with-resources guarantees close() even when reset() or
        // incrementToken() throws.
        try (Tokenizer tokenizer = new ICUTokenizer()) {
            tokenizer.setReader(reader);
            CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                words.add(charTermAttribute.toString());
            }
            tokenizer.end();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a single space-separated string from the words of a
     * {@code tokenPosCount} object.
     *
     * @param ef_token_pos_count    the {@code tokenPosCount} object, may be {@code null}
     * @param page_id               page identifier, used only in warning messages
     * @param whitelist_bloomfilter optional filter; when {@code null} every word is kept
     * @return the retained words joined by single spaces, or an empty string
     */
    protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter)
    {
        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id);

        StringBuilder sb = new StringBuilder();
        boolean first_append = true;

        for (int i = 0; i < tokens.size(); i++) {
            String token = tokens.get(i);

            // NOTE(review): the filtered branch was lost from the source this
            // was rebuilt from. Keeping only words accepted by the whitelist,
            // and the method name contains(), are assumptions -- confirm
            // against WhitelistBloomFilter.
            if (whitelist_bloomfilter != null && !whitelist_bloomfilter.contains(token)) {
                continue;
            }

            if (!first_append) {
                sb.append(" ");
            }
            first_append = false;
            sb.append(token);
        }

        return sb.toString();
    }

    /**
     * Extracts the word list for one page from its {@code body.tokenPosCount}
     * object.
     *
     * <p>NOTE(review): this method's modifiers were not recoverable;
     * {@code public static} was chosen as the most permissive option.
     *
     * @param volume_id volume identifier (currently unused)
     * @param page_id   page identifier, used only in warning messages
     * @param ef_page   the page object, may be {@code null}
     * @return the page's words, or {@code null} when the page or its
     *         {@code body} is missing (a warning is printed in either case)
     */
    public static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page)
    {
        if (ef_page == null) {
            System.err.println("Warning: null page for '" + page_id + "'");
            return null;
        }

        // optJSONObject returns null for an absent key, whereas getJSONObject
        // throws, which made the warning below unreachable.
        JSONObject ef_body = ef_page.optJSONObject("body");
        if (ef_body == null) {
            System.err.println("Warning: empty body field for '" + page_id + "'");
            return null;
        }

        // A missing tokenPosCount is reported by getTokenPosCountWords.
        JSONObject ef_token_pos_count = ef_body.optJSONObject("tokenPosCount");
        return getTokenPosCountWords(ef_token_pos_count, page_id);
    }

    /**
     * Writes the JSON document to a compressed file via {@code ClusterFileIO}.
     * Failures are reported on stderr and not propagated.
     *
     * @param solr_add_doc_json    the document to serialize
     * @param output_file_json_bz2 destination path handed to {@code ClusterFileIO}
     */
    public static void saveSolrDoc(JSONObject solr_add_doc_json, String output_file_json_bz2)
    {
        // The writer is closed even if write() fails.
        try (BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_file_json_bz2)) {
            bw.write(solr_add_doc_json.toString());
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        catch (CompressorException e) {
            e.printStackTrace();
        }
    }

    /**
     * POSTs the JSON document to {@code post_url} as {@code application/json}
     * and inspects the JSON reply: a {@code responseHeader.status} other than
     * 0, or a reply without a {@code responseHeader}, is reported on stderr.
     * Any exception is caught and its stack trace printed; nothing is
     * propagated to the caller.
     *
     * @param post_url          URL of the update endpoint to POST to
     * @param solr_add_doc_json the document to send
     */
    public static void postSolrDoc(String post_url, JSONObject solr_add_doc_json)
    {
        try {
            HttpURLConnection httpcon = (HttpURLConnection) new URL(post_url).openConnection();
            httpcon.setDoOutput(true);
            httpcon.setRequestProperty("Content-Type", "application/json");
            httpcon.setRequestProperty("Accept", "application/json");
            httpcon.setRequestMethod("POST");
            httpcon.connect();

            byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8");
            try (OutputStream os = httpcon.getOutputStream()) {
                os.write(outputBytes);
            }

            // On a 4xx/5xx reply getInputStream() throws and the body is only
            // available from the error stream; read it so the server's
            // explanation is not lost.
            int http_status = httpcon.getResponseCode();
            InputStream response_stream = (http_status >= HttpURLConnection.HTTP_BAD_REQUEST)
                    ? httpcon.getErrorStream()
                    : httpcon.getInputStream();
            if (response_stream == null) {
                System.err.println("Warning: POST request to " + post_url
                        + " returned HTTP status " + http_status + " with no response body");
                return;
            }

            // Read response
            StringBuilder sb = new StringBuilder();
            try (BufferedReader in = new BufferedReader(new InputStreamReader(response_stream, "UTF-8"))) {
                String decodedString;
                while ((decodedString = in.readLine()) != null) {
                    sb.append(decodedString);
                }
            }

            JSONObject solr_status_json = new JSONObject(sb.toString());
            // optJSONObject so that a reply without a responseHeader reaches
            // the "Failed response" branch instead of throwing.
            JSONObject response_header_json = solr_status_json.optJSONObject("responseHeader");

            if (response_header_json != null) {
                int status = response_header_json.getInt("status");
                if (status != 0) {
                    System.err.println("Warning: POST request to " + post_url + " returned status " + status);
                    System.err.println("Full response was: " + sb);
                }
            }
            else {
                System.err.println("Failed response to Solr POST: " + sb);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}