Changeset 30970 for other-projects
- Timestamp: 2016-10-28T11:10:32+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java
r30953 r30970 4 4 import java.io.BufferedWriter; 5 5 import java.io.IOException; 6 import java.io.OutputStream; 7 import java.net.HttpURLConnection; 8 import java.net.URL; 6 9 import java.util.ArrayList; 7 10 import java.util.Iterator; 11 import java.util.Set; 8 12 9 13 import org.apache.commons.compress.compressors.CompressorException; … … 39 43 protected JSONObject readJSONFile(String filename) 40 44 { 41 //Path path = Paths.get(filename);42 43 45 StringBuilder sb = new StringBuilder(); 44 46 … … 59 61 JSONObject json_obj = new JSONObject(sb.toString()); 60 62 63 61 64 return json_obj; 62 65 } 63 66 67 protected String generateSolrText(JSONObject ef_token_pos_count) 68 { 69 StringBuilder sb = new StringBuilder(); 70 71 Iterator<String> token_iter = ef_token_pos_count.keys(); 72 while (token_iter.hasNext()) { 73 String token = token_iter.next(); 74 75 sb.append(token); 76 if (token_iter.hasNext()) { 77 sb.append(" "); 78 } 79 } 80 81 /* 82 Set<String> token_keys = ef_token_pos_count.keySet(); 83 for (String token : token_keys) { 84 sb.append(token + " "); 85 } 86 */ 87 88 return sb.toString(); 89 } 90 91 protected JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page) 92 { 93 JSONObject solr_add_json = null; 94 95 if (ef_page != null) { 96 JSONObject ef_body = ef_page.getJSONObject("body"); 97 if (ef_body != null) { 98 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); 99 if (ef_token_pos_count != null) { 100 101 solr_add_json = new JSONObject(); 102 103 String text = generateSolrText(ef_token_pos_count); 104 105 106 107 JSONObject solr_doc_json = new JSONObject(); 108 solr_doc_json.put("id", page_id); 109 solr_doc_json.put("volumeid", volume_id); 110 solr_doc_json.put("text_t", text); 111 solr_doc_json.put("commitWithin", 5000); 112 113 solr_add_json.put("add", solr_doc_json); 114 } 115 else { 116 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'"); 117 } 118 } 119 else { 120 
System.err.println("Warning: empty body field for '" + page_id + "'"); 121 } 122 123 } 124 else { 125 System.err.println("Warning: null page for '" + page_id + "'"); 126 } 127 128 129 /* 130 131 /update/json/docs 132 */ 133 134 // For Reference ... 135 // Example documentation on Solr JSON syntax: 136 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers 137 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates 138 139 /* 140 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary ' 141 { 142 "add": { 143 "doc": { 144 "id": "DOC1", 145 "my_boosted_field": { use a map with boost/value for a boosted field 146 "boost": 2.3, 147 "value": "test" 148 }, 149 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field 150 } 151 }, 152 "add": { 153 "commitWithin": 5000, commit this document within 5 seconds 154 "overwrite": false, don't check for existing documents with the same uniqueKey 155 "boost": 3.45, a document boost 156 "doc": { 157 "f1": "v1", Can use repeated keys for a multi-valued field 158 "f1": "v2" 159 } 160 }, 161 162 "commit": {}, 163 "optimize": { "waitSearcher":false }, 164 165 "delete": { "id":"ID" }, delete by ID 166 "delete": { "query":"QUERY" } delete by query 167 }' 168 */ 169 170 //return solr_doc_json; 171 return solr_add_json; 172 } 173 174 protected void postSolrDoc(JSONObject solr_add_doc_json) 175 { 176 // "http://10.11.0.53:8983/solr/" 177 String post_url = "http://10.11.0.53:8983/solr/htrc-pd-ef/update"; 178 179 //String curl_popen = "curl -X POST -H 'Content-Type: application/json'"; 180 //curl_popen += " 'http://10.11.0.53:8983/solr/htrc-pd-ef/update'"; 181 //curl_popen += " --data-binary '"; 182 //curl_popen += "'" 183 184 185 try { 186 HttpURLConnection httpcon = (HttpURLConnection) ((new URL(post_url).openConnection())); 187 httpcon.setDoOutput(true); 188 httpcon.setRequestProperty("Content-Type", "application/json"); 
189 httpcon.setRequestProperty("Accept", "application/json"); 190 httpcon.setRequestMethod("POST"); 191 httpcon.connect(); 192 193 byte[] outputBytes = solr_add_doc_json.toString().getBytes("UTF-8"); 194 OutputStream os = httpcon.getOutputStream(); 195 os.write(outputBytes); 196 os.close(); 197 } 198 catch (Exception e) { 199 e.printStackTrace(); 200 } 201 202 } 64 203 public Iterator<String> call(String json_file_in) 65 204 { … … 69 208 70 209 71 String id = extracted_feature_record.getString("id");210 String volume_id = extracted_feature_record.getString("id"); 72 211 73 212 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 213 //String title= ef_metadata.getString("title"); 214 74 215 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 75 216 … … 89 230 String page_json_dir = json_dir + "/pages"; 90 231 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 91 //System.out.println("mkdir: " + _output_dir + "/" + page_json_dir);92 232 93 233 ArrayList<String> ids = new ArrayList<String>(ef_num_pages); 94 234 for (int i = 0; i < ef_page_count; i++) { 95 235 String formatted_i = String.format("page-%06d", i); 96 String page_id = id + "." + formatted_i;236 String page_id = volume_id + "." 
+ formatted_i; 97 237 98 238 if (_verbosity >= 2) { … … 107 247 } 108 248 109 // create JSON obj of just the page (for now), and write it out110 249 JSONObject ef_page = ef_pages.getJSONObject(i); 111 try { 112 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2); 113 bw.write(ef_page.toString()); 114 bw.close(); 115 } catch (IOException e) { 116 e.printStackTrace(); 117 } catch (CompressorException e) { 118 e.printStackTrace(); 119 } 120 250 251 if (ef_page != null) { 252 // Convert to Solr add form 253 JSONObject solr_add_doc_json = generateSolrDocJSON(volume_id, page_id, ef_page); 254 255 if (i==20) { 256 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 257 System.out.println("=================="); 258 //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("text_t")); 259 } 260 261 // create JSON obj of just the page (for now), and write it out 262 // write out the JSONOBject as a bz2 compressed file 263 /* 264 try { 265 BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2); 266 bw.write(ef_page.toString()); 267 bw.close(); 268 } catch (IOException e) { 269 e.printStackTrace(); 270 } catch (CompressorException e) { 271 e.printStackTrace(); 272 } 273 */ 274 275 //postSolrDoc(solr_add_doc_json); 276 277 } 278 else { 279 System.err.println("Skipping: " + page_id); 280 } 121 281 122 282 } … … 140 300 141 301 142 ids.add( id);302 ids.add(volume_id); 143 303 144 304 return ids.iterator();
Note: See TracChangeset for help on using the changeset viewer.