source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/JSONSolrTransform.java@ 30996

Last change on this file since 30996 was 30996, checked in by davidb, 7 years ago

Code refactoring

  • Property svn:executable set to *
File size: 3.2 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.util.Iterator;
4
5import org.json.JSONObject;
6
7public class JSONSolrTransform {
8
9 protected static String generateSolrText(JSONObject ef_token_pos_count)
10 {
11 StringBuilder sb = new StringBuilder();
12
13 Iterator<String> token_iter = ef_token_pos_count.keys();
14 while (token_iter.hasNext()) {
15 String token = token_iter.next();
16
17 sb.append(token);
18 if (token_iter.hasNext()) {
19 sb.append(" ");
20 }
21 }
22
23 /*
24 Set<String> token_keys = ef_token_pos_count.keySet();
25 for (String token : token_keys) {
26 sb.append(token + " ");
27 }
28 */
29
30 return sb.toString();
31 }
32
33 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page)
34 {
35 JSONObject solr_update_json = null;
36
37 if (ef_page != null) {
38 JSONObject ef_body = ef_page.getJSONObject("body");
39 if (ef_body != null) {
40 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
41 if (ef_token_pos_count != null) {
42
43 JSONObject solr_add_json = new JSONObject();
44
45 String text = generateSolrText(ef_token_pos_count);
46
47 JSONObject solr_doc_json = new JSONObject();
48 solr_doc_json.put("id", page_id);
49 solr_doc_json.put("volumeid_s", volume_id);
50 solr_doc_json.put("eftext_txt", text);
51
52 solr_add_json.put("commitWithin", 5000);
53 solr_add_json.put("doc", solr_doc_json);
54
55 solr_update_json = new JSONObject();
56 solr_update_json.put("add",solr_add_json);
57
58 }
59 else {
60 System.err.println("Warning: empty tokenPosCount field for '" + page_id + "'");
61 }
62 }
63 else {
64 System.err.println("Warning: empty body field for '" + page_id + "'");
65 }
66
67 }
68 else {
69 System.err.println("Warning: null page for '" + page_id + "'");
70 }
71
72
73 /*
74
75 /update/json/docs
76 */
77
78 // For Reference ...
79 // Example documentation on Solr JSON syntax:
80 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers
81 // #UploadingDatawithIndexHandlers-JSONFormattedIndexUpdates
82
83 /*
84 curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/my_collection/update' --data-binary '
85 {
86 "add": {
87 "doc": {
88 "id": "DOC1",
89 "my_boosted_field": { use a map with boost/value for a boosted field
90 "boost": 2.3,
91 "value": "test"
92 },
93 "my_multivalued_field": [ "aaa", "bbb" ] Can use an array for a multi-valued field
94 }
95 },
96 "add": {
97 "commitWithin": 5000, commit this document within 5 seconds
98 "overwrite": false, don't check for existing documents with the same uniqueKey
99 "boost": 3.45, a document boost
100 "doc": {
101 "f1": "v1", Can use repeated keys for a multi-valued field
102 "f1": "v2"
103 }
104 },
105
106 "commit": {},
107 "optimize": { "waitSearcher":false },
108
109 "delete": { "id":"ID" }, delete by ID
110 "delete": { "query":"QUERY" } delete by query
111 }'
112 */
113
114 return solr_update_json;
115 }
116
117}
Note: See TracBrowser for help on using the repository browser.