Changeset 31783 for other-projects/hathitrust/wcsa/extracted-features-solr
- Timestamp: 2017-07-07T23:31:25+12:00 (7 years ago)
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31675 r31783 168 168 // Convert to Solr add form 169 169 JSONObject solr_add_doc_json 170 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _universal_langmap, _icu_tokenize); 170 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, 171 ef_metadata, ef_page, 172 _whitelist_bloomfilter, _universal_langmap, _icu_tokenize); 171 173 172 174 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31779 r31783 31 31 public class SolrDocJSON { 32 32 33 protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata) 34 { 35 JSONObject solr_update_json = null; 33 protected static String [] metadata_single = new String[] { 34 "accessProfile", 35 "bibliographicFormat", 36 "dateCreated", // date 37 //"enumerationChronology", // What is this? 38 //"governmentDocument", // bool: true/false 39 "handleUrl", 40 "hathitrustRecordNumber", // int? 41 "htBibUrl", 42 "imprint", 43 "issuance", 44 "language", 45 "lastUpdateDate", 46 "pubDate", 47 "pubPlace", 48 "rightsAttributes", 49 "schemaVersion", 50 "sourceInstitution", 51 "sourceInstitutionRecordNumber", 52 "title", 53 "typeOfResource", 54 "volumeIdentifier" 55 }; 56 57 protected static String [] metadata_multiple = new String[] { 58 "oclc", 59 "isbn", 60 "issn", 61 "lccn", 62 "genre", 63 "names" 64 }; 65 66 protected static String [] metadata_hashmap_multiple = new String[] { 67 "classification" 68 }; 69 70 protected static JSONObject generateMetadataSolrDocJSON(String id, JSONObject ef_metadata, boolean is_page_level) 71 { 36 72 /* 37 73 Example JSON for id: "gri.ark:/13960/t0003qw46 … … 81 117 */ 82 118 83 String [] metadata_single = new String[] { 84 "accessProfile", 85 "bibliographicFormat", 86 "dateCreated", // date 87 //"enumerationChronology", // What is this? 88 //"governmentDocument", // bool: true/false 89 "handleUrl", 90 "hathitrustRecordNumber", // int? 
91 "htBibUrl", 92 "imprint", 93 "issuance", 94 "language", 95 "lastUpdateDate", 96 "pubDate", 97 "pubPlace", 98 "rightsAttributes", 99 "schemaVersion", 100 "sourceInstitution", 101 "sourceInstitutionRecordNumber", 102 "title", 103 "typeOfResource", 104 "volumeIdentifier" 105 }; 106 107 String [] metadata_multiple = new String[] { 108 "oclc", 109 "isbn", 110 "issn", 111 "lccn", 112 "genre", 113 "names" 114 }; 115 116 String [] metadata_hashmap_multiple = new String[] { 117 "classification" 118 }; 119 119 120 // For JSON Solr format see: 121 // https://cwiki.apache.org/confluence/display/solr/Uploading+Data+with+Index+Handlers 122 123 //String title= ef_metadata.getString("title"); 124 125 JSONObject solr_doc_json = new JSONObject(); 126 solr_doc_json.put("id", id); 127 128 for (String metaname: metadata_single) { 129 String metavalue = ef_metadata.getString(metaname); 130 131 if (metavalue != null) { 132 if (is_page_level) { 133 solr_doc_json.put("volume"+metaname+"_txt",metavalue); 134 solr_doc_json.put("volume"+metaname+"_htrcstring",metavalue); 135 } 136 else { 137 solr_doc_json.put(metaname+"_t",metavalue); 138 solr_doc_json.put(metaname+"_s",metavalue); 139 } 140 } 141 } 142 143 for (String metaname: metadata_multiple) { 144 JSONArray metavalues = ef_metadata.getJSONArray(metaname); 145 if (metavalues != null) { 146 if (is_page_level) { 147 solr_doc_json.put("volume"+metaname+"_txt",metavalues); 148 solr_doc_json.put("volume"+metaname+"_htrcstrings",metavalues); 149 } 150 else { 151 solr_doc_json.put(metaname+"_t",metavalues); 152 solr_doc_json.put(metaname+"_ss",metavalues); 153 } 154 } 155 } 156 157 for (String metaname: metadata_hashmap_multiple) { 158 JSONObject metakeys = ef_metadata.getJSONObject(metaname); 159 160 if (metakeys != null) { 161 162 Iterator<String> metakey_iter = metakeys.keys(); 163 while (metakey_iter.hasNext()) { 164 String metakey = metakey_iter.next(); 165 166 JSONArray metavalues = metakeys.getJSONArray(metakey); 167 if (metavalues != 
null) { 168 String combined_metaname = metaname + "_" + metakey; 169 if (is_page_level) { 170 solr_doc_json.put("volume"+combined_metaname+"_txt",metavalues); 171 solr_doc_json.put("volume"+combined_metaname+"_htrcstrings",metavalues); 172 } 173 else { 174 solr_doc_json.put(combined_metaname+"_t",metavalues); 175 solr_doc_json.put(combined_metaname+"_ss",metavalues); 176 } 177 } 178 } 179 } 180 } 181 182 return solr_doc_json; 183 184 } 185 186 protected static JSONObject generateToplevelMetadataSolrDocJSON(String volume_id, JSONObject ef_metadata) 187 { 188 JSONObject solr_update_json = null; 120 189 121 190 if (ef_metadata != null) { … … 127 196 JSONObject solr_add_json = new JSONObject(); 128 197 129 JSONObject solr_doc_json = new JSONObject(); 130 solr_doc_json.put("id", volume_id); 131 132 for (String metaname: metadata_single) { 133 String metavalue = ef_metadata.getString(metaname); 134 if (metavalue != null) { 135 solr_doc_json.put(metaname+"_t",metavalue); 136 solr_doc_json.put(metaname+"_s",metavalue); 137 } 138 } 139 140 for (String metaname: metadata_multiple) { 141 JSONArray metavalues = ef_metadata.getJSONArray(metaname); 142 if (metavalues != null) { 143 solr_doc_json.put(metaname+"_t",metavalues); 144 solr_doc_json.put(metaname+"_ss",metavalues); 145 } 146 } 147 148 for (String metaname: metadata_hashmap_multiple) { 149 JSONObject metakeys = ef_metadata.getJSONObject(metaname); 150 151 if (metakeys != null) { 152 Iterator<String> metakey_iter = metakeys.keys(); 153 while (metakey_iter.hasNext()) { 154 String metakey = metakey_iter.next(); 155 156 JSONArray metavalues = metakeys.getJSONArray(metakey); 157 if (metavalues != null) { 158 String combined_metaname = metaname + "_" + metakey; 159 solr_doc_json.put(combined_metaname+"_t",metavalues); 160 solr_doc_json.put(combined_metaname+"_ss",metavalues); 161 } 162 } 163 } 164 } 198 JSONObject solr_doc_json = generateMetadataSolrDocJSON(volume_id,ef_metadata,false); 165 199 166 200 
solr_add_json.put("commitWithin", 60000); // used to be 5000 … … 179 213 180 214 215 216 181 217 182 218 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id, … … 599 635 } 600 636 601 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page, 637 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, 638 JSONObject ef_metadata, JSONObject ef_page, 602 639 WhitelistBloomFilter whitelist_bloomfilter, 603 640 UniversalPOSLangMap universal_langmap, … … 613 650 614 651 JSONObject solr_add_json = new JSONObject(); 615 652 616 653 ArrayList<POSString> text_al = filterSolrTextFields(ef_token_pos_count,page_id,whitelist_bloomfilter,universal_langmap,icu_tokenize); 617 654 618 JSONObject solr_doc_json = new JSONObject(); 619 solr_doc_json.put("id", page_id); 655 //JSONObject solr_doc_json = new JSONObject(); 656 JSONObject solr_doc_json = generateMetadataSolrDocJSON(page_id,ef_metadata,true); 657 658 //solr_doc_json.put("id", page_id); // now done in generateMetadataSolrDocJSON 620 659 solr_doc_json.put("volumeid_s", volume_id); 660 621 661 if (text_al.size()>0) { 622 662 addSolrLanguageTextFields(ef_page,text_al, universal_langmap, solr_doc_json); … … 626 666 solr_doc_json.put("efnotext_b", true); 627 667 } 628 solr_add_json.put("commitWithin", 5000);668 solr_add_json.put("commitWithin", 60000); // used to be 5000 629 669 solr_add_json.put("doc", solr_doc_json); 630 670
Note: See TracChangeset for help on using the changeset viewer.