Changeset 31675 for other-projects/hathitrust/wcsa/extracted-features-solr
- Timestamp: 2017-05-11T22:19:06+12:00 (7 years ago)
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31597 r31675 134 134 } 135 135 } 136 /*136 137 137 // 138 138 // Now move on to POS extracted features per-page 139 139 // 140 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 141 142 int ef_page_count = ef_features.getInt("pageCount"); 143 144 if (_verbosity >= 1) { 145 System.out.println("Processing: " + volume_id); 146 System.out.println(" pageCount = " + ef_page_count); 147 } 148 149 JSONArray ef_pages = ef_features.getJSONArray("pages"); 150 ef_num_pages = ef_pages.length(); 151 152 153 for (int i = 0; i < ef_page_count; i++) { 154 String formatted_i = String.format("page-%06d", i); 155 String page_id = volume_id + "." + formatted_i; 156 157 if (_verbosity >= 2) { 158 System.out.println(" Page: " + page_id); 159 } 160 161 162 JSONObject ef_page = ef_pages.getJSONObject(i); 163 164 if (ef_page != null) { 165 // Convert to Solr add form 166 JSONObject solr_add_doc_json 167 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _universal_langmap, _icu_tokenize); 168 169 170 if ((_verbosity >=2) && (i==20)) { 171 System.out.println("=================="); 172 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 173 System.out.println("=================="); 174 } 175 176 177 if (solr_url != null) { 178 SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json, 140 boolean index_pages = true; 141 if (index_pages) { 142 143 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 144 145 int ef_page_count = ef_features.getInt("pageCount"); 146 147 if (_verbosity >= 1) { 148 System.out.println("Processing: " + volume_id); 149 System.out.println(" pageCount = " + ef_page_count); 150 } 151 152 JSONArray ef_pages = ef_features.getJSONArray("pages"); 153 ef_num_pages = ef_pages.length(); 154 155 156 for (int i = 0; i < ef_page_count; i++) { 157 String formatted_i = String.format("page-%06d", i); 158 String page_id = volume_id + "." 
+ formatted_i; 159 160 if (_verbosity >= 2) { 161 System.out.println(" Page: " + page_id); 162 } 163 164 165 JSONObject ef_page = ef_pages.getJSONObject(i); 166 167 if (ef_page != null) { 168 // Convert to Solr add form 169 JSONObject solr_add_doc_json 170 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _universal_langmap, _icu_tokenize); 171 172 173 if ((_verbosity >=2) && (i==20)) { 174 System.out.println("=================="); 175 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 176 System.out.println("=================="); 177 } 178 179 180 if (solr_url != null) { 181 SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json, 179 182 volume_id, page_id); 180 }181 }182 else {183 System.err.println("Skipping: " + page_id);184 }185 186 }187 */183 } 184 } 185 else { 186 System.err.println("Skipping: " + page_id); 187 } 188 189 } 190 } 188 191 } 189 192 } -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31597 r31675 34 34 { 35 35 JSONObject solr_update_json = null; 36 36 /* 37 Example JSON for id: "gri.ark:/13960/t0003qw46 38 metadata: { 39 40 "accessProfile": "open", 41 "bibliographicFormat": "BK", 42 "classification": { 43 "lcc": [ 44 "ND646 .B8 1900" 45 ] 46 }, 47 "dateCreated": "2016-06-19T08:30:16.11199Z", 48 "enumerationChronology": " ", 49 "genre": [ 50 "not fiction" 51 ], 52 "governmentDocument": false, 53 "handleUrl": "http://hdl.handle.net/2027/gri.ark:/13960/t0003qw46", 54 "hathitrustRecordNumber": "100789562", 55 "htBibUrl": "http://catalog.hathitrust.org/api/volumes/full/htid/gri.ark:/13960/t0003qw46.json", 56 "imprint": "Burlington Fine Arts Club, 1900.", 57 "isbn": [], 58 "issn": [], 59 "issuance": "monographic", 60 "language": "eng", 61 "lastUpdateDate": "2015-09-14 13:25:03", 62 "lccn": [], 63 "names": [ 64 "Burlington Fine Arts Club " 65 ], 66 "oclc": [ 67 "25259734" 68 ], 69 "pubDate": "1900", 70 "pubPlace": "enk", 71 "rightsAttributes": "pd", 72 "schemaVersion": "1.3", 73 "sourceInstitution": "CMALG", 74 "sourceInstitutionRecordNumber": "9928077890001551", 75 "title": "Exhibition of pictures by Dutch masters of the seventeenth century.", 76 "typeOfResource": "text", 77 "volumeIdentifier": "gri.ark:/13960/t0003qw46" 78 79 } 80 81 */ 37 82 38 83 String [] metadata_single = new String[] { 39 84 "accessProfile", 40 "rightsAttributes", 41 "hathitrustRecordNumber", 42 "title", 85 "bibliographicFormat", 86 "dateCreated", // date 87 //"enumerationChronology", // What is this? 88 "governmentDocument", // bool: true/false 89 "handleUrl", 90 "hathitrustRecordNumber", // int? 
91 "htBibUrl", 43 92 "imprint", 93 "issuance", 94 "language", 95 "lastUpdateDate", 44 96 "pubDate", 45 97 "pubPlace", 46 "language", 47 "issuance", 48 "typeOfResource" 98 "rightsAttributes", 99 "schemaVersion", 100 "sourceInstitution", 101 "sourceInstitutionRecordNumber", 102 "title", 103 "typeOfResource", 104 "volumeIdentifier" 49 105 }; 50 106 … … 58 114 }; 59 115 116 String [] metadata_hashmap_multiple = new String[] { 117 "classification" 118 }; 119 120 60 121 if (ef_metadata != null) { 61 122 … … 82 143 solr_doc_json.put(metaname+"_t",metavalues); 83 144 solr_doc_json.put(metaname+"_ss",metavalues); 145 } 146 } 147 148 for (String metaname: metadata_hashmap_multiple) { 149 JSONObject metakeys = ef_metadata.getJSONObject(metaname); 150 151 if (metakeys != null) { 152 Iterator<String> metakey_iter = metakeys.keys(); 153 while (metakey_iter.hasNext()) { 154 String metakey = metakey_iter.next(); 155 156 JSONArray metavalues = metakeys.getJSONArray(metakey); 157 if (metavalues != null) { 158 String combined_metaname = metaname + "_" + metakey; 159 solr_doc_json.put(combined_metaname+"_t",metavalues); 160 solr_doc_json.put(combined_metaname+"_ss",metavalues); 161 } 162 } 84 163 } 85 164 }
Note: See TracChangeset for help on using the changeset viewer.