package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.util.DoubleAccumulator;
import org.json.JSONArray;
import org.json.JSONObject;
| 16 | /*
|
---|
| 17 | class PagedJSON implements Function<String, Boolean> {
|
---|
| 18 |
|
---|
| 19 | private static final long serialVersionUID = 1L;
|
---|
| 20 |
|
---|
| 21 | public Boolean call(String s) { return s.contains("a"); }
|
---|
| 22 | }
|
---|
| 23 | */
|
---|
| 24 |
|
---|
| 25 |
|
---|
[31266] | 26 | //public class PerVolumeJSON implements VoidFunction<String>
|
---|
[31372] | 27 | public class PerVolumeJSON implements Function<Text,Integer>
|
---|
[31001] | 28 | {
|
---|
| 29 | private static final long serialVersionUID = 1L;
|
---|
[31005] | 30 | protected String _input_dir;
|
---|
[31225] | 31 | protected String _whitelist_filename;
|
---|
[31375] | 32 | protected String _langmap_directory;
|
---|
[31225] | 33 |
|
---|
[31451] | 34 | protected final ArrayList<String> _solr_endpoints;
|
---|
| 35 | protected final int _solr_endpoints_len;
|
---|
| 36 |
|
---|
| 37 | //protected String _solr_url;
|
---|
[31005] | 38 | protected String _output_dir;
|
---|
[31225] | 39 |
|
---|
[31005] | 40 | protected int _verbosity;
|
---|
[31001] | 41 |
|
---|
[31225] | 42 | protected WhitelistBloomFilter _whitelist_bloomfilter;
|
---|
[31375] | 43 | protected UniversalPOSLangMap _universal_langmap;
|
---|
[31372] | 44 |
|
---|
[31375] | 45 | boolean _icu_tokenize;
|
---|
| 46 | boolean _strict_file_io;
|
---|
| 47 |
|
---|
[31450] | 48 | public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directory,
|
---|
[31451] | 49 | ArrayList<String> solr_endpoints, String output_dir, int verbosity,
|
---|
[31252] | 50 | boolean icu_tokenize, boolean strict_file_io)
|
---|
[31001] | 51 | {
|
---|
[31450] | 52 | System.out.println("*** PerVolumeJSON Constructor, langmap_directory = " + langmap_directory);
|
---|
| 53 |
|
---|
[31005] | 54 | _input_dir = input_dir;
|
---|
[31225] | 55 | _whitelist_filename = whitelist_filename;
|
---|
[31450] | 56 | _langmap_directory = langmap_directory;
|
---|
[31220] | 57 |
|
---|
[31451] | 58 | _solr_endpoints = solr_endpoints;
|
---|
| 59 | _solr_endpoints_len = solr_endpoints.size();
|
---|
| 60 |
|
---|
| 61 | //_solr_url = solr_url;
|
---|
[31005] | 62 | _output_dir = output_dir;
|
---|
| 63 | _verbosity = verbosity;
|
---|
| 64 |
|
---|
[31252] | 65 | _icu_tokenize = icu_tokenize;
|
---|
| 66 | _strict_file_io = strict_file_io;
|
---|
| 67 |
|
---|
[31225] | 68 | _whitelist_bloomfilter = null;
|
---|
[31375] | 69 | _universal_langmap = null;
|
---|
[31001] | 70 | }
|
---|
[31005] | 71 |
|
---|
[31372] | 72 |
|
---|
| 73 | public Integer call(Text json_text) throws IOException
|
---|
| 74 |
|
---|
| 75 | {
|
---|
[31500] | 76 | if (_whitelist_filename != null) {
|
---|
| 77 |
|
---|
| 78 | synchronized (_whitelist_filename) {
|
---|
| 79 | if (_whitelist_bloomfilter == null) {
|
---|
| 80 |
|
---|
| 81 | _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
|
---|
| 82 | }
|
---|
| 83 | }
|
---|
[31372] | 84 | }
|
---|
[31375] | 85 |
|
---|
[31500] | 86 | if (_langmap_directory != null) {
|
---|
| 87 |
|
---|
| 88 | synchronized (_langmap_directory) {
|
---|
| 89 | if (_universal_langmap == null) {
|
---|
| 90 | _universal_langmap = new UniversalPOSLangMap(_langmap_directory);
|
---|
| 91 | }
|
---|
| 92 | }
|
---|
[31375] | 93 | }
|
---|
[31500] | 94 |
|
---|
[31372] | 95 | int ef_num_pages = 0;
|
---|
| 96 |
|
---|
[31451] | 97 | String solr_url = null;
|
---|
| 98 | if (_solr_endpoints_len > 0) {
|
---|
| 99 | int random_choice = (int)(_solr_endpoints_len * Math.random());
|
---|
| 100 | solr_url = _solr_endpoints.get(random_choice);
|
---|
| 101 | }
|
---|
| 102 |
|
---|
[31372] | 103 | try {
|
---|
| 104 |
|
---|
| 105 |
|
---|
| 106 | JSONObject extracted_feature_record = new JSONObject(json_text.toString());
|
---|
| 107 |
|
---|
| 108 | if (extracted_feature_record != null) {
|
---|
| 109 | String volume_id = extracted_feature_record.getString("id");
|
---|
| 110 |
|
---|
[31505] | 111 | JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
|
---|
[31372] | 112 | //String title= ef_metadata.getString("title");
|
---|
| 113 |
|
---|
[31505] | 114 | //
|
---|
| 115 | // Top-level metadata Solr doc
|
---|
| 116 | //
|
---|
| 117 | JSONObject solr_add_metadata_doc_json = SolrDocJSON.generateToplevelMetadataSolrDocJSON(volume_id,ef_metadata);
|
---|
| 118 | if (solr_add_metadata_doc_json != null) {
|
---|
| 119 |
|
---|
| 120 | if ((_verbosity >=2)) {
|
---|
| 121 | System.out.println("==================");
|
---|
| 122 | System.out.println("Metadata JSON: " + solr_add_metadata_doc_json.toString());
|
---|
| 123 | System.out.println("==================");
|
---|
| 124 | }
|
---|
| 125 |
|
---|
| 126 | if (solr_url != null) {
|
---|
| 127 |
|
---|
| 128 | if ((_verbosity >=2) ) {
|
---|
| 129 | System.out.println("==================");
|
---|
| 130 | System.out.println("Posting to: " + solr_url);
|
---|
| 131 | System.out.println("==================");
|
---|
| 132 | }
|
---|
| 133 | SolrDocJSON.postSolrDoc(solr_url, solr_add_metadata_doc_json, volume_id, "top-level-metadata");
|
---|
| 134 | }
|
---|
| 135 | }
|
---|
[31675] | 136 |
|
---|
[31505] | 137 | //
|
---|
| 138 | // Now move on to POS extracted features per-page
|
---|
| 139 | //
|
---|
[31675] | 140 | boolean index_pages = true;
|
---|
| 141 | if (index_pages) {
|
---|
| 142 |
|
---|
| 143 | JSONObject ef_features = extracted_feature_record.getJSONObject("features");
|
---|
[31372] | 144 |
|
---|
[31675] | 145 | int ef_page_count = ef_features.getInt("pageCount");
|
---|
[31372] | 146 |
|
---|
[31675] | 147 | if (_verbosity >= 1) {
|
---|
| 148 | System.out.println("Processing: " + volume_id);
|
---|
| 149 | System.out.println(" pageCount = " + ef_page_count);
|
---|
| 150 | }
|
---|
[31372] | 151 |
|
---|
[31675] | 152 | JSONArray ef_pages = ef_features.getJSONArray("pages");
|
---|
| 153 | ef_num_pages = ef_pages.length();
|
---|
[31372] | 154 |
|
---|
| 155 |
|
---|
[31675] | 156 | for (int i = 0; i < ef_page_count; i++) {
|
---|
| 157 | String formatted_i = String.format("page-%06d", i);
|
---|
| 158 | String page_id = volume_id + "." + formatted_i;
|
---|
[31372] | 159 |
|
---|
[31675] | 160 | if (_verbosity >= 2) {
|
---|
| 161 | System.out.println(" Page: " + page_id);
|
---|
| 162 | }
|
---|
[31372] | 163 |
|
---|
| 164 |
|
---|
[31675] | 165 | JSONObject ef_page = ef_pages.getJSONObject(i);
|
---|
[31372] | 166 |
|
---|
[31675] | 167 | if (ef_page != null) {
|
---|
| 168 | // Convert to Solr add form
|
---|
| 169 | JSONObject solr_add_doc_json
|
---|
[31783] | 170 | = SolrDocJSON.generateSolrDocJSON(volume_id, page_id,
|
---|
| 171 | ef_metadata, ef_page,
|
---|
| 172 | _whitelist_bloomfilter, _universal_langmap, _icu_tokenize);
|
---|
[31372] | 173 |
|
---|
| 174 |
|
---|
[31675] | 175 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 176 | System.out.println("==================");
|
---|
| 177 | System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
|
---|
| 178 | System.out.println("==================");
|
---|
| 179 | }
|
---|
[31372] | 180 |
|
---|
| 181 |
|
---|
[31675] | 182 | if (solr_url != null) {
|
---|
| 183 | SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json,
|
---|
[31500] | 184 | volume_id, page_id);
|
---|
[31675] | 185 | }
|
---|
[31372] | 186 | }
|
---|
[31675] | 187 | else {
|
---|
| 188 | System.err.println("Skipping: " + page_id);
|
---|
| 189 | }
|
---|
| 190 |
|
---|
[31372] | 191 | }
|
---|
| 192 | }
|
---|
[31784] | 193 | else {
|
---|
| 194 | System.err.println("Skipping per-page POS text indexing");
|
---|
| 195 | }
|
---|
| 196 |
|
---|
[31372] | 197 | }
|
---|
| 198 | }
|
---|
| 199 | catch (Exception e) {
|
---|
| 200 | if (_strict_file_io) {
|
---|
| 201 | throw e;
|
---|
| 202 | }
|
---|
| 203 | else {
|
---|
| 204 | e.printStackTrace();
|
---|
| 205 | }
|
---|
| 206 | }
|
---|
| 207 |
|
---|
| 208 | return ef_num_pages;
|
---|
| 209 |
|
---|
| 210 | }
|
---|
| 211 |
|
---|
| 212 | /*
|
---|
[31266] | 213 | //public void call(String json_file_in) throws IOException
|
---|
[31372] | 214 | public Integer call(String json_file_in) throws IOException
|
---|
[31266] | 215 |
|
---|
[31001] | 216 | {
|
---|
[31226] | 217 | if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
|
---|
[31225] | 218 | _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
|
---|
| 219 | }
|
---|
| 220 |
|
---|
[31372] | 221 | int ef_num_pages = 0;
|
---|
| 222 |
|
---|
[31278] | 223 | ArrayList<String> ids = new ArrayList<String>(); // want it to be non-null so can return valid iterator
|
---|
[31266] | 224 |
|
---|
[31252] | 225 | String full_json_file_in = _input_dir + "/" + json_file_in;
|
---|
| 226 | JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
|
---|
[31001] | 227 |
|
---|
[31252] | 228 | if (extracted_feature_record != null) {
|
---|
| 229 | String volume_id = extracted_feature_record.getString("id");
|
---|
| 230 |
|
---|
| 231 | //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
|
---|
| 232 | //String title= ef_metadata.getString("title");
|
---|
| 233 |
|
---|
| 234 | JSONObject ef_features = extracted_feature_record.getJSONObject("features");
|
---|
| 235 |
|
---|
| 236 | int ef_page_count = ef_features.getInt("pageCount");
|
---|
| 237 |
|
---|
| 238 | if (_verbosity >= 1) {
|
---|
| 239 | System.out.println("Processing: " + json_file_in);
|
---|
| 240 | System.out.println(" pageCount = " + ef_page_count);
|
---|
[31001] | 241 | }
|
---|
[31252] | 242 |
|
---|
| 243 | JSONArray ef_pages = ef_features.getJSONArray("pages");
|
---|
[31372] | 244 | ef_num_pages = ef_pages.length();
|
---|
[31252] | 245 |
|
---|
| 246 | // Make directory for page-level JSON output
|
---|
| 247 | String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
|
---|
| 248 | String page_json_dir = json_dir + "/pages";
|
---|
| 249 |
|
---|
| 250 | if (_output_dir != null) {
|
---|
| 251 | ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
|
---|
[31001] | 252 | }
|
---|
[31278] | 253 |
|
---|
[31266] | 254 | ids = new ArrayList<String>(ef_num_pages);
|
---|
[31252] | 255 | for (int i = 0; i < ef_page_count; i++) {
|
---|
| 256 | String formatted_i = String.format("page-%06d", i);
|
---|
| 257 | String page_id = volume_id + "." + formatted_i;
|
---|
[31001] | 258 |
|
---|
[31252] | 259 | if (_verbosity >= 2) {
|
---|
| 260 | System.out.println(" Page: " + page_id);
|
---|
[31001] | 261 | }
|
---|
[31252] | 262 |
|
---|
| 263 | String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
|
---|
[31266] | 264 | ids.add(page_id);
|
---|
[31252] | 265 |
|
---|
[31269] | 266 | if (_verbosity >=2) {
|
---|
| 267 | if (i==0) {
|
---|
| 268 | System.out.println("Sample output JSON page file [i=0]: " + output_json_bz2);
|
---|
| 269 | }
|
---|
[31001] | 270 | }
|
---|
[31252] | 271 | JSONObject ef_page = ef_pages.getJSONObject(i);
|
---|
| 272 |
|
---|
| 273 | if (ef_page != null) {
|
---|
| 274 | // Convert to Solr add form
|
---|
| 275 | JSONObject solr_add_doc_json
|
---|
| 276 | = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
|
---|
| 277 |
|
---|
| 278 |
|
---|
[31001] | 279 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 280 | System.out.println("==================");
|
---|
[31252] | 281 | System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
|
---|
[31001] | 282 | System.out.println("==================");
|
---|
| 283 | }
|
---|
[31252] | 284 |
|
---|
| 285 |
|
---|
| 286 | if (_solr_url != null) {
|
---|
| 287 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 288 | System.out.println("==================");
|
---|
| 289 | System.out.println("Posting to: " + _solr_url);
|
---|
| 290 | System.out.println("==================");
|
---|
| 291 | }
|
---|
| 292 | SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
|
---|
| 293 | }
|
---|
| 294 |
|
---|
| 295 | if (_output_dir != null) {
|
---|
| 296 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 297 | System.out.println("==================");
|
---|
| 298 | System.out.println("Saving to: " + _output_dir);
|
---|
| 299 | System.out.println("==================");
|
---|
| 300 | }
|
---|
| 301 | SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
|
---|
| 302 | }
|
---|
[31001] | 303 | }
|
---|
[31252] | 304 | else {
|
---|
| 305 | System.err.println("Skipping: " + page_id);
|
---|
| 306 | }
|
---|
| 307 |
|
---|
[31001] | 308 | }
|
---|
[31252] | 309 | }
|
---|
| 310 | else {
|
---|
| 311 | // File did not exist, or could not be parsed
|
---|
| 312 | String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
|
---|
| 313 | if (_strict_file_io) {
|
---|
| 314 | throw new IOException(mess);
|
---|
| 315 | }
|
---|
[31001] | 316 | else {
|
---|
[31252] | 317 | System.err.println("Warning: " + mess);
|
---|
| 318 | System.out.println("Warning: " + mess);
|
---|
[31001] | 319 | }
|
---|
| 320 | }
|
---|
| 321 |
|
---|
[31372] | 322 | return ef_num_pages;
|
---|
| 323 |
|
---|
[31001] | 324 | }
|
---|
[31372] | 325 | */
|
---|
[31001] | 326 | }
|
---|
| 327 |
|
---|