Changeset 31252 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
- Timestamp: 2016-12-20T14:15:05+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31226 r31252 1 1 package org.hathitrust.extractedfeatures; 2 3 import java.io.IOException; 2 4 3 5 import org.apache.spark.api.java.function.VoidFunction; … … 32 34 protected double _progress_step; 33 35 36 boolean _icu_tokenize; 37 boolean _strict_file_io; 38 34 39 public PerVolumeJSON(String input_dir, String whitelist_filename, 35 40 String solr_url, String output_dir, int verbosity, 36 DoubleAccumulator progress_accum, double progress_step) 41 DoubleAccumulator progress_accum, double progress_step, 42 boolean icu_tokenize, boolean strict_file_io) 37 43 { 38 44 _input_dir = input_dir; … … 46 52 _progress_step = progress_step; 47 53 54 _icu_tokenize = icu_tokenize; 55 _strict_file_io = strict_file_io; 56 48 57 _whitelist_bloomfilter = null; 49 58 } 50 59 51 60 //public Iterator<String> call(String json_file_in) 52 public void call(String json_file_in) 61 public void call(String json_file_in) throws IOException 53 62 { 54 63 if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) { … … 56 65 } 57 66 58 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(_input_dir + "/" + json_file_in); 67 String full_json_file_in = _input_dir + "/" + json_file_in; 68 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in); 59 69 60 String volume_id = extracted_feature_record.getString("id"); 61 62 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 63 //String title= ef_metadata.getString("title"); 64 65 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 66 67 68 int ef_page_count = ef_features.getInt("pageCount"); 69 70 if (_verbosity >= 1) { 71 System.out.println("Processing: " + json_file_in); 72 System.out.println(" pageCount = " + ef_page_count); 73 } 74 75 JSONArray ef_pages = ef_features.getJSONArray("pages"); 76 int ef_num_pages = ef_pages.length(); 77 78 // Make directory for page-level JSON output 79 String json_dir = 
ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 80 String page_json_dir = json_dir + "/pages"; 81 82 if (_output_dir != null) { 83 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 84 } 85 86 //ArrayList<String> ids = new ArrayList<String>(ef_num_pages); 87 for (int i = 0; i < ef_page_count; i++) { 88 String formatted_i = String.format("page-%06d", i); 89 String page_id = volume_id + "." + formatted_i; 90 91 if (_verbosity >= 2) { 92 System.out.println(" Page: " + page_id); 70 if (extracted_feature_record != null) { 71 String volume_id = extracted_feature_record.getString("id"); 72 73 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 74 //String title= ef_metadata.getString("title"); 75 76 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 77 78 int ef_page_count = ef_features.getInt("pageCount"); 79 80 if (_verbosity >= 1) { 81 System.out.println("Processing: " + json_file_in); 82 System.out.println(" pageCount = " + ef_page_count); 93 83 } 94 95 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 96 //ids.add(output_json_bz2); // **** 97 98 if (i==0) { 99 System.out.println("Sample output JSON page file: " + output_json_bz2); 84 85 JSONArray ef_pages = ef_features.getJSONArray("pages"); 86 int ef_num_pages = ef_pages.length(); 87 88 // Make directory for page-level JSON output 89 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 90 String page_json_dir = json_dir + "/pages"; 91 92 if (_output_dir != null) { 93 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 100 94 } 101 102 JSONObject ef_page = ef_pages.getJSONObject(i);103 95 104 if (ef_page != null) {105 // Convert to Solr add form106 JSONObject solr_add_doc_json107 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter);96 //ArrayList<String> ids = new ArrayList<String>(ef_num_pages); 97 for (int i = 0; i < ef_page_count; i++) { 98 String 
formatted_i = String.format("page-%06d", i); 99 String page_id = volume_id + "." + formatted_i; 108 100 109 110 if ((_verbosity >=2) && (i==20)) { 111 System.out.println("=================="); 112 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 113 System.out.println("=================="); 101 if (_verbosity >= 2) { 102 System.out.println(" Page: " + page_id); 114 103 } 115 116 117 if (_solr_url != null) { 104 105 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 106 //ids.add(output_json_bz2); // **** 107 108 if (i==0) { 109 System.out.println("Sample output JSON page file: " + output_json_bz2); 110 } 111 112 JSONObject ef_page = ef_pages.getJSONObject(i); 113 114 if (ef_page != null) { 115 // Convert to Solr add form 116 JSONObject solr_add_doc_json 117 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize); 118 119 118 120 if ((_verbosity >=2) && (i==20)) { 119 121 System.out.println("=================="); 120 System.out.println(" Posting to: " + _solr_url);122 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 121 123 System.out.println("=================="); 122 124 } 123 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); 125 126 127 if (_solr_url != null) { 128 if ((_verbosity >=2) && (i==20)) { 129 System.out.println("=================="); 130 System.out.println("Posting to: " + _solr_url); 131 System.out.println("=================="); 132 } 133 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); 134 } 135 136 if (_output_dir != null) { 137 if ((_verbosity >=2) && (i==20)) { 138 System.out.println("=================="); 139 System.out.println("Saving to: " + _output_dir); 140 System.out.println("=================="); 141 } 142 SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2); 143 } 144 } 145 else { 146 System.err.println("Skipping: " + page_id); 124 147 } 125 148 
126 if (_output_dir != null) { 127 if ((_verbosity >=2) && (i==20)) { 128 System.out.println("=================="); 129 System.out.println("Saving to: " + _output_dir); 130 System.out.println("=================="); 131 } 132 SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2); 133 } 149 } 150 } 151 else { 152 // File did not exist, or could not be parsed 153 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'"; 154 if (_strict_file_io) { 155 throw new IOException(mess); 134 156 } 135 157 else { 136 System.err.println("Skipping: " + page_id); 158 System.err.println("Warning: " + mess); 159 System.out.println("Warning: " + mess); 137 160 } 138 139 161 } 140 141 162 142 163 //ids.add(volume_id);
Note: See TracChangeset for help on using the changeset viewer.