Changeset 31045 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java
- Timestamp: 2016-11-02T21:34:47+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java
r31030 r31045 1 1 package org.hathitrust.extractedfeatures; 2 2 3 import java.io.IOException; 3 4 import java.util.ArrayList; 4 5 import java.util.Iterator; … … 31 32 protected double _progress_step; 32 33 34 boolean _strict_file_io; 35 33 36 public PerPageJSONFlatmap(String input_dir, String solr_url, String output_dir, int verbosity, 34 DoubleAccumulator progress_accum, double progress_step) 37 DoubleAccumulator progress_accum, double progress_step, 38 boolean strict_file_io) 35 39 { 36 40 _input_dir = input_dir; … … 41 45 _progress_accum = progress_accum; 42 46 _progress_step = progress_step; 47 48 _strict_file_io = strict_file_io; 43 49 } 44 50 45 public Iterator<JSONObject> call(String json_file_in) 51 public Iterator<JSONObject> call(String json_file_in) throws IOException 46 52 //public void call(String json_file_in) 47 53 { 48 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(_input_dir + "/" + json_file_in); 54 String full_json_file_in = _input_dir + "/" + json_file_in; 55 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in); 49 56 50 String volume_id = extracted_feature_record.getString("id");57 ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>(); 51 58 52 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 53 //String title= ef_metadata.getString("title"); 54 55 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 56 57 58 int ef_page_count = ef_features.getInt("pageCount"); 59 60 if (_verbosity >= 1) { 61 System.out.println("Processing: " + json_file_in); 62 System.out.println(" pageCount = " + ef_page_count); 59 if (extracted_feature_record != null) { 60 String volume_id = extracted_feature_record.getString("id"); 61 62 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 63 //String title= ef_metadata.getString("title"); 64 65 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 66 67 68 
int ef_page_count = ef_features.getInt("pageCount"); 69 70 if (_verbosity >= 1) { 71 System.out.println("Processing: " + json_file_in); 72 System.out.println(" pageCount = " + ef_page_count); 73 } 74 75 JSONArray ef_pages = ef_features.getJSONArray("pages"); 76 int ef_num_pages = ef_pages.length(); 77 if (ef_num_pages != ef_page_count) { 78 System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")" 79 +" does not match 'pageCount' metadata (" + ef_page_count + ")"); 80 } 81 82 // Make directory for page-level JSON output 83 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 84 String page_json_dir = json_dir + "/pages"; 85 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 86 87 if (_verbosity >= 2) { 88 System.out.print(" Pages: "); 89 } 90 91 for (int i = 0; i < ef_page_count; i++) { 92 String formatted_i = String.format("page-%06d", i); 93 String page_id = volume_id + "." + formatted_i; 94 95 if (_verbosity >= 2) { 96 if (i>0) { 97 System.out.print(", "); 98 } 99 System.out.print(page_id); 100 } 101 102 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 103 104 if (i==(ef_page_count-1)) { 105 if (_verbosity >= 2) { 106 System.out.println(); 107 } 108 System.out.println("Sample output JSON page file: " + output_json_bz2); 109 } 110 111 JSONObject ef_page = ef_pages.getJSONObject(i); 112 113 if (ef_page != null) { 114 // Convert to Solr add form 115 JSONObject solr_add_doc_json = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page); 116 solr_add_doc_json.put("filename_json_bz2", output_json_bz2); 117 118 json_pages.add(solr_add_doc_json); 119 120 121 } 122 else { 123 System.err.println("Skipping: " + page_id); 124 } 125 126 } 63 127 } 64 65 JSONArray ef_pages = ef_features.getJSONArray("pages"); 66 int ef_num_pages = ef_pages.length(); 67 68 // Make directory for page-level JSON output 69 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 70 String 
page_json_dir = json_dir + "/pages"; 71 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 72 73 if (_verbosity >= 2) { 74 System.out.print(" Pages: "); 75 } 76 77 ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>(ef_num_pages); 78 for (int i = 0; i < ef_page_count; i++) { 79 String formatted_i = String.format("page-%06d", i); 80 String page_id = volume_id + "." + formatted_i; 81 82 if (_verbosity >= 2) { 83 if (i>0) { 84 System.out.print(", "); 85 } 86 System.out.print(page_id); 87 } 88 89 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 90 //ids.add(output_json_bz2); 91 92 if (i==(ef_page_count-1)) { 93 if (_verbosity >= 2) { 94 System.out.println(); 95 } 96 System.out.println("Sample output JSON page file: " + output_json_bz2); 97 } 98 99 JSONObject ef_page = ef_pages.getJSONObject(i); 100 101 if (ef_page != null) { 102 // Convert to Solr add form 103 JSONObject solr_add_doc_json = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page); 104 solr_add_doc_json.put("filename_json_bz2", output_json_bz2); 105 106 json_pages.add(solr_add_doc_json); 107 108 128 else { 129 // File did not exist, or could not be parsed 130 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'"; 131 if (_strict_file_io) { 132 throw new IOException(mess); 109 133 } 110 134 else { 111 System.err.println("Skipping: " + page_id); 135 System.err.println("Warning: " + mess); 136 System.out.println("Warning: " + mess); 112 137 } 113 114 138 } 115 116 117 //ids.add(volume_id); 139 118 140 _progress_accum.add(_progress_step); 119 141 120 //return ids.iterator();121 142 return json_pages.iterator(); 122 143 }
Note: See TracChangeset for help on using the changeset viewer.