Changeset 31252 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java
- Timestamp:
- 2016-12-20T14:15:05+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files:
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java
r31226 r31252 38 38 protected double _progress_step; 39 39 40 boolean _icu_tokenize; 40 41 boolean _strict_file_io; 41 42 … … 43 44 String solr_url, String output_dir, int verbosity, 44 45 DoubleAccumulator progress_accum, double progress_step, 45 boolean strict_file_io)46 boolean icu_tokenize, boolean strict_file_io) 46 47 { 47 48 _input_dir = input_dir; … … 55 56 _progress_step = progress_step; 56 57 58 _icu_tokenize = icu_tokenize; 57 59 _strict_file_io = strict_file_io; 58 60 … … 132 134 // Convert to Solr add form 133 135 JSONObject solr_add_doc_json 134 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter );136 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter,_icu_tokenize); 135 137 solr_add_doc_json.put("filename_json_bz2", output_json_bz2); 136 138 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31226 r31252 1 1 package org.hathitrust.extractedfeatures; 2 3 import java.io.IOException; 2 4 3 5 import org.apache.spark.api.java.function.VoidFunction; … … 32 34 protected double _progress_step; 33 35 36 boolean _icu_tokenize; 37 boolean _strict_file_io; 38 34 39 public PerVolumeJSON(String input_dir, String whitelist_filename, 35 40 String solr_url, String output_dir, int verbosity, 36 DoubleAccumulator progress_accum, double progress_step) 41 DoubleAccumulator progress_accum, double progress_step, 42 boolean icu_tokenize, boolean strict_file_io) 37 43 { 38 44 _input_dir = input_dir; … … 46 52 _progress_step = progress_step; 47 53 54 _icu_tokenize = icu_tokenize; 55 _strict_file_io = strict_file_io; 56 48 57 _whitelist_bloomfilter = null; 49 58 } 50 59 51 60 //public Iterator<String> call(String json_file_in) 52 public void call(String json_file_in) 61 public void call(String json_file_in) throws IOException 53 62 { 54 63 if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) { … … 56 65 } 57 66 58 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(_input_dir + "/" + json_file_in); 67 String full_json_file_in = _input_dir + "/" + json_file_in; 68 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in); 59 69 60 String volume_id = extracted_feature_record.getString("id"); 61 62 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 63 //String title= ef_metadata.getString("title"); 64 65 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 66 67 68 int ef_page_count = ef_features.getInt("pageCount"); 69 70 if (_verbosity >= 1) { 71 System.out.println("Processing: " + json_file_in); 72 System.out.println(" pageCount = " + ef_page_count); 73 } 74 75 JSONArray ef_pages = ef_features.getJSONArray("pages"); 76 int ef_num_pages = ef_pages.length(); 77 78 // Make directory for page-level JSON output 79 String json_dir = 
ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 80 String page_json_dir = json_dir + "/pages"; 81 82 if (_output_dir != null) { 83 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 84 } 85 86 //ArrayList<String> ids = new ArrayList<String>(ef_num_pages); 87 for (int i = 0; i < ef_page_count; i++) { 88 String formatted_i = String.format("page-%06d", i); 89 String page_id = volume_id + "." + formatted_i; 90 91 if (_verbosity >= 2) { 92 System.out.println(" Page: " + page_id); 70 if (extracted_feature_record != null) { 71 String volume_id = extracted_feature_record.getString("id"); 72 73 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata"); 74 //String title= ef_metadata.getString("title"); 75 76 JSONObject ef_features = extracted_feature_record.getJSONObject("features"); 77 78 int ef_page_count = ef_features.getInt("pageCount"); 79 80 if (_verbosity >= 1) { 81 System.out.println("Processing: " + json_file_in); 82 System.out.println(" pageCount = " + ef_page_count); 93 83 } 94 95 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 96 //ids.add(output_json_bz2); // **** 97 98 if (i==0) { 99 System.out.println("Sample output JSON page file: " + output_json_bz2); 84 85 JSONArray ef_pages = ef_features.getJSONArray("pages"); 86 int ef_num_pages = ef_pages.length(); 87 88 // Make directory for page-level JSON output 89 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2"); 90 String page_json_dir = json_dir + "/pages"; 91 92 if (_output_dir != null) { 93 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir); 100 94 } 101 102 JSONObject ef_page = ef_pages.getJSONObject(i);103 95 104 if (ef_page != null) {105 // Convert to Solr add form106 JSONObject solr_add_doc_json107 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter);96 //ArrayList<String> ids = new ArrayList<String>(ef_num_pages); 97 for (int i = 0; i < ef_page_count; i++) { 98 String 
formatted_i = String.format("page-%06d", i); 99 String page_id = volume_id + "." + formatted_i; 108 100 109 110 if ((_verbosity >=2) && (i==20)) { 111 System.out.println("=================="); 112 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 113 System.out.println("=================="); 101 if (_verbosity >= 2) { 102 System.out.println(" Page: " + page_id); 114 103 } 115 116 117 if (_solr_url != null) { 104 105 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2"; 106 //ids.add(output_json_bz2); // **** 107 108 if (i==0) { 109 System.out.println("Sample output JSON page file: " + output_json_bz2); 110 } 111 112 JSONObject ef_page = ef_pages.getJSONObject(i); 113 114 if (ef_page != null) { 115 // Convert to Solr add form 116 JSONObject solr_add_doc_json 117 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize); 118 119 118 120 if ((_verbosity >=2) && (i==20)) { 119 121 System.out.println("=================="); 120 System.out.println(" Posting to: " + _solr_url);122 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString()); 121 123 System.out.println("=================="); 122 124 } 123 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); 125 126 127 if (_solr_url != null) { 128 if ((_verbosity >=2) && (i==20)) { 129 System.out.println("=================="); 130 System.out.println("Posting to: " + _solr_url); 131 System.out.println("=================="); 132 } 133 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); 134 } 135 136 if (_output_dir != null) { 137 if ((_verbosity >=2) && (i==20)) { 138 System.out.println("=================="); 139 System.out.println("Saving to: " + _output_dir); 140 System.out.println("=================="); 141 } 142 SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2); 143 } 144 } 145 else { 146 System.err.println("Skipping: " + page_id); 124 147 } 125 148 
126 if (_output_dir != null) { 127 if ((_verbosity >=2) && (i==20)) { 128 System.out.println("=================="); 129 System.out.println("Saving to: " + _output_dir); 130 System.out.println("=================="); 131 } 132 SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2); 133 } 149 } 150 } 151 else { 152 // File did not exist, or could not be parsed 153 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'"; 154 if (_strict_file_io) { 155 throw new IOException(mess); 134 156 } 135 157 else { 136 System.err.println("Skipping: " + page_id); 158 System.err.println("Warning: " + mess); 159 System.out.println("Warning: " + mess); 137 160 } 138 139 161 } 140 141 162 142 163 //ids.add(volume_id); -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java
r31242 r31252 20 20 protected double _progress_step; 21 21 22 boolean _icu_tokenize; 22 23 boolean _strict_file_io; 23 24 24 25 public PerVolumeWordStreamFlatmap(String input_dir, int verbosity, 25 26 DoubleAccumulator progress_accum, double progress_step, 27 boolean icu_tokenize, 26 28 boolean strict_file_io) 27 29 { … … 32 34 _progress_step = progress_step; 33 35 36 _icu_tokenize = icu_tokenize; 34 37 _strict_file_io = strict_file_io; 35 38 } … … 87 90 if (ef_page != null) { 88 91 89 ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page );92 ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page, _icu_tokenize); 90 93 all_word_list.addAll(page_word_list); 91 94 } -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31220 r31252 121 121 System.err.println(); 122 122 123 boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize"); 124 boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 125 123 126 PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename, 124 _solr_url,_output_dir,_verbosity, progress_accum,per_vol); 127 _solr_url,_output_dir,_verbosity, progress_accum,per_vol, 128 icu_tokenize,strict_file_io); 125 129 126 130 json_list_data.foreach(per_vol_json); … … 161 165 DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent"); 162 166 163 //String strict_file_io_str = System.getProperty("wcsa-ef-ingest.strict-file-io","true");167 boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize"); 164 168 boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 165 169 166 170 PerPageJSONFlatmap paged_solr_json_flatmap 167 171 = new PerPageJSONFlatmap(_input_dir,_whitelist_filename, 168 172 _solr_url,_output_dir,_verbosity, 169 173 per_vol_progress_accum,per_vol, 170 strict_file_io);174 icu_tokenize,strict_file_io); 171 175 JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache(); 172 176 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java
r31251 r31252 72 72 73 73 boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 74 74 boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize"); 75 75 76 PerVolumeWordStreamFlatmap paged_solr_wordfreq_flatmap 76 77 = new PerVolumeWordStreamFlatmap(_input_dir,_verbosity, 77 78 per_vol_progress_accum,per_vol, 79 icu_tokenize, 78 80 strict_file_io); 79 81 JavaRDD<String> words = json_list_data.flatMap(paged_solr_wordfreq_flatmap); -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31245 r31252 24 24 public class SolrDocJSON { 25 25 26 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id )27 {28 boolean solr_icu_tokenize = true;26 protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id, 27 boolean icu_tokenize) 28 { 29 29 30 30 ArrayList<String> words = new ArrayList<String>(); … … 36 36 String token = token_iter.next(); 37 37 38 if ( solr_icu_tokenize == true) {38 if (icu_tokenize == true) { 39 39 Reader reader = new StringReader(token); 40 40 … … 79 79 80 80 protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id, 81 WhitelistBloomFilter whitelist_bloomfilter )82 { 83 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id );81 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize) 82 { 83 ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize); 84 84 85 85 StringBuilder sb = new StringBuilder(); … … 125 125 126 126 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page, 127 WhitelistBloomFilter whitelist_bloomfilter )127 WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize) 128 128 { 129 129 JSONObject solr_update_json = null; … … 137 137 JSONObject solr_add_json = new JSONObject(); 138 138 139 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter );139 String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize); 140 140 141 141 JSONObject solr_doc_json = new JSONObject(); … … 213 213 } 214 214 215 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page) 215 protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page, 216 boolean icu_tokenize) 216 217 { 217 218 ArrayList<String> word_list = null; … … 221 222 if (ef_body 
!= null) { 222 223 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount"); 223 word_list = getTokenPosCountWords(ef_token_pos_count,page_id );224 word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize); 224 225 } 225 226 else {
Note: See TracChangeset for help on using the changeset viewer.