Changeset 31220 for other-projects
- Timestamp: 2016-12-12T20:18:04+13:00 (7 years ago)
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java
r31096 r31220 25 25 26 26 protected String _input_dir; 27 28 protected WhitelistBloomFilter _whitelist_bloomfilter; 29 27 30 protected String _solr_url; 28 31 protected String _output_dir; … … 34 37 boolean _strict_file_io; 35 38 36 public PerPageJSONFlatmap(String input_dir, String solr_url, String output_dir, int verbosity, 39 public PerPageJSONFlatmap(String input_dir, String whitelist_filename, 40 String solr_url, String output_dir, int verbosity, 37 41 DoubleAccumulator progress_accum, double progress_step, 38 42 boolean strict_file_io) 39 43 { 40 44 _input_dir = input_dir; 45 46 if (whitelist_filename != null) { 47 _whitelist_bloomfilter = new WhitelistBloomFilter(whitelist_filename,true); 48 } 49 41 50 _solr_url = solr_url; 42 51 _output_dir = output_dir; … … 51 60 52 61 public Iterator<JSONObject> call(String json_file_in) throws IOException 53 //public void call(String json_file_in)54 62 { 55 //ClusterFileIO.memory_usage("Before BZIP2 JSON file read"); 56 String full_json_file_in = _input_dir + "/" + json_file_in; 63 String full_json_file_in = _input_dir + "/" + json_file_in; 57 64 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in); 58 //ClusterFileIO.memory_usage("After BZIP2 JSON file read"); 59 65 60 66 ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>(); 61 67 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31096 r31220 20 20 private static final long serialVersionUID = 1L; 21 21 protected String _input_dir; 22 23 protected WhitelistBloomFilter _whitelist_bloomfilter; 24 22 25 protected String _solr_url; 23 26 protected String _output_dir; … … 27 30 protected double _progress_step; 28 31 29 public PerVolumeJSON(String input_dir, String solr_url, String output_dir, int verbosity, 32 public PerVolumeJSON(String input_dir, String whitelist_filename, 33 String solr_url, String output_dir, int verbosity, 30 34 DoubleAccumulator progress_accum, double progress_step) 31 35 { 32 36 _input_dir = input_dir; 37 38 if (whitelist_filename != null) { 39 _whitelist_bloomfilter = new WhitelistBloomFilter(whitelist_filename,true); 40 } 41 33 42 _solr_url = solr_url; 34 43 _output_dir = output_dir; -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31201 r31220 33 33 protected String _input_dir; 34 34 protected String _json_list_filename; 35 protected String _whitelist_filename; 35 36 protected String _solr_url; 36 37 protected String _output_dir; … … 44 45 _json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir; 45 46 47 boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist"); 48 _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null; 49 46 50 _solr_url = solr_url; 47 51 _output_dir = output_dir; … … 117 121 System.err.println(); 118 122 119 PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 123 PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename, 124 _solr_url,_output_dir,_verbosity, progress_accum,per_vol); 120 125 121 126 json_list_data.foreach(per_vol_json); … … 160 165 161 166 PerPageJSONFlatmap paged_solr_json_flatmap 162 = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, 167 = new PerPageJSONFlatmap(_input_dir,_whitelist_filename, 168 _solr_url,_output_dir,_verbosity, 163 169 per_vol_progress_accum,per_vol, 164 170 strict_file_io); … … 309 315 ProcessForSolrIngest prep_for_ingest 310 316 = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity); 311 312 313 boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist"); 314 315 if (use_whitelist) { 316 String whitelist_filename = System.getProperty("wcsa-ef-ingest.whitelist-filename"); 317 318 WhitelistBloomFilter whitelist_bloomfilter = new WhitelistBloomFilter(whitelist_filename,true); 319 whitelist_bloomfilter.contains("foo"); 320 } 321 322 317 323 318 String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page"); 324 319 if (process_ef_json_mode.equals("per-volume")) { -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31176 r31220 16 16 public class SolrDocJSON { 17 17 18 protected static String generateSolrText(JSONObject ef_token_pos_count )18 protected static String generateSolrText(JSONObject ef_token_pos_count, WhitelistBloomFilter whitelist_bloomfilter) 19 19 { 20 20 StringBuilder sb = new StringBuilder(); 21 21 22 22 Iterator<String> token_iter = ef_token_pos_count.keys(); 23 while (token_iter.hasNext()) { 24 String token = token_iter.next(); 25 26 sb.append(token); 27 if (token_iter.hasNext()) { 28 sb.append(" "); 29 } 30 } 31 23 24 if (whitelist_bloomfilter == null) { 25 26 while (token_iter.hasNext()) { 27 String token = token_iter.next(); 28 sb.append(token); 29 if (token_iter.hasNext()) { 30 sb.append(" "); 31 } 32 } 33 } 34 else { 35 while (token_iter.hasNext()) { 36 String token = token_iter.next(); 37 if (whitelist_bloomfilter.contains(token)) { 38 sb.append(token); 39 if (token_iter.hasNext()) { 40 sb.append(" "); 41 } 42 } 43 } 44 45 } 32 46 /* 33 47 Set<String> token_keys = ef_token_pos_count.keySet(); … … 40 54 } 41 55 42 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page) 56 protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page, 57 WhitelistBloomFilter whitelist_bloomfilter) 43 58 { 44 59 JSONObject solr_update_json = null; … … 52 67 JSONObject solr_add_json = new JSONObject(); 53 68 54 String text = generateSolrText(ef_token_pos_count );69 String text = generateSolrText(ef_token_pos_count,whitelist_bloomfilter); 55 70 56 71 JSONObject solr_doc_json = new JSONObject(); 57 72 solr_doc_json.put("id", page_id); 58 73 solr_doc_json.put("volumeid_s", volume_id); 59 solr_doc_json.put("eftext_txt", text); 60 74 if (!text.equals("")) { 75 solr_doc_json.put("eftext_txt", text); 76 } 77 else { 78 solr_doc_json.put("efnotext_b", true); 79 } 61 80 solr_add_json.put("commitWithin", 5000); 62 81 solr_add_json.put("doc", solr_doc_json);
Note: See TracChangeset for help on using the changeset viewer.