Changeset 31028 for other-projects/hathitrust/wcsa/extracted-features-solr
- Timestamp:
- 2016-11-02T14:17:45+13:00 (6 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/_RUN.bash
r31000 r31028 82 82 83 83 84 cmd="$cmd --verbosity 1 $input_dir $json_filelist $*" 84 cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*" 85 85 86 86 echo "****" -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties
r31025 r31028 1 1 2 #ef-solr.process-json-mode = per-volume 3 ef-solr.process-json-mode = per-page 2 #wcsa-ef-ingest.process-ef-json-mode = per-volume 3 wcsa-ef-ingest.process-ef-json-mode = per-page 4 5 wcsa-ef-ingest.solr-endpoint-ips = 10.11.0.53,10.11.0.54,10.11.0.55 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONMap.java
r31013 r31028 1 1 package org.hathitrust.extractedfeatures; 2 3 import java.util.ArrayList; 2 4 3 5 import org.apache.spark.api.java.function.Function; 4 6 import org.apache.spark.api.java.function.MapFunction; 5 7 import org.apache.spark.util.DoubleAccumulator; 8 import org.apache.spark.util.LongAccumulator; 6 9 import org.json.JSONObject; 7 10 … … 12 15 13 16 protected String _input_dir; 14 protected String _solr_url;15 17 protected String _output_dir; 16 18 protected int _verbosity; 17 19 18 protected DoubleAccumulator _progress_accum; 19 protected double _progress_step; 20 protected final ArrayList<String> _solr_endpoints; 21 protected final int _solr_endpoints_len; 22 23 protected LongAccumulator _progress_accum; 24 protected long _progress_step; 20 25 21 26 22 public PerPageJSONMap(String input_dir, String solr_url, String output_dir, int verbosity,23 DoubleAccumulator progress_accum, doubleprogress_step)27 public PerPageJSONMap(String input_dir, ArrayList<String> solr_endpoints, String output_dir, int verbosity, 28 LongAccumulator progress_accum, long progress_step) 24 29 { 25 30 _input_dir = input_dir; 26 _solr_url = solr_url;27 31 _output_dir = output_dir; 28 32 _verbosity = verbosity; 33 34 _solr_endpoints = solr_endpoints; 35 _solr_endpoints_len = _solr_endpoints.size(); 29 36 30 37 _progress_accum = progress_accum; … … 45 52 } 46 53 47 48 if (_solr_url != null) { 54 String solr_url = null; 55 if (_solr_endpoints_len > 0) { 56 int random_choice = (int)(_solr_endpoints_len * Math.random()); 57 solr_url = _solr_endpoints.get(random_choice); 58 } 59 60 if (solr_url != null) { 49 61 if ((_verbosity >=2) && (random_test)) { 50 62 System.out.println("=================="); 51 System.out.println("Posting to: " + _solr_url);63 System.out.println("Posting to: " + solr_url); 52 64 System.out.println("=================="); 53 65 } 54 SolrDocJSON.postSolrDoc( _solr_url, solr_add_doc_json);66 SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json); 55 67 } 56 68 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31027 r31028 6 6 import java.io.IOException; 7 7 import java.io.Serializable; 8 import java.util.ArrayList; 9 8 10 import org.apache.commons.cli.*; 9 11 10 12 import org.apache.spark.api.java.*; 11 13 import org.apache.spark.util.DoubleAccumulator; 14 import org.apache.spark.util.LongAccumulator; 12 15 import org.hathitrust.extractedfeatures.PerPageJSONFlatmap; 13 16 import org.json.JSONObject; … … 103 106 jsc.close(); 104 107 } 105 108 public ArrayList<String> extrapolateSolrEndpoints() 109 { 110 ArrayList<String> solr_endpoints = new ArrayList<String>(); 111 112 String solr_endpoint_ips = System.getProperty("wcsa-ef-ingest.solr-endpoint-ips",null); 113 if (solr_endpoint_ips != null) { 114 String [] ips = solr_endpoint_ips.split(","); 115 for (String ip : ips) { 116 String solr_endpoint = _solr_url.replaceFirst("//.*?:", "//"+ip+":"); 117 solr_endpoints.add(solr_endpoint); 118 } 119 } 120 else { 121 if (_solr_url != null) { 122 solr_endpoints.add(_solr_url); 123 } 124 } 125 126 return solr_endpoints; 127 } 128 106 129 public void execPerPage() 107 130 { … … 129 152 //long num_page_ids = per_page_jsonobjects.count(); // trigger lazy eval of: flatmap:per-vol 130 153 131 DoubleAccumulator per_page_progress_accum = jsc.sc().doubleAccumulator("Pages Processed"); 154 LongAccumulator per_page_progress_accum = jsc.sc().longAccumulator("Pages Processed"); 155 ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(); 132 156 133 157 PerPageJSONMap paged_json_id_map 134 = new PerPageJSONMap(_input_dir, _solr_url,_output_dir,_verbosity, per_page_progress_accum,1.0);158 = new PerPageJSONMap(_input_dir,solr_endpoints,_output_dir,_verbosity, per_page_progress_accum,1); 135 159 JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map); 136 160 … … 219 243 220 244 221 String verbosity_str = cmd.getOptionValue("verbosity"," 0");245 String verbosity_str = cmd.getOptionValue("verbosity","1"); 222 246 int verbosity = Integer.parseInt(verbosity_str); 223 247 … … 
269 293 = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity); 270 294 271 String process_json_mode = System.getProperty("ef-solr.process-json-mode","per-page"); 272 if (process_json_mode.equals("per-volume")) { 295 String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page"); 296 if (process_ef_json_mode.equals("per-volume")) { 273 297 prep_for_ingest.execPerVolume(); 274 298 }
Note:
See TracChangeset
for help on using the changeset viewer.