- Timestamp:
- 2017-03-02T23:28:38+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31450 r31451 32 32 protected String _langmap_directory; 33 33 34 protected String _solr_url; 34 protected final ArrayList<String> _solr_endpoints; 35 protected final int _solr_endpoints_len; 36 37 //protected String _solr_url; 35 38 protected String _output_dir; 36 39 … … 44 47 45 48 public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directory, 46 String solr_url, String output_dir, int verbosity,49 ArrayList<String> solr_endpoints, String output_dir, int verbosity, 47 50 boolean icu_tokenize, boolean strict_file_io) 48 51 { … … 53 56 _langmap_directory = langmap_directory; 54 57 55 _solr_url = solr_url; 58 _solr_endpoints = solr_endpoints; 59 _solr_endpoints_len = solr_endpoints.size(); 60 61 //_solr_url = solr_url; 56 62 _output_dir = output_dir; 57 63 _verbosity = verbosity; … … 77 83 int ef_num_pages = 0; 78 84 85 String solr_url = null; 86 if (_solr_endpoints_len > 0) { 87 int random_choice = (int)(_solr_endpoints_len * Math.random()); 88 solr_url = _solr_endpoints.get(random_choice); 89 } 90 79 91 try { 80 92 … … 125 137 126 138 127 if ( _solr_url != null) {139 if (solr_url != null) { 128 140 if ((_verbosity >=2) && (i==20)) { 129 141 System.out.println("=================="); 130 System.out.println("Posting to: " + _solr_url);142 System.out.println("Posting to: " + solr_url); 131 143 System.out.println("=================="); 132 144 } 133 SolrDocJSON.postSolrDoc( _solr_url, solr_add_doc_json);145 SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json); 134 146 } 135 147 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31450 r31451 27 27 28 28 protected String _input_dir; 29 //protected String _json_list_filename; 29 protected String _solr_base_url; 30 protected String _solr_collection; 31 30 32 protected String _whitelist_filename; 31 33 protected String _langmap_directory; 32 34 33 protected String _solr_url;35 //protected String _solr_url; 34 36 protected String _output_dir; 35 37 36 38 protected int _verbosity; 37 39 38 public ProcessForSolrIngest(String input_dir, /*String json_list_filename,*/39 String solr_ url, String output_dir, int verbosity)40 public ProcessForSolrIngest(String input_dir, String solr_collection, 41 String solr_base_url, String output_dir, int verbosity) 40 42 { 41 43 _input_dir = input_dir; 42 //_json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;43 44 _solr_collection = solr_collection; 45 44 46 boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist"); 45 47 _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null; … … 49 51 50 52 51 _solr_ url = solr_url;53 _solr_base_url = solr_base_url; 52 54 _output_dir = output_dir; 53 55 _verbosity = verbosity; … … 57 59 { 58 60 String spark_app_name = "[" + exec_mode + "] Extract Features: Process for Solr Ingest"; 59 //spark_app_name += " [" + _json_list_filename+ "]";60 61 if (_solr_ url != null) {62 spark_app_name += " solr_ url=" + _solr_url;61 spark_app_name += " [" + _solr_collection + "]"; 62 63 if (_solr_base_url != null) { 64 spark_app_name += " solr_base_url=" + _solr_base_url; 63 65 } 64 66 … … 70 72 } 71 73 72 public ArrayList<String> extrapolateSolrEndpoints( )74 public ArrayList<String> extrapolateSolrEndpoints(String solr_collection) 73 75 { 74 76 ArrayList<String> solr_endpoints = new ArrayList<String>(); 75 77 76 if (_solr_url != null) { 78 if (_solr_base_url != null) { 79 String solr_url = _solr_base_url + "/" + solr_collection + "/update"; 80 77 81 String solr_cloud_nodes = 
System.getProperty("wcsa-ef-ingest.solr-cloud-nodes",null); 78 82 if (solr_cloud_nodes != null) { 79 83 String [] cloud_nodes = solr_cloud_nodes.split(","); 80 84 for (String cn : cloud_nodes) { 81 String solr_endpoint = _solr_url.replaceFirst("//.*?:\\d+/", "//"+cn+"/");85 String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//"+cn+"/"); 82 86 solr_endpoints.add(solr_endpoint); 83 87 } 84 88 } 85 89 else { 86 solr_endpoints.add( _solr_url);90 solr_endpoints.add(solr_url); 87 91 } 88 92 } … … 109 113 boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 110 114 115 ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection); 116 111 117 System.out.println("*** away to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory); 112 118 PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename, _langmap_directory, 113 _solr_url,_output_dir,_verbosity,119 solr_endpoints,_output_dir,_verbosity, 114 120 icu_tokenize,strict_file_io); 115 121 … … 250 256 public static void print_usage(HelpFormatter formatter, Options options) 251 257 { 252 formatter.printHelp("RUN.bash [options] input-dir json-filelist.txt", options);258 formatter.printHelp("RUN.bash [options] input-dir solr-collection", options); 253 259 } 254 260 … … 271 277 options.addOption(output_dir_opt); 272 278 273 Option solr_ url_opt = new Option("u", "solr-url", true,274 "If specified, the URL to post the Solr JSON data to");275 solr_ url_opt.setRequired(false);276 options.addOption(solr_ url_opt);279 Option solr_base_url_opt = new Option("u", "solr-base-url", true, 280 "If specified, the base URL to post the Solr JSON data to"); 281 solr_base_url_opt.setRequired(false); 282 options.addOption(solr_base_url_opt); 277 283 278 284 Option read_only_opt = new Option("r", "read-only", false, … … 304 310 305 311 String output_dir = cmd.getOptionValue("output-dir",null); 306 String solr_ url = cmd.getOptionValue("solr-url",null);312 
String solr_base_url = cmd.getOptionValue("solr-base-url",null); 307 313 boolean read_only = cmd.hasOption("read-only"); 308 314 309 315 String[] filtered_args = cmd.getArgs(); 310 316 311 if (filtered_args.length != 1) {317 if (filtered_args.length != 2) { 312 318 print_usage(formatter,options); 313 319 System.exit(1); … … 331 337 } 332 338 333 if (!read_only && ((output_dir == null) && (solr_ url==null))) {334 System.err.println("Need to specify either --solr- url or --output-dir otherwise generated files are not ingested/saved");339 if (!read_only && ((output_dir == null) && (solr_base_url==null))) { 340 System.err.println("Need to specify either --solr-base-url or --output-dir otherwise generated files are not ingested/saved"); 335 341 print_usage(formatter,options); 336 342 System.exit(1); … … 339 345 // For this case, need to ensure solr-url and output-dir are null 340 346 output_dir = null; 341 solr_ url = null;347 solr_base_url = null; 342 348 } 343 349 344 350 String input_dir = filtered_args[0]; 345 //String json_list_filename= filtered_args[1];351 String solr_collection = filtered_args[1]; 346 352 347 353 ProcessForSolrIngest prep_for_ingest 348 = new ProcessForSolrIngest(input_dir, /*json_list_filename,*/solr_url,output_dir,verbosity);354 = new ProcessForSolrIngest(input_dir,solr_collection,solr_base_url,output_dir,verbosity); 349 355 350 356 prep_for_ingest.execPerVolumeSequenceFile();
Note: See TracChangeset for help on using the changeset viewer.