Changeset 31028

Show
Ignore:
Timestamp:
02.11.2016 14:17:45 (3 years ago)
Author:
davidb
Message:

Support for randomly choosing Solr endpoints added in

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/_RUN.bash

    r31000 r31028  
    8282 
    8383 
    84 cmd="$cmd --verbosity 1 $input_dir $json_filelist $*" 
     84cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*" 
    8585 
    8686echo "****" 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties

    r31025 r31028  
    11 
    2 #ef-solr.process-json-mode = per-volume 
    3 ef-solr.process-json-mode = per-page 
     2#wcsa-ef-ingest.process-ef-json-mode = per-volume 
     3wcsa-ef-ingest.process-ef-json-mode = per-page 
     4 
     5wcsa-ef-ingest.solr-endpoint-ips = 10.11.0.53,10.11.0.54,10.11.0.55 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONMap.java

    r31013 r31028  
    11package org.hathitrust.extractedfeatures; 
     2 
     3import java.util.ArrayList; 
    24 
    35import org.apache.spark.api.java.function.Function; 
    46import org.apache.spark.api.java.function.MapFunction; 
    57import org.apache.spark.util.DoubleAccumulator; 
     8import org.apache.spark.util.LongAccumulator; 
    69import org.json.JSONObject; 
    710 
     
    1215     
    1316    protected String _input_dir; 
    14     protected String _solr_url; 
    1517    protected String _output_dir; 
    1618    protected int    _verbosity; 
    1719     
    18     protected DoubleAccumulator _progress_accum; 
    19     protected double            _progress_step; 
     20    protected final ArrayList<String> _solr_endpoints; 
     21    protected final int _solr_endpoints_len; 
     22     
     23    protected LongAccumulator _progress_accum; 
     24    protected long            _progress_step; 
    2025     
    2126     
    22     public PerPageJSONMap(String input_dir, String solr_url, String output_dir, int verbosity,  
    23                           DoubleAccumulator progress_accum, double progress_step) 
     27    public PerPageJSONMap(String input_dir, ArrayList<String> solr_endpoints, String output_dir, int verbosity,  
     28                          LongAccumulator progress_accum, long progress_step) 
    2429    { 
    2530        _input_dir  = input_dir; 
    26         _solr_url   = solr_url; 
    2731        _output_dir = output_dir; 
    2832        _verbosity  = verbosity; 
     33         
     34        _solr_endpoints   = solr_endpoints; 
     35        _solr_endpoints_len = _solr_endpoints.size(); 
    2936         
    3037        _progress_accum = progress_accum; 
     
    4552        } 
    4653         
    47                      
    48         if (_solr_url != null) { 
     54        String solr_url = null; 
     55        if (_solr_endpoints_len > 0) { 
     56            int random_choice = (int)(_solr_endpoints_len * Math.random()); 
     57            solr_url = _solr_endpoints.get(random_choice); 
     58        } 
     59                 
     60        if (solr_url != null) { 
    4961            if ((_verbosity >=2) && (random_test)) { 
    5062                System.out.println("=================="); 
    51                 System.out.println("Posting to: " + _solr_url); 
     63                System.out.println("Posting to: " + solr_url); 
    5264                System.out.println("=================="); 
    5365            } 
    54             SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); 
     66            SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json); 
    5567        } 
    5668 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31027 r31028  
    66import java.io.IOException; 
    77import java.io.Serializable; 
     8import java.util.ArrayList; 
     9 
    810import org.apache.commons.cli.*; 
    911 
    1012import org.apache.spark.api.java.*; 
    1113import org.apache.spark.util.DoubleAccumulator; 
     14import org.apache.spark.util.LongAccumulator; 
    1215import org.hathitrust.extractedfeatures.PerPageJSONFlatmap; 
    1316import org.json.JSONObject; 
     
    103106        jsc.close(); 
    104107    } 
    105  
     108    public ArrayList<String> extrapolateSolrEndpoints() 
     109    { 
     110        ArrayList<String> solr_endpoints = new ArrayList<String>(); 
     111         
     112        String solr_endpoint_ips = System.getProperty("wcsa-ef-ingest.solr-endpoint-ips",null); 
     113        if (solr_endpoint_ips != null) { 
     114            String [] ips = solr_endpoint_ips.split(","); 
     115            for (String ip : ips) { 
     116                String solr_endpoint = _solr_url.replaceFirst("//.*?:", "//"+ip+":"); 
     117                solr_endpoints.add(solr_endpoint); 
     118            } 
     119        } 
     120        else { 
     121            if (_solr_url != null) { 
     122                solr_endpoints.add(_solr_url); 
     123            } 
     124        } 
     125 
     126        return solr_endpoints; 
     127    } 
     128     
    106129    public void execPerPage() 
    107130    {    
     
    129152        //long num_page_ids = per_page_jsonobjects.count(); // trigger lazy eval of: flatmap:per-vol 
    130153 
    131         DoubleAccumulator per_page_progress_accum = jsc.sc().doubleAccumulator("Pages Processed"); 
     154        LongAccumulator per_page_progress_accum = jsc.sc().longAccumulator("Pages Processed"); 
     155        ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(); 
    132156         
    133157        PerPageJSONMap paged_json_id_map  
    134             = new PerPageJSONMap(_input_dir,_solr_url,_output_dir,_verbosity, per_page_progress_accum,1.0); 
     158            = new PerPageJSONMap(_input_dir,solr_endpoints,_output_dir,_verbosity, per_page_progress_accum,1); 
    135159        JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map); 
    136160 
     
    219243 
    220244         
    221         String verbosity_str = cmd.getOptionValue("verbosity","0"); 
     245        String verbosity_str = cmd.getOptionValue("verbosity","1"); 
    222246        int verbosity = Integer.parseInt(verbosity_str); 
    223247 
     
    269293            = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity); 
    270294         
    271         String process_json_mode = System.getProperty("ef-solr.process-json-mode","per-page"); 
    272         if (process_json_mode.equals("per-volume")) {  
     295        String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page"); 
     296        if (process_ef_json_mode.equals("per-volume")) {  
    273297            prep_for_ingest.execPerVolume(); 
    274298        }