Changeset 31028 for other-projects


Ignore:
Timestamp:
2016-11-02T14:17:45+13:00 (7 years ago)
Author:
davidb
Message:

Support for randomly choosing Solr endpoints added in

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/_RUN.bash

    r31000 r31028  
    8282
    8383
    84 cmd="$cmd --verbosity 1 $input_dir $json_filelist $*"
     84cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*"
    8585
    8686echo "****"
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties

    r31025 r31028  
    11
    2 #ef-solr.process-json-mode = per-volume
    3 ef-solr.process-json-mode = per-page
     2#wcsa-ef-ingest.process-ef-json-mode = per-volume
     3wcsa-ef-ingest.process-ef-json-mode = per-page
     4
     5wcsa-ef-ingest.solr-endpoint-ips = 10.11.0.53,10.11.0.54,10.11.0.55
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONMap.java

    r31013 r31028  
    11package org.hathitrust.extractedfeatures;
     2
     3import java.util.ArrayList;
    24
    35import org.apache.spark.api.java.function.Function;
    46import org.apache.spark.api.java.function.MapFunction;
    57import org.apache.spark.util.DoubleAccumulator;
     8import org.apache.spark.util.LongAccumulator;
    69import org.json.JSONObject;
    710
     
    1215   
    1316    protected String _input_dir;
    14     protected String _solr_url;
    1517    protected String _output_dir;
    1618    protected int    _verbosity;
    1719   
    18     protected DoubleAccumulator _progress_accum;
    19     protected double            _progress_step;
     20    protected final ArrayList<String> _solr_endpoints;
     21    protected final int _solr_endpoints_len;
     22   
     23    protected LongAccumulator _progress_accum;
     24    protected long            _progress_step;
    2025   
    2126   
    22     public PerPageJSONMap(String input_dir, String solr_url, String output_dir, int verbosity,
    23                           DoubleAccumulator progress_accum, double progress_step)
     27    public PerPageJSONMap(String input_dir, ArrayList<String> solr_endpoints, String output_dir, int verbosity,
     28                          LongAccumulator progress_accum, long progress_step)
    2429    {
    2530        _input_dir  = input_dir;
    26         _solr_url   = solr_url;
    2731        _output_dir = output_dir;
    2832        _verbosity  = verbosity;
     33       
     34        _solr_endpoints   = solr_endpoints;
     35        _solr_endpoints_len = _solr_endpoints.size();
    2936       
    3037        _progress_accum = progress_accum;
     
    4552        }
    4653       
    47                    
    48         if (_solr_url != null) {
     54        String solr_url = null;
     55        if (_solr_endpoints_len > 0) {
     56            int random_choice = (int)(_solr_endpoints_len * Math.random());
     57            solr_url = _solr_endpoints.get(random_choice);
     58        }
     59               
     60        if (solr_url != null) {
    4961            if ((_verbosity >=2) && (random_test)) {
    5062                System.out.println("==================");
    51                 System.out.println("Posting to: " + _solr_url);
     63                System.out.println("Posting to: " + solr_url);
    5264                System.out.println("==================");
    5365            }
    54             SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
     66            SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json);
    5567        }
    5668
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31027 r31028  
    66import java.io.IOException;
    77import java.io.Serializable;
     8import java.util.ArrayList;
     9
    810import org.apache.commons.cli.*;
    911
    1012import org.apache.spark.api.java.*;
    1113import org.apache.spark.util.DoubleAccumulator;
     14import org.apache.spark.util.LongAccumulator;
    1215import org.hathitrust.extractedfeatures.PerPageJSONFlatmap;
    1316import org.json.JSONObject;
     
    103106        jsc.close();
    104107    }
    105 
     108    public ArrayList<String> extrapolateSolrEndpoints()
     109    {
     110        ArrayList<String> solr_endpoints = new ArrayList<String>();
     111       
     112        String solr_endpoint_ips = System.getProperty("wcsa-ef-ingest.solr-endpoint-ips",null);
     113        if (solr_endpoint_ips != null) {
     114            String [] ips = solr_endpoint_ips.split(",");
     115            for (String ip : ips) {
     116                String solr_endpoint = _solr_url.replaceFirst("//.*?:", "//"+ip+":");
     117                solr_endpoints.add(solr_endpoint);
     118            }
     119        }
     120        else {
     121            if (_solr_url != null) {
     122                solr_endpoints.add(_solr_url);
     123            }
     124        }
     125
     126        return solr_endpoints;
     127    }
     128   
    106129    public void execPerPage()
    107130    {   
     
    129152        //long num_page_ids = per_page_jsonobjects.count(); // trigger lazy eval of: flatmap:per-vol
    130153
    131         DoubleAccumulator per_page_progress_accum = jsc.sc().doubleAccumulator("Pages Processed");
     154        LongAccumulator per_page_progress_accum = jsc.sc().longAccumulator("Pages Processed");
     155        ArrayList<String> solr_endpoints = extrapolateSolrEndpoints();
    132156       
    133157        PerPageJSONMap paged_json_id_map
    134             = new PerPageJSONMap(_input_dir,_solr_url,_output_dir,_verbosity, per_page_progress_accum,1.0);
     158            = new PerPageJSONMap(_input_dir,solr_endpoints,_output_dir,_verbosity, per_page_progress_accum,1);
    135159        JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map);
    136160
     
    219243
    220244       
    221         String verbosity_str = cmd.getOptionValue("verbosity","0");
     245        String verbosity_str = cmd.getOptionValue("verbosity","1");
    222246        int verbosity = Integer.parseInt(verbosity_str);
    223247
     
    269293            = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);
    270294       
    271         String process_json_mode = System.getProperty("ef-solr.process-json-mode","per-page");
    272         if (process_json_mode.equals("per-volume")) {
     295        String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page");
     296        if (process_ef_json_mode.equals("per-volume")) {
    273297            prep_for_ingest.execPerVolume();
    274298        }
Note: See TracChangeset for help on using the changeset viewer.