Changeset 32107


Ignore:
Timestamp:
2018-01-16T23:17:42+13:00 (4 years ago)
Author:
davidb
Message:

Rekindling the ability to run a JSON-filelist Spark run via YARN

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/scripts/_RUN.sh

    r31184 r32107  
    33# To work, the following bash variables need to have been set:
    44#
    5 #  json_filelist input_dir output_dir
     5#  seq_file (output_dir optional)
     6#
     7# Or:
     8#
     9#  json_filelist input_dir (output_dir optional)
    610#
    711# Typically done through running a wrapper script, such as:
    812#
    9 RUN-PD-CLUSTER.bash
     13FULL-RUN-YARN-SPARK.sh
    1014
    11 if [ "x$json_filelist" = "x" ] ; then
    12     echo "_RUN.bash: Failed to set 'json_filelist'" 1>&2
    13     exit
     15
     16show_usage=1
     17class_mode=""
     18if [ "x$seq_file" != "x" ] ; then
     19    show_usage=0
     20    class_mode="seq"
     21else
     22
     23   
     24    if [ "x$json_filelist" != "x" ] ; then
     25    class_mode="json"
     26    fi
     27
     28    if [ "x$input_dir" != "x" ] ; then
     29    if [ $show_usage = "json" ] ; then
     30        show_usage=0
     31    fi
     32    fi
     33
    1434fi
    1535
    16 if [ "x$input_dir" = "x" ] ; then
    17     echo "_RUN.bash: Failed to set 'input_dir'" 1>&2
    18     exit
     36if [ $show_usage = "1" ] ; then
     37    echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist" 1>&2
     38    exit 1
    1939fi
     40
    2041
    2142#if [ "x$output_dir" = "x" ] ; then
     
    2950using_hdfs=0
    3051
    31 if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
     52if [ "$class_mode" = "seq" ] ; then
     53  if [ "x${seq_file##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
    3254    # Evidence of running command over HDFS
    3355    run_jps=1
    3456    run_jps_daemons="Spark"
    3557    using_hdfs=1
     58  fi
     59fi
     60
     61if [ "$class_mode" = "json" ] ; then
     62  if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
     63    # Evidence of running command over HDFS
     64    run_jps=1
     65    run_jps_daemons="Spark"
     66    using_hdfs=1
     67  fi
    3668fi
    3769
     
    83115cmd="spark-submit --class $classmain $master_opt $self_contained_jar"
    84116
    85 if [ "x$solr_url" != "x" ] ; then
    86     cmd="$cmd --solr-url $solr_url"
    87 fi
     117if [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngest" ] || [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist" ] ; then
     118  if [ "x$solr_base_url" != "x" ] ; then
     119      cmd="$cmd --solr-base-url $solr_base_url"
     120  fi
    88121
    89 if [ "x$output_dir" != "x" ] ; then
     122  if [ "x$output_dir" != "x" ] ; then
    90123    cmd="$cmd --output-dir $output_dir"
     124  fi
    91125fi
    92126
    93127
    94 cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*"
     128if [ "$class_mode" = "seq" ] ; then
     129    cmd="$cmd --properties ef-solr.properties $seq_file $*"
     130    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $seq_file $*"
     131else
     132    cmd="$cmd --properties ef-solr.properties $json_filelist $input_dir $*"
     133    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $json_filelist $input_dir $*"
     134
     135fi
    95136
    96137echo "****"
Note: See TracChangeset for help on using the changeset viewer.