Changeset 32107

Show
Ignore:
Timestamp:
16.01.2018 23:17:42 (6 weeks ago)
Author:
davidb
Message:

Rekindling the ability to run a JSON-filelist Spark run via YARN

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
1 added
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/scripts/_RUN.sh

    r31184 r32107  
    33# To work, the following bash variables need to have been set: 
    44# 
    5 #  json_filelist input_dir output_dir 
     5#  seq_file (output_dir optional) 
     6# 
     7# Or: 
     8# 
     9#  json_filelist input_dir (output_dir optional) 
    610# 
    711# Typically done through running a wrapper script, such as: 
    812# 
    9 RUN-PD-CLUSTER.bash 
     13FULL-RUN-YARN-SPARK.sh 
    1014 
    11 if [ "x$json_filelist" = "x" ] ; then 
    12     echo "_RUN.bash: Failed to set 'json_filelist'" 1>&2 
    13     exit 
     15 
     16show_usage=1 
     17class_mode="" 
     18if [ "x$seq_file" != "x" ] ; then 
     19    show_usage=0 
     20    class_mode="seq" 
     21else 
     22 
     23     
     24    if [ "x$json_filelist" != "x" ] ; then 
     25    class_mode="json" 
     26    fi 
     27 
     28    if [ "x$input_dir" != "x" ] ; then 
     29    if [ $show_usage = "json" ] ; then 
     30        show_usage=0 
     31    fi 
     32    fi 
     33 
    1434fi 
    1535 
    16 if [ "x$input_dir" = "x" ] ; then 
    17     echo "_RUN.bash: Failed to set 'input_dir'" 1>&2 
    18     exit 
     36if [ $show_usage = "1" ] ; then 
     37    echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist" 1>&2 
     38    exit 1 
    1939fi 
     40 
    2041 
    2142#if [ "x$output_dir" = "x" ] ; then 
     
    2950using_hdfs=0 
    3051 
    31 if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then 
     52if [ "$class_mode" = "seq" ] ; then 
     53  if [ "x${seq_file##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then 
    3254    # Evidence of running command over HDFS 
    3355    run_jps=1 
    3456    run_jps_daemons="Spark" 
    3557    using_hdfs=1 
     58  fi 
     59fi 
     60 
     61if [ "$class_mode" = "json" ] ; then 
     62  if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then 
     63    # Evidence of running command over HDFS 
     64    run_jps=1 
     65    run_jps_daemons="Spark" 
     66    using_hdfs=1 
     67  fi 
    3668fi 
    3769 
     
    83115cmd="spark-submit --class $classmain $master_opt $self_contained_jar" 
    84116 
    85 if [ "x$solr_url" != "x" ] ; then 
    86     cmd="$cmd --solr-url $solr_url" 
    87 fi 
     117if [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngest" ] || [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist" ] ; then 
     118  if [ "x$solr_base_url" != "x" ] ; then 
     119      cmd="$cmd --solr-base-url $solr_base_url" 
     120  fi 
    88121 
    89 if [ "x$output_dir" != "x" ] ; then 
     122  if [ "x$output_dir" != "x" ] ; then 
    90123    cmd="$cmd --output-dir $output_dir" 
     124  fi 
    91125fi 
    92126 
    93127 
    94 cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*" 
     128if [ "$class_mode" = "seq" ] ; then 
     129    cmd="$cmd --properties ef-solr.properties $seq_file $*" 
     130    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $seq_file $*" 
     131else 
     132    cmd="$cmd --properties ef-solr.properties $json_filelist $input_dir $*" 
     133    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $json_filelist $input_dir $*" 
     134 
     135fi 
    95136 
    96137echo "****"