#!/bin/bash # To work, the follow bash variables need to have been set: # # seq_file (output_dir optional) # # Or: # # json_filelist input_dir (output_dir optional) # # Typically done through running a wrapper script, such as: # # FULL-RUN-YARN-SPARK.sh show_usage=1 class_mode="" if [ "x$seq_file" != "x" ] ; then show_usage=0 class_mode="seq" else if [ "x$json_filelist" != "x" ] ; then class_mode="json" fi if [ "x$input_dir" != "x" ] ; then if [ $class_mode = "json" ] ; then show_usage=0 fi fi fi if [ $show_usage = "1" ] ; then echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist" 1>&2 exit 1 fi #if [ "x$output_dir" = "x" ] ; then # echo "_RUN.bash: Failed to set 'output_dir'" 1>&2 # exit #fi run_jps=0 run_jps_daemons="" run_jps_daemons_suffix="daemon" using_hdfs=0 if [ "$class_mode" = "seq" ] ; then if [ "x${seq_file##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then # Evidence of running command over HDFS run_jps=1 run_jps_daemons="Spark" using_hdfs=1 fi fi if [ "$class_mode" = "json" ] ; then if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then # Evidence of running command over HDFS run_jps=1 run_jps_daemons="Spark" using_hdfs=1 fi fi if [ "x${master_opt##--master spark://*}" = "x" ] ; then # Evidence of running command submitted to Spark cluster run_jps=1 if [ "x$run_jps_daemons" != "x" ] ; then run_jps_daemons="$run_jps_daemons and Hadoop" run_jps_daemons_suffix="daemons" else run_jps_daemons="Hadoop" fi fi if [ "$run_jps" = "1" ] ; then echo echo "****" echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':" echo "****" jps | egrep -v " Jps$" | sed 's/^/* /g' \ | sed 's/ Master/ [Spark] Master/' \ | sed 's/ NameNode/ [HDFS] NameNode/' \ | sed 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/' echo "****" echo "* Done" echo "****" echo sleep 1 fi if [ "$using_hdfs" = "1" ] ; then if [ "x$output_dir" != "x" ] ; then hadoop fs -test -d "$output_dir" if [ $? != 0 ] ; then echo "Creating directory:" echo " $output_dir" fi fi fi if [ "x$classmain" = "x" ] ; then classmain="org.hathitrust.extractedfeatures.ProcessForSolrIngest" fi self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar cmd="spark-submit --class $classmain $master_opt $self_contained_jar" if [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngest" ] || [ "$classmain" = "org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist" ] ; then if [ "x$solr_base_url" != "x" ] ; then cmd="$cmd --solr-base-url $solr_base_url" fi if [ "x$output_dir" != "x" ] ; then cmd="$cmd --output-dir $output_dir" fi fi if [ "$class_mode" = "seq" ] ; then cmd="$cmd --properties ef-solr.properties $seq_file $*" #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $seq_file $*" else cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*" #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $input_dir $json_filelist $*" fi echo "****" echo "* Lauching:" echo "* $cmd" echo "****" if [ "$run_jps" = "1" ] ; then echo "* Monitor progress on Spark cluster through:" echo "* http://$SPARK_MASTER_HOST:8080/" echo "****" fi echo sleep 2 $cmd