#!/bin/bash

# To work, the following bash variables need to have been set:
#
#   json_filelist input_dir output_dir
#
# Optionally, 'master_opt' can also be set, e.g. to submit to a Spark cluster.
#
# Typically this is done by running a wrapper script, such as:
#
#   RUN-PD-CLUSTER.bash

if [ "x$json_filelist" = "x" ] ; then
    echo "_RUN.bash: Failed to set 'json_filelist'" 1>&2
    exit 1
fi

if [ "x$input_dir" = "x" ] ; then
    echo "_RUN.bash: Failed to set 'input_dir'" 1>&2
    exit 1
fi

if [ "x$output_dir" = "x" ] ; then
    echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
    exit 1
fi

run_jps=0
run_jps_daemons=""
run_jps_daemons_suffix="daemon"

# '##' strips the longest prefix matching 'hdfs://*'; an empty result means
# the directory lives on HDFS, so the Hadoop daemons should be running
if [ "x${input_dir##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
    # Evidence of running command over HDFS
    run_jps=1
    run_jps_daemons="Hadoop"
fi

# A '--master spark://...' option is evidence the job is being submitted
# to a Spark cluster, so the Spark daemons should be running too
if [ "x${master_opt##--master spark://*}" = "x" ] ; then
    # Evidence of running command submitted to Spark cluster
    run_jps=1
    if [ "x$run_jps_daemons" != "x" ] ; then
        run_jps_daemons="$run_jps_daemons and Spark"
        run_jps_daemons_suffix="daemons"
    else
        run_jps_daemons="Spark"
    fi
fi

if [ "$run_jps" = "1" ] ; then
    echo
    echo "****"
    echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix"
    echo "****"
    jps | sed 's/^/* /g'
    echo "****"
    echo "* Done"
    echo "****"
    echo
    sleep 1
fi

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar"
cmd="$base_cmd --verbosity 1 $json_filelist $input_dir $output_dir $*"

echo "****"
echo "* Launching:"
echo "* $cmd"
echo "****"
if [ "$run_jps" = "1" ] ; then
    echo "* Monitor progress on Spark cluster through:"
    echo "* http://10.10.0.52:8080/"
    echo "****"
fi
echo
sleep 1

$cmd

# Example invocations:
# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*
# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step1000.txt json-files solr-files $*
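
# For reference, a minimal sketch of what a wrapper script such as
# RUN-PD-CLUSTER.bash might look like. All values below are hypothetical
# (host, ports, and file names are assumptions, not taken from this repo);
# the real wrapper may differ:
#
#   #!/bin/bash
#   json_filelist=pd-file-listing.txt
#   input_dir=hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files
#   output_dir=hdfs://10.10.0.52:9000/user/htrc/pd-solr-json-files
#   master_opt="--master spark://10.10.0.52:7077"
#   . ./_RUN.bash "$@"
#
# Sourcing (rather than executing) _RUN.bash means the variables set above
# are visible to it without needing to be exported.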