#!/bin/bash

# To work, the following bash variables need to have been set:
#
#   json_filelist input_dir
#
# output_dir may also be set, but is optional (its check below is disabled).
#
# Typically done through running a wrapper script, such as:
#
#   RUN-PD-CLUSTER.bash

# Refuse to continue when a mandatory variable is missing or empty.  The
# explicit non-zero exit status lets a calling wrapper detect the failure
# (a bare 'exit' would report the status of the preceding echo, i.e. 0).
for required_var in json_filelist input_dir ; do
  # ${!required_var} is indirect expansion: the value of the variable
  # whose name is held in required_var.
  if [ -z "${!required_var}" ] ; then
    echo "_RUN.bash: Failed to set '$required_var'" 1>&2
    exit 1
  fi
done

# Check for output_dir kept for reference but disabled, so the variable
# may be left unset.
#if [ "x$output_dir" = "x" ] ; then
#  echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
#  exit
#fi
# Work out which cluster daemons this run depends on.
#
# Globals read:
#   input_dir, output_dir  - an 'hdfs://' prefix on either means HDFS is used
#   master_opt             - a '--master spark://' prefix means the job is
#                            submitted to a Spark cluster
# Globals written:
#   run_jps                - 1 when any daemon is needed (so 'jps' is worth
#                            running), otherwise 0
#   run_jps_daemons        - human-readable list of the daemons needed
#   run_jps_daemons_suffix - "daemon" or "daemons", to agree with the list
#   using_hdfs             - 1 when HDFS is in use, otherwise 0
detect_cluster_daemons() {
  local needs_hadoop=0
  local needs_spark=0

  run_jps=0
  run_jps_daemons=""
  run_jps_daemons_suffix="daemon"
  using_hdfs=0

  # Match the prefix directly.  An empty or unset variable must NOT count
  # as evidence (output_dir and master_opt are both optional).
  if [[ "$input_dir" == hdfs://* || "$output_dir" == hdfs://* ]] ; then
    # Evidence of running command over HDFS
    needs_hadoop=1
    using_hdfs=1
  fi

  if [[ "$master_opt" == "--master spark://"* ]] ; then
    # Evidence of running command submitted to Spark cluster
    needs_spark=1
  fi

  if [[ "$needs_spark" == "1" && "$needs_hadoop" == "1" ]] ; then
    run_jps=1
    run_jps_daemons="Spark and Hadoop"
    run_jps_daemons_suffix="daemons"
  elif [[ "$needs_hadoop" == "1" ]] ; then
    run_jps=1
    run_jps_daemons="Hadoop"
  elif [[ "$needs_spark" == "1" ]] ; then
    run_jps=1
    run_jps_daemons="Spark"
  fi
}

detect_cluster_daemons
# Print a banner followed by the Java processes that 'jps' reports, so the
# user can see whether the required daemons are up.
#
# Globals read: run_jps_daemons, run_jps_daemons_suffix (banner wording)
# Outputs:      the banner and the annotated process list, on stdout
report_running_daemons() {
  echo
  echo "****"
  echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':"
  echo "****"
  # Drop the entry for jps itself, prefix every line with "* ", and tag the
  # well-known daemon names with the service they belong to.  Plain grep is
  # enough here (the pattern uses no extended syntax; egrep is obsolescent).
  jps | grep -v ' Jps$' | sed \
    -e 's/^/* /' \
    -e 's/ Master/ [Spark] Master/' \
    -e 's/ NameNode/ [HDFS] NameNode/' \
    -e 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/'

  echo "****"
  echo "* Done"
  echo "****"
  echo
}

if [ "$run_jps" = "1" ] ; then
  report_running_daemons

  # Short pause before the script carries on printing.
  sleep 1
fi
# When HDFS is in use and an output directory was given, ask Hadoop whether
# that directory already exists and tell the user when it does not.
# NOTE(review): nothing in this section actually creates the directory,
# despite the wording of the message; presumably the launched job does
# that -- confirm.
if [[ "$using_hdfs" == "1" && -n "$output_dir" ]] ; then
  if ! hadoop fs -test -d "$output_dir" ; then
    echo "Creating directory:"
    echo " $output_dir"
  fi
fi
# Assemble the spark-submit invocation, announce it, then run it.
#
# Globals read:
#   classmain          - main class to run; defaults to ProcessForSolrIngest
#   master_opt         - optional, e.g. "--master spark://host:port"
#   solr_url           - optional, passed on as --solr-url
#   output_dir         - optional, passed on as --output-dir
#   input_dir, json_filelist - positional arguments of the job
#   run_jps, SPARK_MASTER_HOST - used only for the progress-monitoring hint
# Arguments: any command-line arguments are appended to the job's arguments
#
# The command is kept in an array so every value reaches spark-submit as
# exactly one argument, even if it contains whitespace or glob characters.

classmain="${classmain:-org.hathitrust.extractedfeatures.ProcessForSolrIngest}"

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar

# master_opt holds an option and its value in a single string, so it is the
# one value that has to be split into words (read -a splits on whitespace
# without performing pathname expansion).
master_args=()
if [[ -n "$master_opt" ]] ; then
  read -r -a master_args <<< "$master_opt"
fi

cmd=(spark-submit --class "$classmain" "${master_args[@]}" "$self_contained_jar")

if [[ -n "$solr_url" ]] ; then
  cmd+=(--solr-url "$solr_url")
fi

if [[ -n "$output_dir" ]] ; then
  cmd+=(--output-dir "$output_dir")
fi

cmd+=(--properties ef-solr.properties "$input_dir" "$json_filelist" "$@")

echo "****"
echo "* Lauching:"
echo "* ${cmd[*]}"
echo "****"

if [ "$run_jps" = "1" ] ; then
  echo "* Monitor progress on Spark cluster through:"
  echo "* http://$SPARK_MASTER_HOST:8080/"
  echo "****"
fi
echo
# Give the user a moment to read the banner before the job output starts.
sleep 2

# Last command of the script, so its exit status becomes the script's.
"${cmd[@]}"