source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/scripts/_RUN.sh@ 32109

Last change on this file since 32109 was 32109, checked in by davidb, 6 years ago

Changes made after testing through YARN

  • Property svn:executable set to *
File size: 3.5 KB
RevLine 
[30912]1#!/bin/bash
2
# To work, the following bash variables need to have been set:
4#
[32107]5# seq_file (output_dir optional)
[30926]6#
[32107]7# Or:
8#
9# json_filelist input_dir (output_dir optional)
10#
[30926]11# Typically done through running a wrapper script, such as:
12#
[32107]13# FULL-RUN-YARN-SPARK.sh
[30912]14
[32107]15
# Determine the ingest mode from the variables the calling wrapper
# script (e.g. FULL-RUN-YARN-SPARK.sh) has set:
#   seq_file                  -> "seq"  mode (sequence-file input)
#   json_filelist + input_dir -> "json" mode (JSON filelist input)
# If neither combination is present, print usage and abort.
show_usage=1
class_mode=""
if [ "x$seq_file" != "x" ] ; then
    show_usage=0
    class_mode="seq"
else

    if [ "x$json_filelist" != "x" ] ; then
	class_mode="json"
    fi

    if [ "x$input_dir" != "x" ] ; then
	# $class_mode must be quoted: when json_filelist is unset it is
	# empty, and the unquoted form '[ = "json" ]' was a test error
	if [ "$class_mode" = "json" ] ; then
	    show_usage=0
	fi
    fi

fi

if [ "$show_usage" = "1" ] ; then
    # (fixed unbalanced quote in the original message)
    echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist'" 1>&2
    exit 1
fi
40
[32107]41
[30975]42#if [ "x$output_dir" = "x" ] ; then
43# echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
44# exit
45#fi
[30926]46
run_jps=0
run_jps_daemons=""
run_jps_daemons_suffix="daemon"
using_hdfs=0

# Input or output living on HDFS is evidence the command runs over HDFS.
# Use direct [[ == pattern ]] matches rather than the former
# "x${var##hdfs://*}" = "x" tests: those also succeeded when the
# variable was EMPTY/unset (output_dir is optional), falsely flagging
# an HDFS run.
if [ "$class_mode" = "seq" ] ; then
    if [[ "$seq_file" == hdfs://* || "$output_dir" == hdfs://* ]] ; then
	run_jps=1
	run_jps_daemons="Spark"
	using_hdfs=1
    fi
fi

if [ "$class_mode" = "json" ] ; then
    if [[ "$input_dir" == hdfs://* || "$output_dir" == hdfs://* ]] ; then
	run_jps=1
	run_jps_daemons="Spark"
	using_hdfs=1
    fi
fi

# A spark:// master URL is evidence the command is submitted to a Spark
# cluster.  Again match the value directly so an empty/unset master_opt
# no longer triggers this branch.
# NOTE(review): the daemon labels look swapped (the HDFS branches say
# "Spark", this branch says "Hadoop") — preserved as-is; confirm intent.
if [[ "$master_opt" == "--master spark://"* ]] ; then
    run_jps=1
    if [ "x$run_jps_daemons" != "x" ] ; then
	run_jps_daemons="$run_jps_daemons and Hadoop"
	run_jps_daemons_suffix="daemons"
    else
	run_jps_daemons="Hadoop"
    fi
fi

# When daemons are expected, show what 'jps' reports so the user can
# confirm the cluster processes are actually up before the job launches.
if [ "$run_jps" = "1" ] ; then
    echo
    echo "****"
    echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':"
    echo "****"
    jps | egrep -v " Jps$" | sed 's/^/* /g' \
	| sed 's/ Master/ [Spark] Master/' \
	| sed 's/ NameNode/ [HDFS] NameNode/' \
	| sed 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/'

    echo "****"
    echo "* Done"
    echo "****"
    echo

    sleep 1
fi
98
# When running over HDFS, check whether the output directory already
# exists.  NOTE(review): despite the message, no 'hadoop fs -mkdir' is
# issued here — presumably the Spark job creates the directory itself;
# confirm before relying on this.
if [ "$using_hdfs" = "1" ] ; then
    if [ -n "$output_dir" ] ; then
	if ! hadoop fs -test -d "$output_dir" ; then
	    echo "Creating directory:"
	    echo " $output_dir"
	fi
    fi
fi
[31184]109
# Main class defaults to the Solr-ingest processor; the caller may
# override it by exporting 'classmain'.
if [ -z "$classmain" ] ; then
    classmain="org.hathitrust.extractedfeatures.ProcessForSolrIngest"
fi

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
cmd="spark-submit --class $classmain $master_opt $self_contained_jar"

# Only the two Solr-ingest main classes understand the
# --solr-base-url / --output-dir options.
case "$classmain" in
    org.hathitrust.extractedfeatures.ProcessForSolrIngest|org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist)
	if [ -n "$solr_base_url" ] ; then
	    cmd="$cmd --solr-base-url $solr_base_url"
	fi

	if [ -n "$output_dir" ] ; then
	    cmd="$cmd --output-dir $output_dir"
	fi
	;;
esac


# Positional input arguments depend on the detected mode.
if [ "$class_mode" = "seq" ] ; then
    cmd="$cmd --properties ef-solr.properties $seq_file $*"
    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $seq_file $*"
else
    cmd="$cmd --properties ef-solr.properties $input_dir $json_filelist $*"
    #cmd="$cmd --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties $input_dir $json_filelist $*"

fi
136
# Show the fully-assembled command before launching it.
# (fixed user-facing typo: "Lauching" -> "Launching")
echo "****"
echo "* Launching:"
echo "* $cmd"
echo "****"

if [ "$run_jps" = "1" ] ; then
    echo "* Monitor progress on Spark cluster through:"
    echo "* http://$SPARK_MASTER_HOST:8080/"
    echo "****"
fi
echo
sleep 2

# $cmd is deliberately unquoted: the assembled command line must be
# word-split back into the spark-submit program and its arguments.
$cmd
151
Note: See TracBrowser for help on using the repository browser.