#!/bin/bash

# To work, the following bash variables need to have been set:
#
#   seq_file (output_dir optional)
#
# Or:
#
#   json_filelist input_dir (output_dir optional)
#
# Typically done through running a wrapper script, such as:
#
#   FULL-RUN-YARN-SPARK.sh


# Decide which input mode the caller has configured.
#
#   class_mode="seq"   when 'seq_file' is set
#   class_mode="json"  when 'json_filelist' is set (and 'seq_file' is not)
#
# show_usage stays 1 (report the problem on stderr and exit 1) unless
# either 'seq_file' is set, or both 'json_filelist' and 'input_dir' are.
show_usage=1
class_mode=""
if [[ -n "$seq_file" ]]; then
  show_usage=0
  class_mode="seq"
else
  if [[ -n "$json_filelist" ]]; then
    class_mode="json"
  fi

  # class_mode is still empty here when only 'input_dir' was supplied;
  # the comparison is quoted so that case is a clean "no match" instead
  # of a "unary operator expected" error from the test command.
  if [[ -n "$input_dir" && "$class_mode" == "json" ]]; then
    show_usage=0
  fi
fi

if [[ "$show_usage" == "1" ]]; then
  echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist'" 1>&2
  exit 1
fi

# 'output_dir' is optional, so the check below is left disabled.
#if [ "x$output_dir" = "x" ] ; then
#    echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
#    exit
#fi

#######################################
# Succeeds only when the given location starts with "hdfs://".
# An empty or unset value is NOT treated as an HDFS location.
# Arguments: $1 - file or directory location (may be empty)
# Returns:   0 if $1 begins with "hdfs://", non-zero otherwise
#######################################
_run_is_hdfs_uri() {
  [[ "$1" == hdfs://* ]]
}

#######################################
# Work out whether this run touches HDFS and/or is submitted to a
# Spark cluster, so the matching daemons can be checked with 'jps'.
# Globals read:
#   class_mode, seq_file, input_dir, output_dir, master_opt
# Globals written:
#   run_jps                 1 when a 'jps' check should be performed
#   run_jps_daemons         human-readable list of daemons to look for
#   run_jps_daemons_suffix  "daemon" or "daemons", to match the list
#   using_hdfs              1 when an input/output location is on HDFS
#######################################
_run_detect_daemons() {
  local input_location=""
  local check_locations=0

  run_jps=0
  run_jps_daemons=""
  run_jps_daemons_suffix="daemon"
  using_hdfs=0

  # The input location depends on the mode; output_dir is common to both.
  case "$class_mode" in
    seq)
      input_location=$seq_file
      check_locations=1
      ;;
    json)
      input_location=$input_dir
      check_locations=1
      ;;
  esac

  if ((check_locations)) \
    && { _run_is_hdfs_uri "$input_location" || _run_is_hdfs_uri "$output_dir"; }; then
    # Evidence of running command over HDFS
    run_jps=1
    run_jps_daemons="Hadoop"
    using_hdfs=1
  fi

  # Only an explicit "--master spark://..." counts; an empty master_opt
  # is not evidence of a Spark cluster.
  if [[ "$master_opt" == "--master spark://"* ]]; then
    # Evidence of running command submitted to Spark cluster
    run_jps=1
    if [[ -n "$run_jps_daemons" ]]; then
      run_jps_daemons="$run_jps_daemons and Spark"
      run_jps_daemons_suffix="daemons"
    else
      run_jps_daemons="Spark"
    fi
  fi
}

_run_detect_daemons

#######################################
# Filter for raw 'jps' output.
# Drops the entry for the 'jps' tool itself, prefixes every remaining
# line with "* ", and tags the Master, NameNode and SecondaryNameNode
# entries with "[Spark]" / "[HDFS]".
# Inputs:  'jps' output on stdin
# Outputs: annotated listing on stdout
#######################################
_run_annotate_jps() {
  # Plain grep is enough: the pattern uses no extended-regex syntax, and
  # 'egrep' is obsolescent in current GNU grep.
  grep -v ' Jps$' \
    | sed -e 's/^/* /' \
      -e 's/ Master/ [Spark] Master/' \
      -e 's/ NameNode/ [HDFS] NameNode/' \
      -e 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/'
}

# Show the running Java daemons when the earlier checks asked for it.
if [[ "$run_jps" == "1" ]]; then
  echo
  echo "****"
  echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':"
  echo "****"
  jps | _run_annotate_jps

  echo "****"
  echo "* Done"
  echo "****"
  echo

  # Brief pause before the script carries on.
  sleep 1
fi

# When HDFS is in use and an output directory was given, test whether
# that directory already exists and print a notice when it does not.
# NOTE(review): only the "Creating directory:" notice is printed here;
# no command in this block creates the directory -- confirm that it is
# created elsewhere (presumably by the launched job).
if [[ "$using_hdfs" == "1" && -n "$output_dir" ]]; then
  if ! hadoop fs -test -d "$output_dir"; then
    echo "Creating directory:"
    echo " $output_dir"
  fi
fi

# Main class to run; callers may override it by setting 'classmain'.
if [[ -z "$classmain" ]]; then
  classmain="org.hathitrust.extractedfeatures.ProcessForSolrIngest"
fi

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar

#######################################
# Assemble the spark-submit invocation as an array, one element per
# argument, so values containing whitespace or glob characters reach
# spark-submit unchanged.
# Globals read:
#   classmain, master_opt, self_contained_jar, solr_base_url,
#   output_dir, class_mode, seq_file, input_dir, json_filelist
# Globals written:
#   run_cmd_args - array holding the full command line
# Arguments:
#   extra arguments to append, passed through verbatim
#######################################
_run_build_cmd() {
  local -a master_args=()

  # master_opt carries several words in one string (for example
  # "--master spark://host:7077"), so it is split on whitespace here;
  # 'read -a' splits without performing filename expansion.
  read -r -a master_args <<<"$master_opt"

  run_cmd_args=(spark-submit --class "$classmain" "${master_args[@]}" "$self_contained_jar")

  # --solr-base-url / --output-dir are only passed to these two classes.
  case "$classmain" in
    org.hathitrust.extractedfeatures.ProcessForSolrIngest | \
      org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist)
      if [[ -n "$solr_base_url" ]]; then
        run_cmd_args+=(--solr-base-url "$solr_base_url")
      fi

      if [[ -n "$output_dir" ]]; then
        run_cmd_args+=(--output-dir "$output_dir")
      fi
      ;;
  esac

  # Commented-out alternative retained from the original script:
  #   --properties /homea/dbbridge/extracted-features-solr/solr-ingest/ef-solr.properties
  run_cmd_args+=(--properties ef-solr.properties)

  if [[ "$class_mode" == "seq" ]]; then
    run_cmd_args+=("$seq_file")
  else
    run_cmd_args+=("$input_dir" "$json_filelist")
  fi

  run_cmd_args+=("$@")
}

_run_build_cmd "$@"

# Flat, space-joined copy of the command; used for display only.
cmd="${run_cmd_args[*]}"

echo "****"
echo "* Lauching:"
echo "* $cmd"
echo "****"

if [[ "$run_jps" == "1" ]]; then
  echo "* Monitor progress on Spark cluster through:"
  echo "* http://$SPARK_MASTER_HOST:8080/"
  echo "****"
fi
echo
sleep 2

# Run from the array, not from $cmd, so no argument is re-split.
# Kept as the final command so its exit status becomes the script's.
"${run_cmd_args[@]}"