source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/scripts/_RUN.sh@ 32107

Last change on this file since 32107 was 32107, checked in by davidb, 6 years ago

Rekindling the ability to run a JSON-filelist Spark run via YARN

  • Property svn:executable set to *
File size: 3.5 KB
Line 
1#!/bin/bash
2
# To work, the following bash variables need to have been set:
4#
5# seq_file (output_dir optional)
6#
7# Or:
8#
9# json_filelist input_dir (output_dir optional)
10#
11# Typically done through running a wrapper script, such as:
12#
13# FULL-RUN-YARN-SPARK.sh
14
15
# Determine the run mode from the caller-supplied environment variables.
#
# Modes:
#   seq  -- 'seq_file' is set (output_dir optional)
#   json -- both 'json_filelist' and 'input_dir' are set (output_dir optional)
#
# Anything else is a usage error.
show_usage=1
class_mode=""
if [ "x$seq_file" != "x" ] ; then
    show_usage=0
    class_mode="seq"
else

    if [ "x$json_filelist" != "x" ] ; then
        class_mode="json"
    fi

    if [ "x$input_dir" != "x" ] ; then
        # Bug fix: this previously tested $show_usage against "json", which can
        # never be true ($show_usage only ever holds 0 or 1), so a JSON-filelist
        # run always fell through to the usage error.  The intended test is on
        # $class_mode, set just above when 'json_filelist' is present.
        if [ "$class_mode" = "json" ] ; then
            show_usage=0
        fi
    fi

fi

if [ "$show_usage" = "1" ] ; then
    echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist'" 1>&2
    exit 1
fi
40
41
42#if [ "x$output_dir" = "x" ] ; then
43# echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
44# exit
45#fi
46
# Work out whether a 'jps' daemon sanity-check is warranted before launching,
# based on whether the input/output locations live on HDFS and whether the
# job is being submitted to a Spark cluster master.
run_jps=0
run_jps_daemons=""
run_jps_daemons_suffix="daemon"
using_hdfs=0

# Select the mode-specific input location whose scheme we inspect.
# NOTE(review): '${var##hdfs://*}' expands to empty both when the value starts
# with hdfs:// AND when the value is empty/unset -- so an unset output_dir also
# trips this branch.  Quirk preserved from the original; confirm if intended.
case "$class_mode" in
    seq)
        mode_input="$seq_file"
        ;;
    json)
        mode_input="$input_dir"
        ;;
    *)
        mode_input=""
        ;;
esac

if [ "$class_mode" = "seq" ] || [ "$class_mode" = "json" ] ; then
    if [ "x${mode_input##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
        # Evidence of running command over HDFS
        run_jps=1
        run_jps_daemons="Spark"
        using_hdfs=1
    fi
fi

if [ "x${master_opt##--master spark://*}" = "x" ] ; then
    # Evidence of running command submitted to Spark cluster
    # NOTE(review): as above, an empty/unset master_opt also satisfies this
    # test -- confirm that is the desired default.
    run_jps=1
    if [ "x$run_jps_daemons" = "x" ] ; then
        run_jps_daemons="Hadoop"
    else
        run_jps_daemons="$run_jps_daemons and Hadoop"
        run_jps_daemons_suffix="daemons"
    fi
fi
80
if [ "$run_jps" = "1" ] ; then
    echo
    echo "****"
    echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':"
    echo "****"
    # List the running JVMs, drop the 'jps' process itself, prefix each line
    # with '* ', and tag the well-known Spark/HDFS daemon names.  A single sed
    # with chained -e expressions replaces the original sed pipeline.
    jps | grep -E -v " Jps$" \
        | sed -e 's/^/* /g' \
              -e 's/ Master/ [Spark] Master/' \
              -e 's/ NameNode/ [HDFS] NameNode/' \
              -e 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/'

    echo "****"
    echo "* Done"
    echo "****"
    echo

    # Brief pause so the daemon listing is visible before launch output scrolls.
    sleep 1
fi
98
# When the run touches HDFS and an output directory was given, report whether
# that directory already exists.
# NOTE(review): despite the "Creating directory" message, nothing in this
# script runs 'hadoop fs -mkdir' -- presumably the Spark job creates the
# directory itself; confirm against the ingest code.
if [ "$using_hdfs" = "1" ] && [ "x$output_dir" != "x" ] ; then
    if ! hadoop fs -test -d "$output_dir" ; then
        echo "Creating directory:"
        echo "  $output_dir"
    fi
fi
109
# Default main class for the Spark job; ':=' assigns when classmain is unset
# or empty, matching the original explicit "x$var" test.
: "${classmain:=org.hathitrust.extractedfeatures.ProcessForSolrIngest}"

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
cmd="spark-submit --class $classmain $master_opt $self_contained_jar"

# Only the two Solr-ingest main classes understand the optional Solr/output
# flags, so gate appending them on the class name.
case "$classmain" in
    org.hathitrust.extractedfeatures.ProcessForSolrIngest|org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist)
        if [ "x$solr_base_url" != "x" ] ; then
            cmd="$cmd --solr-base-url $solr_base_url"
        fi

        if [ "x$output_dir" != "x" ] ; then
            cmd="$cmd --output-dir $output_dir"
        fi
        ;;
esac
126
127
# Append the mode-specific trailing arguments: the properties file, the
# mode's input location(s), then any extra arguments given to this script.
# (For a site-specific properties file, override the relative
# 'ef-solr.properties' path here.)
if [ "$class_mode" = "seq" ] ; then
    mode_args="$seq_file"
else
    mode_args="$json_filelist $input_dir"
fi
cmd="$cmd --properties ef-solr.properties $mode_args $*"
136
# Announce and run the assembled spark-submit command.
echo "****"
echo "* Launching:"
echo "* $cmd"
echo "****"

if [ "$run_jps" = "1" ] ; then
    echo "* Monitor progress on Spark cluster through:"
    echo "* http://$SPARK_MASTER_HOST:8080/"
    echo "****"
fi
echo
# Short grace period so the banner can be read (and the run aborted) before
# the job actually starts.
sleep 2

# NOTE(review): $cmd is deliberately expanded unquoted so the string built
# above word-splits into a command line; any argument containing whitespace
# (e.g. a path with spaces) will break here.
$cmd
151
Note: See TracBrowser for help on using the repository browser.