source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/scripts/_RUN.sh@ 32107

Last change on this file since 32107 was 32107, checked in by davidb, 6 years ago

Rekindling the ability to run a JSON-filelist Spark run via YARN

  • Property svn:executable set to *
File size: 3.5 KB
Line 
1#!/bin/bash
2
# To work, the following bash variables need to have been set:
4#
5# seq_file (output_dir optional)
6#
7# Or:
8#
9# json_filelist input_dir (output_dir optional)
10#
11# Typically done through running a wrapper script, such as:
12#
13# FULL-RUN-YARN-SPARK.sh
14
15
# Determine the run mode from the caller-supplied environment variables.
#
# Modes:
#   seq  -- 'seq_file' is set (output_dir optional)
#   json -- both 'json_filelist' and 'input_dir' are set (output_dir optional)
#
# Anything else is a usage error.
show_usage=1
class_mode=""
if [ "x$seq_file" != "x" ] ; then
    show_usage=0
    class_mode="seq"
else

    if [ "x$json_filelist" != "x" ] ; then
        class_mode="json"
    fi

    if [ "x$input_dir" != "x" ] ; then
        # Bug fix: this previously tested $show_usage against "json", which can
        # never be true ($show_usage only ever holds 0 or 1), so a JSON-filelist
        # run always fell through to the usage error.  The intended test is on
        # $class_mode, set just above when 'json_filelist' is present.
        if [ "$class_mode" = "json" ] ; then
            show_usage=0
        fi
    fi

fi

if [ "$show_usage" = "1" ] ; then
    echo "_RUN.bash: Failed to set 'seq_file' or 'input_dir json_filelist'" 1>&2
    exit 1
fi
40
41
42#if [ "x$output_dir" = "x" ] ; then
43# echo "_RUN.bash: Failed to set 'output_dir'" 1>&2
44# exit
45#fi
46
# Work out whether a 'jps' daemon sanity-check is warranted before launching,
# based on whether the input/output locations live on HDFS and whether the
# job is being submitted to a Spark cluster master.
run_jps=0
run_jps_daemons=""
run_jps_daemons_suffix="daemon"
using_hdfs=0

# Select the mode-specific input location whose scheme we inspect.
# NOTE(review): '${var##hdfs://*}' expands to empty both when the value starts
# with hdfs:// AND when the value is empty/unset -- so an unset output_dir also
# trips this branch.  Quirk preserved from the original; confirm if intended.
case "$class_mode" in
    seq)
        mode_input="$seq_file"
        ;;
    json)
        mode_input="$input_dir"
        ;;
    *)
        mode_input=""
        ;;
esac

if [ "$class_mode" = "seq" ] || [ "$class_mode" = "json" ] ; then
    if [ "x${mode_input##hdfs://*}" = "x" ] || [ "x${output_dir##hdfs://*}" = "x" ] ; then
        # Evidence of running command over HDFS
        run_jps=1
        run_jps_daemons="Spark"
        using_hdfs=1
    fi
fi

if [ "x${master_opt##--master spark://*}" = "x" ] ; then
    # Evidence of running command submitted to Spark cluster
    # NOTE(review): as above, an empty/unset master_opt also satisfies this
    # test -- confirm that is the desired default.
    run_jps=1
    if [ "x$run_jps_daemons" = "x" ] ; then
        run_jps_daemons="Hadoop"
    else
        run_jps_daemons="$run_jps_daemons and Hadoop"
        run_jps_daemons_suffix="daemons"
    fi
fi
80
if [ "$run_jps" = "1" ] ; then
    echo
    echo "****"
    echo "* Checking for $run_jps_daemons $run_jps_daemons_suffix, by running 'jps':"
    echo "****"
    # List the running JVMs, drop the 'jps' process itself, prefix each line
    # with '* ', and tag the well-known Spark/HDFS daemon names.  A single sed
    # with chained -e expressions replaces the original sed pipeline.
    jps | grep -E -v " Jps$" \
        | sed -e 's/^/* /g' \
              -e 's/ Master/ [Spark] Master/' \
              -e 's/ NameNode/ [HDFS] NameNode/' \
              -e 's/ SecondaryNameNode/ [HDFS] SecondaryNameNode/'

    echo "****"
    echo "* Done"
    echo "****"
    echo

    # Brief pause so the daemon listing is visible before launch output scrolls.
    sleep 1
fi
98
# When the run touches HDFS and an output directory was given, report whether
# that directory already exists.
# NOTE(review): despite the "Creating directory" message, nothing in this
# script runs 'hadoop fs -mkdir' -- presumably the Spark job creates the
# directory itself; confirm against the ingest code.
if [ "$using_hdfs" = "1" ] && [ "x$output_dir" != "x" ] ; then
    if ! hadoop fs -test -d "$output_dir" ; then
        echo "Creating directory:"
        echo "  $output_dir"
    fi
fi
109
# Default main class for the Spark job; ':=' assigns when classmain is unset
# or empty, matching the original explicit "x$var" test.
: "${classmain:=org.hathitrust.extractedfeatures.ProcessForSolrIngest}"

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
cmd="spark-submit --class $classmain $master_opt $self_contained_jar"

# Only the two Solr-ingest main classes understand the optional Solr/output
# flags, so gate appending them on the class name.
case "$classmain" in
    org.hathitrust.extractedfeatures.ProcessForSolrIngest|org.hathitrust.extractedfeatures.ProcessForSolrIngestJSONFilelist)
        if [ "x$solr_base_url" != "x" ] ; then
            cmd="$cmd --solr-base-url $solr_base_url"
        fi

        if [ "x$output_dir" != "x" ] ; then
            cmd="$cmd --output-dir $output_dir"
        fi
        ;;
esac
126
127
# Append the mode-specific trailing arguments: the properties file, the
# mode's input location(s), then any extra arguments given to this script.
# (For a site-specific properties file, override the relative
# 'ef-solr.properties' path here.)
if [ "$class_mode" = "seq" ] ; then
    mode_args="$seq_file"
else
    mode_args="$json_filelist $input_dir"
fi
cmd="$cmd --properties ef-solr.properties $mode_args $*"
136
# Announce and run the assembled spark-submit command.
echo "****"
echo "* Launching:"
echo "* $cmd"
echo "****"

if [ "$run_jps" = "1" ] ; then
    echo "* Monitor progress on Spark cluster through:"
    echo "* http://$SPARK_MASTER_HOST:8080/"
    echo "****"
fi
echo
# Short grace period so the banner can be read (and the run aborted) before
# the job actually starts.
sleep 2

# NOTE(review): $cmd is deliberately expanded unquoted so the string built
# above word-splits into a command line; any argument containing whitespace
# (e.g. a path with spaces) will break here.
$cmd
151
Note: See TracBrowser for help on using the repository browser.