Changeset 30926
- Timestamp:
- 2016-10-26T09:09:03+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/solr-extracted-features/trunk
- Files:
-
- 2 added
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/solr-extracted-features/trunk/_RUN.bash
r30923 r30926 1 1 #!/bin/bash 2 2 3 #input_dir=pd-ef-json-files 4 input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files" 5 output_dir=pd-solr-json-files 3 # To work, the following bash variables need to have been set: 4 # 5 # json_filelist input_dir output_dir 6 # 7 # Typically done through running a wrapper script, such as: 8 # 9 # RUN-PD-CLUSTER.bash 6 10 7 #master_opt="--master local[4]" 8 master_opt="--master spark://10.10.0.52:7077" 11 if [ "x$json_filelist" = "x" ] ; then 12 echo "_RUN.bash: Failed to set 'json_filelist'" 1>2 13 exit 14 fi 15 16 if [ "x$input_dir" = "x" ] ; then 17 echo "_RUN.bash: Failed to set 'input_dir'" 1>2 18 exit 19 fi 20 21 if [ "x$output_dir" = "x" ] ; then 22 echo "_RUN.bash: Failed to set 'output_dir'" 1>2 23 exit 24 fi 9 25 10 26 self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar 11 27 base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar" 12 28 13 if [ $# -ge 1 ] ; then 14 file_listing=$1 15 shift 16 $base_cmd --json-filelist="$file_listing" $input_dir $output_dir $* 17 else 18 echo "****" 19 echo "* Processing all files in: $input_dir" 20 echo "****" 21 $base_cmd $input_dir/*.json.bz2 $output_dir $* 22 fi 29 $base_cmd --json-filelist="$json_filelist" "$input_dir" "$output_dir" $* 23 30 24 31 # spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*
Note:
See TracChangeset
for help on using the changeset viewer.