#!/bin/bash
#
# Launches the org.hathitrust.PrepareForIngest Spark job via spark-submit.
#
# Usage:
#   <this-script>                           process every *.json.bz2 under input_dir
#   <this-script> FILE_LISTING [ARGS...]    process the files named in FILE_LISTING
#
# Arguments:
#   FILE_LISTING  optional; forwarded to the job as --json-filelist=FILE_LISTING
#   ARGS...       any further arguments, forwarded unchanged after output_dir
#
# Exit status: that of spark-submit.

set -euo pipefail

#input_dir=pd-ef-json-files
input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files"
output_dir=pd-solr-json-files

# Kept as arrays so each option stays a separate word and a value such as
# "local[4]" is never subjected to pathname expansion.
#master_opt=(--master "local[4]")
master_opt=(--master "spark://10.10.0.52:7077")

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar

submit_cmd=(
  spark-submit
  --class org.hathitrust.PrepareForIngest
  "${master_opt[@]}"
  "$self_contained_jar"
)

if [[ $# -ge 1 ]]; then
  file_listing=$1
  shift

  # "$@" forwards the remaining arguments word-for-word (no re-splitting).
  "${submit_cmd[@]}" --json-filelist="$file_listing" "$input_dir" "$output_dir" "$@"
else
  echo "****"
  echo "* Processing all files in: $input_dir"
  echo "****"

  # The glob is quoted on purpose: it must reach the job as a single argument
  # rather than being expanded by the local shell into many positional
  # arguments (which would displace output_dir).
  "${submit_cmd[@]}" "$input_dir/*.json.bz2" "$output_dir" "$@"
fi

# Earlier invocations, kept for reference:
#
# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*
#
# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target\htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step1000.txt json-files solr-files $*