Ignore:
Timestamp:
2016-10-26T09:09:03+13:00 (7 years ago)
Author:
davidb
Message:

Restructuring of RUN scripts to be more flexible

Location:
other-projects/hathitrust/solr-extracted-features/trunk
Files:
2 added
1 moved

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/_RUN.bash

    r30923 r30926  
    11#!/bin/bash
    22
    3 #input_dir=pd-ef-json-files
    4 input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files"
    5 output_dir=pd-solr-json-files
     3# To work, the following bash variables need to have been set:
     4#
     5#  json_filelist input_dir output_dir
     6#
     7# Typically done through running a wrapper script, such as:
     8#
     9#  RUN-PD-CLUSTER.bash
    610
    7 #master_opt="--master local[4]"
    8 master_opt="--master spark://10.10.0.52:7077"
     11if [ "x$json_filelist" = "x" ] ; then
     12    echo "_RUN.bash: Failed to set 'json_filelist'" 1>2
     13    exit
     14fi
     15
     16if [ "x$input_dir" = "x" ] ; then
     17    echo "_RUN.bash: Failed to set 'input_dir'" 1>2
     18    exit
     19fi
     20
     21if [ "x$output_dir" = "x" ] ; then
     22    echo "_RUN.bash: Failed to set 'output_dir'" 1>2
     23    exit
     24fi
    925
    1026self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
    1127base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar"
    1228
    13 if [ $# -ge 1 ] ; then
    14     file_listing=$1
    15     shift
    16     $base_cmd --json-filelist="$file_listing" $input_dir $output_dir $*
    17 else
    18     echo "****"
    19     echo "* Processing all files in: $input_dir"
    20     echo "****"
    21     $base_cmd $input_dir/*.json.bz2 $output_dir $*
    22 fi
     29$base_cmd --json-filelist="$json_filelist" "$input_dir" "$output_dir" $*
    2330
    2431#    spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*
Note: See TracChangeset for help on using the changeset viewer.