Show
Ignore:
Timestamp:
26.10.2016 09:09:03 (3 years ago)
Author:
davidb
Message:

Restructuring of RUN scripts to be more flexible

Location:
other-projects/hathitrust/solr-extracted-features/trunk
Files:
2 added
1 moved

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/_RUN.bash

    r30923 r30926  
    11#!/bin/bash 
    22 
    3 #input_dir=pd-ef-json-files 
    4 input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files" 
    5 output_dir=pd-solr-json-files 
     3# To work, the following bash variables need to have been set: 
     4# 
     5#  json_filelist input_dir output_dir 
     6# 
     7# Typically done through running a wrapper script, such as: 
     8# 
     9#  RUN-PD-CLUSTER.bash 
    610 
    7 #master_opt="--master local[4]" 
    8 master_opt="--master spark://10.10.0.52:7077" 
     11if [ "x$json_filelist" = "x" ] ; then 
     12    echo "_RUN.bash: Failed to set 'json_filelist'" 1>2 
     13    exit 
     14fi 
     15 
     16if [ "x$input_dir" = "x" ] ; then 
     17    echo "_RUN.bash: Failed to set 'input_dir'" 1>2 
     18    exit 
     19fi 
     20 
     21if [ "x$output_dir" = "x" ] ; then 
     22    echo "_RUN.bash: Failed to set 'output_dir'" 1>2 
     23    exit 
     24fi 
    925 
    1026self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar 
    1127base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar" 
    1228 
    13 if [ $# -ge 1 ] ; then 
    14     file_listing=$1 
    15     shift 
    16     $base_cmd --json-filelist="$file_listing" $input_dir $output_dir $* 
    17 else 
    18     echo "****" 
    19     echo "* Processing all files in: $input_dir" 
    20     echo "****" 
    21     $base_cmd $input_dir/*.json.bz2 $output_dir $* 
    22 fi 
     29$base_cmd --json-filelist="$json_filelist" "$input_dir" "$output_dir" $* 
    2330 
    2431#    spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*