Changeset 30923 for other-projects

Show
Ignore:
Timestamp:
25.10.2016 23:28:22 (3 years ago)
Author:
davidb
Message:

Rough-cut version that reads in each JSON file over HDFS

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/RUN.bash

    r30918 r30923  
    11#!/bin/bash 
    22 
    3 input_dir=pd-ef-json-files 
     3#input_dir=pd-ef-json-files 
     4input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files" 
    45output_dir=pd-solr-json-files 
    56 
    6 master_opt="--master local[4]" 
     7#master_opt="--master local[4]" 
     8master_opt="--master spark://10.10.0.52:7077" 
     9 
    710self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar 
    811base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar" 
    912 
    1013if [ $# -ge 1 ] ; then 
    11     file_listing=shift $* 
     14    file_listing=$1 
     15    shift 
    1216    $base_cmd --json-filelist="$file_listing" $input_dir $output_dir $* 
    1317else 
     
    1519    echo "* Processing all files in: $input_dir" 
    1620    echo "****" 
    17     $base_cmd $input_dir $output_dir $* 
     21    $base_cmd $input_dir/*.json.bz2 $output_dir $* 
    1822fi 
    1923