Changeset 30923


Ignore:
Timestamp:
10/25/16 23:28:22 (4 years ago)
Author:
davidb
Message:

Rough cut version that reads in each JSON file over HDFS

File:
1 edited

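Legend:

Unmodified (line numbers shown for both revisions)
Added (line number shown for the new revision only)
Removed (line number shown for the old revision only)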
  • other-projects/hathitrust/solr-extracted-features/trunk/RUN.bash

    r30918 r30923
    1      1      #!/bin/bash
    2      2
    3             input_dir=pd-ef-json-files
           3      #input_dir=pd-ef-json-files
           4      input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files"
    4      5      output_dir=pd-solr-json-files
    5      6
    6             master_opt="--master local[4]"
           7      #master_opt="--master local[4]"
           8      master_opt="--master spark://10.10.0.52:7077"
           9
    7      10     self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
    8      11     base_cmd="spark-submit --class org.hathitrust.PrepareForIngest $master_opt $self_contained_jar"
    9      12
    10     13     if [ $# -ge 1 ] ; then
    11                file_listing=shift $*
           14         file_listing=$1
           15         shift
    12     16         $base_cmd --json-filelist="$file_listing" $input_dir $output_dir $*
    13     17     else
    …      …
    15     19         echo "* Processing all files in: $input_dir"
    16     20         echo "****"
    17                $base_cmd $input_dir $output_dir $*
           21         $base_cmd $input_dir/*.json.bz2 $output_dir $*
    18     22     fi
    19     23
Note: See TracChangeset for help on using the changeset viewer.