root/other-projects/hathitrust/solr-extracted-features/trunk/RUN.bash @ 30923

Revision 30923, 1.0 KB (checked in by davidb, 4 years ago)

Rough cut version that reads in each JSON file over HDFS

  • Property svn:executable set to *
Line 
#!/bin/bash
#
# Submit the PrepareForIngest Spark job, which converts HathiTrust
# Extracted-Features JSON files (read over HDFS) into Solr-ready JSON.
#
# Usage:
#   RUN.bash [file-listing.txt] [extra-args...]
#
# With a file-listing argument, only the files named in it are processed
# (passed via --json-filelist); without one, every *.json.bz2 under
# $input_dir is processed. Any remaining arguments are forwarded to the
# Spark job unchanged.

set -euo pipefail

#input_dir=pd-ef-json-files
input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files"
output_dir=pd-solr-json-files

#master_opt=( --master "local[4]" )
master_opt=( --master "spark://10.10.0.52:7077" )

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar

# Build the command as an array so each option stays a single word even
# if paths ever contain spaces (and "local[4]" can't be glob-expanded).
base_cmd=( spark-submit --class org.hathitrust.PrepareForIngest
           "${master_opt[@]}" "$self_contained_jar" )

if [ $# -ge 1 ] ; then
    file_listing=$1
    shift
    "${base_cmd[@]}" --json-filelist="$file_listing" "$input_dir" "$output_dir" "$@"
else
    echo "****"
    echo "* Processing all files in: $input_dir"
    echo "****"
    # NOTE(review): $input_dir is an hdfs:// URL, so the shell glob below
    # cannot match any local files and the literal pattern is passed
    # through to Spark, which expands globs itself — confirm intended.
    "${base_cmd[@]}" "$input_dir"/*.json.bz2 "$output_dir" "$@"
fi

#    spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*

# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target\htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step1000.txt json-files solr-files $*
Note: See TracBrowser for help on using the browser.