Last change
on this file since 30923 was 30923, checked in by davidb, 7 years ago |
Rough cut version that reads in each JSON file over HDFS
|
-
Property svn:executable
set to
*
|
File size:
1.0 KB
|
Line | |
---|
#!/bin/bash
#
# Submit the org.hathitrust.PrepareForIngest job to Spark via spark-submit.
#
# Usage:
#   <this-script> [file_listing [extra-args...]]
#
#   file_listing  Optional. When given, it is passed to the job as
#                 --json-filelist=<file_listing> and $input_dir is passed
#                 as the input argument.
#   extra-args    Any further arguments are appended, unchanged, to the end
#                 of the job's command line.
#
# With no arguments the job is instead given the pattern
# "$input_dir/*.json.bz2" as its input argument.
#
# The script's exit status is that of the spark-submit invocation.

#input_dir=pd-ef-json-files
input_dir="hdfs://10.10.0.52:9000/user/htrc/pd-ef-json-files"
output_dir=pd-solr-json-files

# Command pieces are kept in arrays so every option and value stays a
# separate word without relying on unquoted word splitting (which would
# also glob-expand values such as local[4]).
#master_opt=(--master "local[4]")
master_opt=(--master "spark://10.10.0.52:7077")

self_contained_jar=target/htrc-ef-ingest-0.9-jar-with-dependencies.jar
base_cmd=(
  spark-submit
  --class org.hathitrust.PrepareForIngest
  "${master_opt[@]}"
  "$self_contained_jar"
)

if [[ $# -ge 1 ]]; then
  file_listing=$1
  shift
  # "$@" (not $*) so pass-through arguments containing spaces or glob
  # characters reach the job exactly as the caller supplied them.
  "${base_cmd[@]}" --json-filelist="$file_listing" \
    "$input_dir" "$output_dir" "$@"
else
  echo "****"
  echo "* Processing all files in: $input_dir"
  echo "****"
  # The pattern is quoted so it is handed to the job literally instead of
  # being expanded by the local shell. NOTE(review): this assumes the job
  # resolves the wildcard itself (as it must for the hdfs:// URL above) --
  # confirm before switching input_dir to a local directory.
  "${base_cmd[@]}" "$input_dir/*.json.bz2" "$output_dir" "$@"
fi

# Commented-out direct invocations:
#
# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target/htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step10000.txt pd-ef-json-files pd-solr-json-files $*

# spark-submit --class org.hathitrust.PrepareForIngest --master local[4] target\htrc-ef-ingest-0.9-jar-with-dependencies.jar --json-filelist=pd-file-listing-step1000.txt json-files solr-files $*
---|
Note: See TracBrowser for help on using the repository browser.