#!/bin/bash

# Convert the Common Crawl URL index (CDX files) into a columnar table
# (Parquet or ORC) using Spark on YARN.
#
# Usage: $0 <input-spec> <output-dir>
#
# Overridable environment variables: FORMAT, NESTED, COMPRS, SPARK_HOME.

# Table format configuration
FORMAT=${FORMAT:-parquet}   # parquet, orc
NESTED=${NESTED:-}          # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip}      # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert), e.g.
#   s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory, e.g.
#   hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"
# Spark configuration
SPARK_HOME="$SPARK_HOME"

# Sizing for a larger cluster (kept for reference):
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

# Small/default sizing
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

# Job jar built by Maven in the current working directory
_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

# Alternative ways to put the S3A filesystem on the classpath (disabled):
##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e   # abort on first failing command
set -x   # trace commands for easier debugging of the launched job
47 |
|
---|
48 |
|
---|
49 | # --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
|
---|
50 | # --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
|
---|
51 | # --conf spark.hadoop.fs.s3a.access.key=AKIA2EVQBWSTBJ2M4BLM \
|
---|
52 | # --conf spark.hadoop.fs.s3a.secret.key=ZVPIboz0brE+Zy8IXyo76wl7GaFrtlr6g4TBKgJt \
|
---|
53 |
|
---|
54 |
|
---|
55 | # $SPARK_ON_YARN \
|
---|
56 | # /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
|
---|
57 | # --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
|
---|
58 | # --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
|
---|
59 |
|
---|
60 | # --conf spark.sql.parquet.mergeSchema=true \
|
---|
61 |
|
---|
62 | $SPARK_HOME/bin/spark-submit \
|
---|
63 | --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
|
---|
64 | --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
|
---|
65 | --conf spark.hadoop.parquet.enable.dictionary=true \
|
---|
66 | --conf spark.hadoop.parquet.enable.summary-metadata=false \
|
---|
67 | --conf spark.sql.hive.metastorePartitionPruning=true \
|
---|
68 | --conf spark.sql.parquet.filterPushdown=true \
|
---|
69 | --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
|
---|
70 | --query "SELECT * FROM ccindex LIMIT 10" \
|
---|
71 | --outputFormat csv \
|
---|
72 | --numOutputPartitions 10 \
|
---|
73 | --outputCompression gzip \
|
---|
74 | s3a://commoncrawl/cc-index/table/cc-main/warc/ \
|
---|
75 | hdfs:///user/vagrant/cc-limit10-csv
|
---|
76 |
|
---|