#!/bin/bash

# Convert the Common Crawl URL index with Spark on YARN.

# --- Table format configuration ---
FORMAT=${FORMAT:-parquet}   # one of: parquet, orc
NESTED="$NESTED"            # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip}      # one of: gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
#DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/

# First positional argument selects the crawl, e.g. CC-MAIN-2019-30.
# NOTE(review): $1 is not validated; an empty CRAWL_ID silently produces
# paths like hdfs:///user/vagrant//cc-mri-csv -- consider a usage check.
CRAWL_ID=$1
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
# or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs:

# Create the output parent directory on HDFS if it does not exist yet.
# https://stackoverflow.com/questions/26513861/checking-if-directory-in-hdfs-already-exists-or-not
# Test the command's status directly instead of the fragile
# `cmd; if [ $? == 0 ]` pattern (`==` inside `[` is also non-portable).
if hdfs dfs -test -d "$OUTPUT_PARENTDIR"; then
  echo "Directory $OUTPUT_PARENTDIR already exists."
else
  echo "Creating directory $OUTPUT_PARENTDIR..."
  # -p: also create missing parents, and do not fail if the directory
  # appeared between the test above and this mkdir.
  hdfs dfs -mkdir -p "$OUTPUT_PARENTDIR"
fi

# --- Spark configuration ---
# No-op placeholder: SPARK_HOME is expected to come from the environment.
SPARK_HOME="$SPARK_HOME"

# Larger cluster sizing, kept for reference:
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

# Small (VM-sized) resources:
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# Optionally source a site-specific configuration file:
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

# Application jar with bundled dependencies, built in the current directory.
_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

# Abort on the first error; echo every command for easier debugging.
set -e
set -x

# S3 credentials, if needed, must come from the environment or a Hadoop
# credentials provider -- never hard-code them in this script.
# SECURITY(review): a real AWS access-key/secret-key pair used to be committed
# here in a comment; that key pair must be treated as compromised and rotated.
# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \

# Alternative ways to put the AWS SDK + hadoop-aws jars on the classpath,
# kept for reference:
# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# https://www.patricia-anong.com/blog/2017/11/1/extend-vmdk-on-virtualbox

# Export the URL-index rows of the given crawl whose content language is
# Maori ("mri") as gzip-compressed CSV into $OUTPUTDIR.
# Variables are quoted so paths containing spaces cannot word-split.
"$SPARK_HOME"/bin/spark-submit \
  --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
  --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
  --conf spark.hadoop.parquet.enable.dictionary=true \
  --conf spark.hadoop.parquet.enable.summary-metadata=false \
  --conf spark.sql.hive.metastorePartitionPruning=true \
  --conf spark.sql.parquet.filterPushdown=true \
  --conf spark.sql.parquet.mergeSchema=true \
  --class org.commoncrawl.spark.examples.CCIndexExport "$_APPJAR" \
  --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
           FROM ccindex
           WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages = 'mri'" \
  --outputFormat csv \
  --numOutputPartitions 10 \
  --outputCompression gzip \
  s3a://commoncrawl/cc-index/table/cc-main/warc/ \
  "$OUTPUTDIR"