#!/bin/bash

# Export WARC records selected by a SQL query on the URL index table,
# by running org.commoncrawl.spark.examples.CCIndexWarcExport via spark-submit.
#
# This script started out as "convert URL index with Spark on Yarn". The
# original conversion command (CCIndex2Table) is kept further down as a
# commented-out reference; the table-format and Spark sizing variables
# defined here are only referenced by that commented-out command.
#
# Requirements:
#   - SPARK_HOME must point to a Spark installation; the script runs
#     "$SPARK_HOME/bin/spark-submit".
#   - The job jar is looked up relative to the current working directory
#     (see _APPJAR), so run the script from the directory containing target/.
#
# NOTE: the positional arguments $1 and $2 are stored in DATA and OUTPUTDIR,
# but the active export command uses the fixed input/output locations
# defined in INDEX_TABLE and EXPORT_DIR below.

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED"          # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip}    # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"


# Spark configuration
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


# Job jar, resolved relative to the current working directory.
_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e

# Fail early with a clear message instead of trying to run /bin/spark-submit
# when SPARK_HOME is unset or empty.
: "${SPARK_HOME:?SPARK_HOME must be set to the Spark installation directory}"

set -x


# Original URL index conversion command (disabled, kept for reference):
#
# $SPARK_HOME/bin/spark-submit \
#     $SPARK_ON_YARN \
#     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
#     --conf spark.core.connection.ack.wait.timeout=600s \
#     --conf spark.network.timeout=120s \
#     --conf spark.task.maxFailures=20 \
#     --conf spark.shuffle.io.maxRetries=20 \
#     --conf spark.shuffle.io.retryWait=60s \
#     --conf spark.driver.memory=$DRIVER_MEM \
#     --conf spark.executor.memory=$EXECUTOR_MEM \
#     $SPARK_EXTRA_OPTS \
#     --num-executors $NUM_EXECUTORS \
#     --executor-cores $EXECUTOR_CORES \
#     --executor-memory $EXECUTOR_MEM \
#     --conf spark.hadoop.parquet.enable.dictionary=true \
#     --conf spark.sql.parquet.filterPushdown=true \
#     --conf spark.sql.parquet.mergeSchema=false \
#     --conf spark.sql.hive.metastorePartitionPruning=true \
#     --conf spark.hadoop.parquet.enable.summary-metadata=false \
#     --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
#     --outputCompression=$COMPRS \
#     --outputFormat=$FORMAT $NESTED \
#     --partitionBy=$PARTITION_BY \
#     "$DATA" "$OUTPUTDIR"

# Options that are currently not passed to the active command below:
#     --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
#     --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
#     $SPARK_ON_YARN \

# SQL query passed to CCIndexWarcExport via --query. The line breaks are part
# of the string, so the continuation lines are intentionally not indented.
WARC_EXPORT_QUERY="SELECT url, warc_filename, warc_record_offset, warc_record_length
FROM ccindex
WHERE crawl = 'CC-MAIN-2019-30' AND subset = 'warc' AND content_languages LIKE '%mri%'"

# Location of the index table that is queried.
# Previously: s3://commoncrawl/cc-index/table/cc-main/warc/
# Since s3:// deprecated, moved to using s3a:// now
INDEX_TABLE=s3a://commoncrawl/cc-index/table/cc-main/warc/

# Output location passed as the last argument to CCIndexWarcExport.
EXPORT_DIR=hdfs:///user/vagrant/cc-mri-subset

# Paths are quoted so that a SPARK_HOME or working directory containing
# whitespace does not split into several words.
"$SPARK_HOME/bin/spark-submit" \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport "$_APPJAR" \
    --query "$WARC_EXPORT_QUERY" \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-CC-2019-30 \
    "$INDEX_TABLE" \
    "$EXPORT_DIR"