#!/bin/bash

# Run the org.commoncrawl.spark.examples.CCIndexWarcExport job via spark-submit.
#
# NOTE(review): this file was derived from the "convert URL index with Spark
# on Yarn" script. That CCIndex2Table invocation is kept below, commented out.
# The table-format settings, the Spark resource settings and the two
# positional arguments are only referenced by that commented-out command;
# the active command at the bottom uses hard-coded paths.
#
# Requirements:
#   SPARK_HOME  must point at a Spark installation (bin/spark-submit is used).
#   The application jar is resolved relative to the current working
#   directory ($PWD/target/...), so start the script from the directory that
#   contains the build output.

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED"          # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip}    # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

# Abort on the first failing command and trace every command that is run.
# Caution: with tracing enabled, any credentials passed on the command line
# (see the fs.s3a.*.key options below) would show up in the trace output.
set -e
set -x

# Fail early with a clear message instead of trying to execute
# "/bin/spark-submit" when SPARK_HOME is empty or points to the wrong place.
SPARK_SUBMIT="$SPARK_HOME/bin/spark-submit"
if [[ ! -x "$SPARK_SUBMIT" ]]; then
  printf 'spark-submit not found or not executable: %s (is SPARK_HOME set?)\n' \
    "$SPARK_SUBMIT" >&2
  exit 1
fi


# Original URL index conversion (CCIndex2Table), kept for reference:
#
# $SPARK_HOME/bin/spark-submit \
#     $SPARK_ON_YARN \
#     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
#     --conf spark.core.connection.ack.wait.timeout=600s \
#     --conf spark.network.timeout=120s \
#     --conf spark.task.maxFailures=20 \
#     --conf spark.shuffle.io.maxRetries=20 \
#     --conf spark.shuffle.io.retryWait=60s \
#     --conf spark.driver.memory=$DRIVER_MEM \
#     --conf spark.executor.memory=$EXECUTOR_MEM \
#     $SPARK_EXTRA_OPTS \
#     --num-executors $NUM_EXECUTORS \
#     --executor-cores $EXECUTOR_CORES \
#     --executor-memory $EXECUTOR_MEM \
#     --conf spark.hadoop.parquet.enable.dictionary=true \
#     --conf spark.sql.parquet.filterPushdown=true \
#     --conf spark.sql.parquet.mergeSchema=false \
#     --conf spark.sql.hive.metastorePartitionPruning=true \
#     --conf spark.hadoop.parquet.enable.summary-metadata=false \
#     --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
#     --outputCompression=$COMPRS \
#     --outputFormat=$FORMAT $NESTED \
#     --partitionBy=$PARTITION_BY \
#     "$DATA" "$OUTPUTDIR"

# Optional settings that can be added to the command below:
#
#     --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
#     --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
#
#     --conf spark.hadoop.parquet.enable.dictionary=true \
#     --conf spark.hadoop.parquet.enable.summary-metadata=false \
#     --conf spark.sql.hive.metastorePartitionPruning=true \
#     --conf spark.sql.parquet.filterPushdown=true \
#     --conf spark.sql.parquet.mergeSchema=true \
#
#     $SPARK_ON_YARN \

# Active job. The two trailing positional arguments are presumably the input
# table location and the output directory -- confirm against the
# CCIndexWarcExport usage message. Expansions are quoted so that a working
# directory or SPARK_HOME containing whitespace does not split the arguments.
"$SPARK_SUBMIT" \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport "$_APPJAR" \
    --csv hdfs:///user/vagrant/cc-mri-unzipped-csv/cc-mri.csv \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-CC-2019-30 \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    hdfs:///user/vagrant/cc-mri-subset