source: gs3-extensions/maori-lang-detection/hdfs-instructions/scripts/export_maori_subset_from_scratch.sh@ 33524

Last change on this file since 33524 was 33524, checked in by ak19, 5 years ago
  1. Further adjustments to the documentation of what we did to get things running on the Hadoop filesystem. 2. Added all the Hadoop-related git projects (with patches), a separate copy of the patches, the config modifications and missing jar files that we needed, and the scripts we created to run on the HDFS machine and its host machine.
  • Property svn:executable set to *
File size: 3.3 KB
#!/bin/bash

# Query the Common Crawl URL index with Spark and export the Māori (mri) subset of WARC records
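#
# Hypothetical usage example (CRAWL_ID is the Common Crawl crawl to export;
# the crawl ID shown is simply the one referenced in the commented-out DATA spec below):
#   ./export_maori_subset_from_scratch.sh CC-MAIN-2017-51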

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"
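# Note: FORMAT, NESTED, COMPRS and PARTITION_BY are only referenced by the
# commented-out CCIndex2Table conversion job further down; the active
# CCIndexWarcExport submission below does not use them.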

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
#DATA="$1"
# Output directory
CRAWL_ID=$1
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-subset"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""
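# Note: the EXECUTOR_*/NUM_EXECUTORS/DRIVER_MEM settings above, SPARK_ON_YARN and
# SPARK_EXTRA_OPTS are likewise only referenced by the commented-out spark-submit
# further down; the active submission below relies on whatever defaults spark-submit
# picks up (e.g. from spark-defaults.conf).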

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

# Abort on the first failing command and echo each command as it runs
set -e
set -x


# Original URL-index-to-table conversion job (CCIndex2Table), left commented out:
# $SPARK_HOME/bin/spark-submit \
# $SPARK_ON_YARN \
# --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
# --conf spark.core.connection.ack.wait.timeout=600s \
# --conf spark.network.timeout=120s \
# --conf spark.task.maxFailures=20 \
# --conf spark.shuffle.io.maxRetries=20 \
# --conf spark.shuffle.io.retryWait=60s \
# --conf spark.driver.memory=$DRIVER_MEM \
# --conf spark.executor.memory=$EXECUTOR_MEM \
# $SPARK_EXTRA_OPTS \
# --num-executors $NUM_EXECUTORS \
# --executor-cores $EXECUTOR_CORES \
# --executor-memory $EXECUTOR_MEM \
# --conf spark.hadoop.parquet.enable.dictionary=true \
# --conf spark.sql.parquet.filterPushdown=true \
# --conf spark.sql.parquet.mergeSchema=false \
# --conf spark.sql.hive.metastorePartitionPruning=true \
# --conf spark.hadoop.parquet.enable.summary-metadata=false \
# --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
# --outputCompression=$COMPRS \
# --outputFormat=$FORMAT $NESTED \
# --partitionBy=$PARTITION_BY \
# "$DATA" "$OUTPUTDIR"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \

# $SPARK_ON_YARN \
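# Active job: query the columnar Common Crawl index (the cc-index table on S3) for
# records in the given crawl's 'warc' subset whose content_languages include Māori
# ('mri'), and export the matching WARC records to OUTPUTDIR on HDFS
# (up to 10 output partitions, at most 5000 records per WARC file, filenames
# prefixed MAORI-${CRAWL_ID}).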
$SPARK_HOME/bin/spark-submit \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages LIKE '%mri%'" \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-${CRAWL_ID} \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR

# hdfs:///user/vagrant/cc-mri-subset
# s3://commoncrawl/cc-index/table/cc-main/warc/ \
# Since s3:// is deprecated, we have moved to using s3a:// now
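#
# To inspect the exported subset afterwards, something like the following
# (hypothetical check) can be run on the HDFS machine:
#   hdfs dfs -ls hdfs:///user/vagrant/<CRAWL_ID>/cc-mri-subset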