source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/limit10_export_index.sh@33574

Last change on this file since 33574 was 33524, checked in by ak19, 5 years ago
  1. Further adjustments to the documentation of what we did to get things running on the Hadoop filesystem. 2. Added all the Hadoop-related git projects (with patches), a separate copy of the patches, the config modifications and missing jar files we needed, and the scripts we created to run on the HDFS machine and its host machine.
  • Property svn:executable set to *
File size: 2.6 KB
#!/bin/bash

# Export a small sample (LIMIT 10) of the Common Crawl URL index to CSV with
# Spark on YARN; apparently adapted from the convert_url_index script (see the
# config sourcing below).

# Table format configuration
# (NOTE: FORMAT, NESTED, COMPRS and PARTITION_BY are inherited from the
# convert_url_index template and are not used by the spark-submit call below.)
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert); optional, see the default
# applied at the spark-submit call below
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory; optional, defaulted below as well
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"
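# Example invocation (a sketch; both paths are illustrative and optional,
# since the spark-submit call below falls back to these same defaults):
#   ./limit10_export_index.sh \
#       s3a://commoncrawl/cc-index/table/cc-main/warc/ \
#       hdfs:///user/vagrant/cc-limit10-csv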


# Spark configuration
# Fail early if no Spark installation is configured
SPARK_HOME="${SPARK_HOME:?SPARK_HOME must be set}"
# Settings sized for a larger cluster:
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

# Settings sized for a small VM (not currently passed to spark-submit below):
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g
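# To actually apply these resource settings, they would be passed to
# spark-submit with its standard flags (a sketch; not enabled here):
#   --executor-memory $EXECUTOR_MEM --executor-cores $EXECUTOR_CORES \
#   --num-executors $NUM_EXECUTORS --driver-memory $DRIVER_MEM \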


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


# Application jar with bundled dependencies (built locally with 'mvn package')
_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`
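# For illustration with hypothetical values: LIBJARS="a.jar,b.jar" would give
# HADOOP_CLASSPATH="a.jar:b.jar" (the sed above rewrites commas to colons).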

# Abort on the first error and trace commands as they run
set -e
set -x

# S3A credentials can be supplied as Spark config (never hardcode real keys
# in a committed script; the values below are placeholders):
# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
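# A hedged sketch of reading the credentials from the environment instead
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are the standard AWS variable names):
#   --conf spark.hadoop.fs.s3a.access.key="$AWS_ACCESS_KEY_ID" \
#   --conf spark.hadoop.fs.s3a.secret.key="$AWS_SECRET_ACCESS_KEY" \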


# Alternative options kept for reference:
# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# --conf spark.sql.parquet.mergeSchema=true \

# Submit the export job: SELECT 10 rows from the cc-index table and write them
# as gzipped CSV. The input table path and output directory fall back to the
# defaults below when no arguments are given.
$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT * FROM ccindex LIMIT 10" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    "${DATA:-s3a://commoncrawl/cc-index/table/cc-main/warc/}" \
    "${OUTPUTDIR:-hdfs:///user/vagrant/cc-limit10-csv}"
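# After a successful run the gzipped CSV parts can be inspected, e.g.
# (a sketch; 'hdfs dfs -text' decompresses gzip by file extension):
#   hdfs dfs -ls hdfs:///user/vagrant/cc-limit10-csv
#   hdfs dfs -text 'hdfs:///user/vagrant/cc-limit10-csv/part-*' | head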