source: gs3-extensions/maori-lang-detection/bin/hadoop-spark-scripts/export_maori_index_csv.sh@33445

Last change on this file since 33445 was 33445, checked in by ak19, 5 years ago

The first working Hadoop Spark script for processing Common Crawl data. This one successfully retrieved all the Common Crawl WARC index data for the specified period where content_languages contained mri (as any of a document's three primary languages) and wrote it out to a CSV file.

  • Property svn:executable set to *
File size: 2.8 KB
#!/bin/bash

# convert URL index with Spark on Yarn

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"
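# These four settings are read from the environment, but note that the
# spark-submit call at the bottom of this script does not forward them:
# it hardcodes csv output with gzip compression. A hypothetical override
# at invocation time would look like:
#   FORMAT=orc COMPRS=snappy ./export_maori_index_csv.sh <input> <output>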

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"
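# A hypothetical invocation, using the same paths that the spark-submit
# call below hardcodes:
#   ./export_maori_index_csv.sh s3a://commoncrawl/cc-index/table/cc-main/warc/ hdfs:///user/vagrant/cc-mri-csv
# Note that the spark-submit call at the end of this script supplies its
# own hardcoded input and output paths, so $DATA and $OUTPUTDIR are
# currently unused.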


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g
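# The sizing variables above are likewise not referenced by the
# spark-submit call below. A sketch of how they could be passed, using
# standard spark-submit options:
#   --executor-memory $EXECUTOR_MEM --executor-cores $EXECUTOR_CORES \
#   --num-executors $NUM_EXECUTORS --driver-memory $DRIVER_MEM \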


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""
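# $SPARK_ON_YARN and $SPARK_EXTRA_OPTS are also unused: the "$SPARK_ON_YARN"
# line inside the submit command below is commented out, so the job runs
# with whatever master is configured elsewhere (e.g. spark-defaults.conf),
# falling back to local mode.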

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar
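# The jar name suggests the "jar-with-dependencies" assembly produced by a
# Maven build (mvn package) of Common Crawl's cc-index-table project, which
# is where org.commoncrawl.spark.examples.CCIndexExport lives.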

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e # exit immediately if any command fails
set -x # echo each command before it is run


# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
# --conf spark.hadoop.fs.s3a.access.key=AKIA2EVQBWSTBJ2M4BLM \
# --conf spark.hadoop.fs.s3a.secret.key=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 \


# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

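# The job below reads the columnar Common Crawl URL index at
# s3a://commoncrawl/cc-index/table/cc-main/warc/ and exports the url column
# plus the WARC record locator columns (warc_filename, warc_record_offset,
# warc_record_length) for every record in the CC-MAIN-2019-30 crawl's warc
# subset whose content_languages field contains mri, writing the result as
# 10 gzip-compressed CSV partitions under hdfs:///user/vagrant/cc-mri-csv.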
$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = 'CC-MAIN-2019-30' AND subset = 'warc' AND content_languages LIKE '%mri%'" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    hdfs:///user/vagrant/cc-mri-csv

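# A possible follow-up step (not part of this script, and the local file
# name is only an example): merge the gzipped CSV parts from HDFS into a
# single local file.
#   hdfs dfs -cat hdfs:///user/vagrant/cc-mri-csv/part-*.gz | gzip -d > cc-mri.csv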