source: gs3-extensions/maori-lang-detection/bin/hadoop-spark-scripts/export_maori_subset.sh@ 33446

Last change on this file since 33446 was 33446, checked in by ak19, 5 years ago
  1. Committing a working version of export_maori_subset.sh, which takes the CSV file produced by export_maori_index.csv.sh as input and fetches the WARC records at the specified offsets.
  2. Notes on the changes needed in the Java code (cc-index-table/src/main/java/org/commoncrawl/spark/examples/CCIndexWarcExport.java) to get export_maori_subset.sh running without exceptions so far.
  3. The otherwise untested export_maori_subset_from_scratch.sh script, which would run the SQL query itself and feed the results straight into the WARC record extraction instead of producing an intermediate CSV file.
  • Property svn:executable set to *
File size: 3.0 KB
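A minimal invocation sketch (illustrative values, not part of the repository): the script has to be run from the directory containing target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar, since _APPJAR is resolved against $PWD, and SPARK_HOME must point at a Spark installation. Note that in this revision the active spark-submit call hard-codes its CSV input, index table path and output directory, so the two positional arguments are assigned to DATA and OUTPUTDIR but not otherwise used:

  SPARK_HOME=/usr/local/spark ./export_maori_subset.sh \
      hdfs:///user/vagrant/cc-mri-unzipped-csv/cc-mri.csv \
      hdfs:///user/vagrant/cc-mri-subset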
#!/bin/bash

# Export the Maori-language subset of Common Crawl WARC records with Spark on Yarn,
# using the CSV of matching index entries as input.

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"
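# NOTE: the table format settings, DATA and OUTPUTDIR above are only referenced by
# the commented-out CCIndex2Table call further down; the active CCIndexWarcExport
# job at the end of this script hard-codes its own input and output paths.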


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e
set -x

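# The commented-out spark-submit below is the CCIndex2Table conversion (URL index to
# table) that this script appears to have been adapted from; it is not executed.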
# $SPARK_HOME/bin/spark-submit \
# $SPARK_ON_YARN \
# --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
# --conf spark.core.connection.ack.wait.timeout=600s \
# --conf spark.network.timeout=120s \
# --conf spark.task.maxFailures=20 \
# --conf spark.shuffle.io.maxRetries=20 \
# --conf spark.shuffle.io.retryWait=60s \
# --conf spark.driver.memory=$DRIVER_MEM \
# --conf spark.executor.memory=$EXECUTOR_MEM \
# $SPARK_EXTRA_OPTS \
# --num-executors $NUM_EXECUTORS \
# --executor-cores $EXECUTOR_CORES \
# --executor-memory $EXECUTOR_MEM \
# --conf spark.hadoop.parquet.enable.dictionary=true \
# --conf spark.sql.parquet.filterPushdown=true \
# --conf spark.sql.parquet.mergeSchema=false \
# --conf spark.sql.hive.metastorePartitionPruning=true \
# --conf spark.hadoop.parquet.enable.summary-metadata=false \
# --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
# --outputCompression=$COMPRS \
# --outputFormat=$FORMAT $NESTED \
# --partitionBy=$PARTITION_BY \
# "$DATA" "$OUTPUTDIR"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \

 # --conf spark.hadoop.parquet.enable.dictionary=true \
 # --conf spark.hadoop.parquet.enable.summary-metadata=false \
 # --conf spark.sql.hive.metastorePartitionPruning=true \
 # --conf spark.sql.parquet.filterPushdown=true \
 # --conf spark.sql.parquet.mergeSchema=true \


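# Active job: CCIndexWarcExport takes the CSV of matching records (produced by
# export_maori_index.csv.sh and uploaded to HDFS), fetches the WARC content at the
# offsets listed there, and writes it out as new WARC files (prefix MAORI-CC-2019-30)
# under the hdfs:///user/vagrant/cc-mri-subset output directory.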
# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
 --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
 --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
 --csv hdfs:///user/vagrant/cc-mri-unzipped-csv/cc-mri.csv \
 --numOutputPartitions 10 \
 --numRecordsPerWarcFile 5000 \
 --warcPrefix MAORI-CC-2019-30 \
 s3a://commoncrawl/cc-index/table/cc-main/warc/ \
 hdfs:///user/vagrant/cc-mri-subset