source: gs3-extensions/maori-lang-detection/bin/hadoop-spark-scripts/export_maori_subset.sh@ 33446

Last change on this file since 33446 was 33446, checked in by ak19, 5 years ago
  1. Committing a working version of export_maori_subset.sh, which takes the CSV file produced by export_maori_index.csv.sh as input and fetches the WARC records at the specified offsets.
  2. Notes on the changes needed in the Java code (cc-index-table/src/main/java/org/commoncrawl/spark/examples/CCIndexWarcExport.java) to get export_maori_subset.sh running without exceptions so far.
  3. The otherwise untested export_maori_subset_from_scratch.sh script, which would run the SQL query itself and feed the results straight into the WARC record extraction instead of producing an intermediate CSV file.
  • Property svn:executable set to *
File size: 3.0 KB
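A minimal invocation sketch (illustrative values, not part of the repository): the script has to be run from the directory containing target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar, since _APPJAR is resolved against $PWD, and SPARK_HOME must point at a Spark installation. Note that in this revision the active spark-submit call hard-codes its CSV input, index table path and output directory, so the two positional arguments are assigned to DATA and OUTPUTDIR but not otherwise used:

  SPARK_HOME=/usr/local/spark ./export_maori_subset.sh \
      hdfs:///user/vagrant/cc-mri-unzipped-csv/cc-mri.csv \
      hdfs:///user/vagrant/cc-mri-subset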
#!/bin/bash

# Export the Maori-language subset of Common Crawl WARC records with Spark on Yarn,
# using the CSV of matching index entries as input.

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"

# Input spec (URL index files to convert)
#DATA=s3a://commoncrawl/cc-index/collections/CC-MAIN-2017-51/indexes/cdx-*.gz
DATA="$1"
# Output directory
#OUTPUTDIR=hdfs:///user/ubuntu/cc-index-table/
OUTPUTDIR="$2"
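# NOTE: the table format settings, DATA and OUTPUTDIR above are only referenced by
# the commented-out CCIndex2Table call further down; the active CCIndexWarcExport
# job at the end of this script hard-codes its own input and output paths.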


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g


SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e
set -x

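# The commented-out spark-submit below is the CCIndex2Table conversion (URL index to
# table) that this script appears to have been adapted from; it is not executed.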
# $SPARK_HOME/bin/spark-submit \
# $SPARK_ON_YARN \
# --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
# --conf spark.core.connection.ack.wait.timeout=600s \
# --conf spark.network.timeout=120s \
# --conf spark.task.maxFailures=20 \
# --conf spark.shuffle.io.maxRetries=20 \
# --conf spark.shuffle.io.retryWait=60s \
# --conf spark.driver.memory=$DRIVER_MEM \
# --conf spark.executor.memory=$EXECUTOR_MEM \
# $SPARK_EXTRA_OPTS \
# --num-executors $NUM_EXECUTORS \
# --executor-cores $EXECUTOR_CORES \
# --executor-memory $EXECUTOR_MEM \
# --conf spark.hadoop.parquet.enable.dictionary=true \
# --conf spark.sql.parquet.filterPushdown=true \
# --conf spark.sql.parquet.mergeSchema=false \
# --conf spark.sql.hive.metastorePartitionPruning=true \
# --conf spark.hadoop.parquet.enable.summary-metadata=false \
# --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
# --outputCompression=$COMPRS \
# --outputFormat=$FORMAT $NESTED \
# --partitionBy=$PARTITION_BY \
# "$DATA" "$OUTPUTDIR"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \

 # --conf spark.hadoop.parquet.enable.dictionary=true \
 # --conf spark.hadoop.parquet.enable.summary-metadata=false \
 # --conf spark.sql.hive.metastorePartitionPruning=true \
 # --conf spark.sql.parquet.filterPushdown=true \
 # --conf spark.sql.parquet.mergeSchema=true \


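# Active job: CCIndexWarcExport takes the CSV of matching records (produced by
# export_maori_index.csv.sh and uploaded to HDFS), fetches the WARC content at the
# offsets listed there, and writes it out as new WARC files (prefix MAORI-CC-2019-30)
# under the hdfs:///user/vagrant/cc-mri-subset output directory.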
# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
 --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
 --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
 --csv hdfs:///user/vagrant/cc-mri-unzipped-csv/cc-mri.csv \
 --numOutputPartitions 10 \
 --numRecordsPerWarcFile 5000 \
 --warcPrefix MAORI-CC-2019-30 \
 s3a://commoncrawl/cc-index/table/cc-main/warc/ \
 hdfs:///user/vagrant/cc-mri-subset