source: gs3-extensions/maori-lang-detection/bin/script/get_maori_WET_records_for_crawl.sh@33495

Last change on this file since 33495 was 33495, checked in by ak19, 5 years ago

Pruned out unused commands, added comments, and marked unused variables for removal in a future version of this script, once the full version has been tested on CC crawl 2019-26.

  • Property svn:executable set to *
File size: 9.0 KB
#!/bin/bash

# This script is BASED ON the cc-index-table github project's convert_url_index script at
# https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
# That script is described as
# "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
# into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
# If you want to run that script, then modify its variables to have the following values before
# running it, in order for it to work on our machine for doing analytics:
# EXECUTOR_MEM=3g
# EXECUTOR_CORES=2
# NUM_EXECUTORS=2
# DRIVER_MEM=3g
# Because this script started as a copy of that one, several such variables (like the executor-
# and memory-related ones) are unused here; they were just copied directly across and can
# probably be removed from this file.

# This script was modified to do the following:
# PHASE 1: run an SQL query against CommonCrawl's distributed cc-index table on Amazon S3 for the
# parameterised crawl timestamp and get all those records for which the primary language in the
# content_languages field is MRI (Maori). Only the WARC-related fields (url, filename, offset and
# length) of each record are requested. The matching records' fields are then written out as a
# distributed csv file on the local hdfs system.
# PHASE 2: request the warc records at those offsets and download them onto the local hdfs.
# We still get zipped WARC files, but they only contain the pages of that crawl where the primary
# language was identified as MRI.
# PHASE 3: convert those WARC files into WET (and WAT) files and copy these zipped files onto the
# mounted shared space on vagrant.
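#
# Data flow through the three phases (all paths are the ones used below in this script):
#   s3a://commoncrawl/cc-index/table/cc-main/warc/  (queried via Spark SQL)
#     -> $OUTPUT_PARENTDIR/cc-mri-csv/part*                (Phase 1: gzipped csv parts)
#     -> $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv  (the parts unzipped into one csv)
#     -> $OUTPUT_PARENTDIR/warc/*.warc.gz                  (Phase 2: MRI-only warc records)
#     -> $OUTPUT_PARENTDIR/wet and $OUTPUT_PARENTDIR/wat   (Phase 3: converted wet/wat files)
#     -> /vagrant/${CRAWL_ID}-wet-files                    (wet files copied off hdfs onto the shared mount)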


#---------------------------- START UNUSED VARIABLES ----------------------------#

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

#--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
# as follows in order to get that spark job to run. Not used in this script. ---#
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

#--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

#---------------------------- END UNUSED VARIABLES ----------------------------#


# The crawl timestamp, of the form CC-MAIN-2019-26
# Obtain it from http://index.commoncrawl.org/
CRAWL_ID=$1
if [ "x$CRAWL_ID" == "x" ]; then
    echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
    echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
    exit 1
fi
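
# Example usage (illustrative; pick any crawl timestamp listed at http://index.commoncrawl.org/):
#   ./get_maori_WET_records_for_crawl.sh CC-MAIN-2019-26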
# Output directory
OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
    # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs

# https://stackoverflow.com/questions/26513861/checking-if-directory-in-hdfs-already-exists-or-not
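# ('hdfs dfs -test -d <dir>' exits with status 0 if the directory exists and non-zero otherwise,
# which is what the $? check just below relies on.)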
hdfs dfs -test -d $OUTPUT_PARENTDIR
if [ $? == 0 ]; then
    echo "Directory $OUTPUT_PARENTDIR already exists."
else
    echo "Creating directory $OUTPUT_PARENTDIR..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR
fi


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e
set -x

# PHASE 1: query this crawl's massive index with an SQL query that requests just the references to warc files
# for those crawled web pages where the content_languages field's primary language is MRI (the 3-letter code for Maori).
# The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of the $OUTPUT_PARENTDIR.

#OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# https://www.patricia-anong.com/blog/2017/11/1/extend-vmdk-on-virtualbox

$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages = 'mri'" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR


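# Optional sanity check (a sketch, not part of the original workflow): list the gzipped part files
# that Phase 1 wrote and peek at a few of the matching MRI records.
#   hdfs dfs -ls $OUTPUTDIR
#   hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | head
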
# The above generates ZIPPED part*.csv files in $OUTPUTDIR (folder cc-mri-csv).
# First create a folder and unzip them into it as a single cc-mri.csv file:
hdfs dfs -test -d $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
if [ $? == 0 ]; then
    echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
    echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR/cc-mri-unzipped-csv"
else
    echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv

    echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
    hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
fi
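
# Optional (illustrative only): count how many MRI records the query matched.
#   hdfs dfs -cat $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv | wc -l
# (If the csv was written with a header row, subtract 1 from the count.)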


# PHASE 2: use the index of MRI page URLs and warc offsets, stored in the now unzipped .csv file,
# to fetch all the WARC records it lists at their specified warc offsets.
# This gets us all the MRI pages of the commoncrawl for CRAWL_ID.

# Change OUTPUTDIR
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"


# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
    --csv $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-${CRAWL_ID} \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR

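# Optional sanity check (illustrative): Phase 2 should have left gzipped warc files and a _SUCCESS
# marker in the new $OUTPUTDIR; Phase 3 below only runs if that marker is present.
#   hdfs dfs -ls $OUTPUTDIR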

# PHASE 3: convert the warc files to wet files and copy the wet files into the mounted shared area

hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
if [ $? == 0 ]; then
    # ia-hadoop-tools converts warc files into wet (and wat) files but expects a particular folder structure.
    # Create the expected folder structure: a "wet" and a "wat" folder should exist
    # at the same level as the "warc" folder ($OUTPUTDIR) that contains the warc gz files.

    echo "Creating wet (text) and wat (metadata) directories to put converted warc files into..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wet
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wat
    # move the _SUCCESS file from /warc to its parent folder
    hdfs dfs -mv $OUTPUTDIR/_SUCCESS $OUTPUT_PARENTDIR/.

    # Move from the "/home/vagrant/cc-index-table" github project to the "ia-hadoop-tools" github project:
    cd /home/vagrant/ia-hadoop-tools
    # PWD is now the ia-hadoop-tools folder.
    # To run the following, guava.jar should be on the hadoop classpath.
    # This can be achieved with:
    #   vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
    $HADOOP_MAPRED_HOME/bin/hadoop jar $PWD/target/ia-hadoop-tools-jar-with-dependencies.jar WEATGenerator -strictMode -skipExisting batch-id-xyz $OUTPUTDIR/*.warc.gz
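    # (The WEATGenerator job writes its converted output alongside the warc folder: the text records
    #  should end up as *.warc.wet.gz files under $OUTPUT_PARENTDIR/wet and the metadata records under
    #  $OUTPUT_PARENTDIR/wat, which is why those directories were created above.)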

    # Now copy the zipped wet-files into the mounted folder, so we can scp the files from there to where WETProcessor.java is
    echo "Copying and tarring up the wet files"
    mkdir /vagrant/${CRAWL_ID}-wet-files
    hdfs dfs -get $OUTPUT_PARENTDIR/wet/*.warc.wet.gz /vagrant/${CRAWL_ID}-wet-files/.
    cd /vagrant
    #tar -cvf ${CRAWL_ID}.tar /vagrant/${CRAWL_ID}-wet-files
    #rm -rf /vagrant/${CRAWL_ID}-wet-files
fi
