source: gs3-extensions/maori-lang-detection/bin/script/get_maori_WET_records_for_crawl.sh@33495

Last change on this file since 33495 was 33495, checked in by ak19, 5 years ago

Pruned out unused commands, added comments, and marked unused variables for removal in a future version of this script, once the full version has been tested on CC crawl 2019-26.

  • Property svn:executable set to *
File size: 9.0 KB
#!/bin/bash

# This script is BASED ON the cc-index-table github project's convert_url_index script at
# https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
# That script is described as
# "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
# into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
# If you want to run that script, then modify its variables to have the following values before
# running it, in order for it to work on our machine for doing analytics:
# EXECUTOR_MEM=3g
# EXECUTOR_CORES=2
# NUM_EXECUTORS=2
# DRIVER_MEM=3g
# Because this script started as a copy of that one, several such variables (like the executor-
# and memory-related ones) are unused here; they were just copied directly across and can
# probably be removed from this file.

# This script was modified to do the following:
# PHASE 1: run an SQL query against CommonCrawl's distributed cc-index table on Amazon S3 for the
# parameterised crawl timestamp and get all those records for which the primary language in the
# content_languages field is MRI (Maori). Only the WARC-related fields (url, filename, offset and
# length) of each record are requested. The matching records' fields are then written out as a
# distributed csv file on the local hdfs system.
# PHASE 2: request the warc records at those offsets and download them onto the local hdfs.
# We still get zipped WARC files, but they only contain the pages of that crawl where the primary
# language was identified as MRI.
# PHASE 3: convert those WARC files into WET (and WAT) files and copy these zipped files onto the
# mounted shared space on vagrant.
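#
# Data flow through the three phases (all paths are the ones used below in this script):
#   s3a://commoncrawl/cc-index/table/cc-main/warc/  (queried via Spark SQL)
#     -> $OUTPUT_PARENTDIR/cc-mri-csv/part*                (Phase 1: gzipped csv parts)
#     -> $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv  (the parts unzipped into one csv)
#     -> $OUTPUT_PARENTDIR/warc/*.warc.gz                  (Phase 2: MRI-only warc records)
#     -> $OUTPUT_PARENTDIR/wet and $OUTPUT_PARENTDIR/wat   (Phase 3: converted wet/wat files)
#     -> /vagrant/${CRAWL_ID}-wet-files                    (wet files copied off hdfs onto the shared mount)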


#---------------------------- START UNUSED VARIABLES ----------------------------#

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

#--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
# as follows in order to get that spark job to run. Not used in this script. ---#
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

#--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

#---------------------------- END UNUSED VARIABLES ----------------------------#


# The crawl timestamp, of the form CC-MAIN-2019-26
# Obtain it from http://index.commoncrawl.org/
CRAWL_ID=$1
if [ "x$CRAWL_ID" == "x" ]; then
    echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
    echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
    exit 1
fi
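
# Example usage (illustrative; pick any crawl timestamp listed at http://index.commoncrawl.org/):
#   ./get_maori_WET_records_for_crawl.sh CC-MAIN-2019-26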
# Output directory
OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
    # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs

# https://stackoverflow.com/questions/26513861/checking-if-directory-in-hdfs-already-exists-or-not
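# ('hdfs dfs -test -d <dir>' exits with status 0 if the directory exists and non-zero otherwise,
# which is what the $? check just below relies on.)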
hdfs dfs -test -d $OUTPUT_PARENTDIR
if [ $? == 0 ]; then
    echo "Directory $OUTPUT_PARENTDIR already exists."
else
    echo "Creating directory $OUTPUT_PARENTDIR..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR
fi


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e
set -x

# PHASE 1: query this crawl's massive index with an SQL query that requests just the references to warc files
# for those crawled web pages where the content_languages field's primary language is MRI (the 3-letter code for Maori).
# The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of the $OUTPUT_PARENTDIR.

#OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# https://www.patricia-anong.com/blog/2017/11/1/extend-vmdk-on-virtualbox

$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages = 'mri'" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR


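# Optional sanity check (a sketch, not part of the original workflow): list the gzipped part files
# that Phase 1 wrote and peek at a few of the matching MRI records.
#   hdfs dfs -ls $OUTPUTDIR
#   hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | head
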
# The above generates ZIPPED part*.csv files in $OUTPUTDIR (folder cc-mri-csv).
# First create a folder and unzip them into it as a single cc-mri.csv file:
hdfs dfs -test -d $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
if [ $? == 0 ]; then
    echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
    echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR/cc-mri-unzipped-csv"
else
    echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv

    echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
    hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
fi
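
# Optional (illustrative only): count how many MRI records the query matched.
#   hdfs dfs -cat $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv | wc -l
# (If the csv was written with a header row, subtract 1 from the count.)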


# PHASE 2: use the index of MRI page URLs and warc offsets, stored in the now unzipped .csv file,
# to fetch all the WARC records it lists at their specified warc offsets.
# This gets us all the MRI pages of the commoncrawl for CRAWL_ID.

# Change OUTPUTDIR
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"


# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
    --csv $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-${CRAWL_ID} \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR

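# Optional sanity check (illustrative): Phase 2 should have left gzipped warc files and a _SUCCESS
# marker in the new $OUTPUTDIR; Phase 3 below only runs if that marker is present.
#   hdfs dfs -ls $OUTPUTDIR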

# PHASE 3: convert the warc files to wet files and copy the wet files into the mounted shared area

hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
if [ $? == 0 ]; then
    # ia-hadoop-tools converts warc files into wet (and wat) files but expects a particular folder structure.
    # Create the expected folder structure: a "wet" and a "wat" folder should exist
    # at the same level as the "warc" folder ($OUTPUTDIR) that contains the warc gz files.

    echo "Creating wet (text) and wat (metadata) directories to put converted warc files into..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wet
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wat
    # move the _SUCCESS file from /warc to its parent folder
    hdfs dfs -mv $OUTPUTDIR/_SUCCESS $OUTPUT_PARENTDIR/.

    # Move from the "/home/vagrant/cc-index-table" github project to the "ia-hadoop-tools" github project:
    cd /home/vagrant/ia-hadoop-tools
    # PWD is now the ia-hadoop-tools folder.
    # To run the following, guava.jar should be on the hadoop classpath.
    # This can be achieved with:
    #   vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
    $HADOOP_MAPRED_HOME/bin/hadoop jar $PWD/target/ia-hadoop-tools-jar-with-dependencies.jar WEATGenerator -strictMode -skipExisting batch-id-xyz $OUTPUTDIR/*.warc.gz
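    # (The WEATGenerator job writes its converted output alongside the warc folder: the text records
    #  should end up as *.warc.wet.gz files under $OUTPUT_PARENTDIR/wet and the metadata records under
    #  $OUTPUT_PARENTDIR/wat, which is why those directories were created above.)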

    # Now copy the zipped wet-files into the mounted folder, so we can scp the files from there to where WETProcessor.java is
    echo "Copying and tarring up the wet files"
    mkdir /vagrant/${CRAWL_ID}-wet-files
    hdfs dfs -get $OUTPUT_PARENTDIR/wet/*.warc.wet.gz /vagrant/${CRAWL_ID}-wet-files/.
    cd /vagrant
    #tar -cvf ${CRAWL_ID}.tar /vagrant/${CRAWL_ID}-wet-files
    #rm -rf /vagrant/${CRAWL_ID}-wet-files
fi
