source: gs3-extensions/maori-lang-detection/hdfs-instructions/scripts/get_maori_WET_records_for_crawl.sh@ 33524

Last change on this file since 33524 was 33524, checked in by ak19, 5 years ago
  1. Further adjustments to documenting what we did to get things to run on the hadoop filesystem. 2. All the hadoop related gitprojects (with patches), separate copy of patches, config modifications and missing jar files that we needed, scripts we created to run on the hdfs machine and its host machine.
  • Property svn:executable set to *
File size: 9.1 KB
#!/bin/bash

# This script is BASED ON the cc-index-table github project's convert_url_index script at
# https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
# That script is described as
# "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
# into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
# If you want to run that script, modify its variables to the following values before
# running it, so that it works on our machine for doing analytics:
# EXECUTOR_MEM=3g
# EXECUTOR_CORES=2
# NUM_EXECUTORS=2
# DRIVER_MEM=3g
# Because that script was copied here directly, a lot of its variables (such as the executor-
# and memory-related ones) are unused in this script. Such unused variables can probably be
# removed from this file.
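# For reference, a minimal sketch of how such resource settings are typically handed to
# spark-submit (these are standard spark-submit options; this script itself does not pass them):
#   $SPARK_HOME/bin/spark-submit \
#       --executor-memory $EXECUTOR_MEM \
#       --executor-cores $EXECUTOR_CORES \
#       --num-executors $NUM_EXECUTORS \
#       --driver-memory $DRIVER_MEM \
#       ...   # class, jar and arguments as in convert_url_index.sh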

# This script was modified to do the following:
# PHASE 1: query (with SQL) CommonCrawl's distributed cc-index table on Amazon S3 for the
# parameterised crawl timestamp and get all those records for which the primary language in the
# content_languages field is MRI, the 3-letter code for Maori.
# Only the WARC-related fields (url, filename, offset and length) of each record are requested.
# The matching records' fields are then written out as a distributed csv file on the local HDFS.
# PHASE 2 then requests the WARC records at those offsets and downloads them onto the local HDFS.
# We still get zipped WARC files, but they only contain the pages of that crawl where the primary
# language was identified as MRI.
# PHASE 3 converts those WARC files into WET (and WAT) files and copies these zipped files onto the
# mounted shared space on vagrant.

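# Example usage, with a crawl timestamp chosen from http://index.commoncrawl.org/:
#   ./get_maori_WET_records_for_crawl.sh CC-MAIN-2019-26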

#---------------------------- START UNUSED VARIABLES ---------------------------#

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

#--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
# as follows in order to get that spark job to run. Not used in this script. ---#
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

#--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

#---------------------------- END UNUSED VARIABLES ---------------------------#


# The crawl timestamp, of the form CC-MAIN-2019-26
# Obtain it from http://index.commoncrawl.org/
CRAWL_ID=$1
if [ -z "$CRAWL_ID" ]; then
    echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
    echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
    exit 1
fi
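# Optional sanity check (a sketch, not in the original script): warn if the timestamp
# doesn't roughly match the CC-MAIN-YYYY-COUNT pattern.
#   if ! [[ "$CRAWL_ID" =~ ^CC-MAIN-[0-9]{4}-[0-9]{2}$ ]]; then
#       echo "Warning: '$CRAWL_ID' does not look like a CC-MAIN-YYYY-COUNT timestamp."
#   fi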
# Output directory
OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
    # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs

# https://stackoverflow.com/questions/26513861/checking-if-directory-in-hdfs-already-exists-or-not
#hdfs dfs -test -d $OUTPUT_PARENTDIR
#if [ $? == 0 ]; then
if hdfs dfs -test -d "$OUTPUT_PARENTDIR"; then
    echo "Directory $OUTPUT_PARENTDIR already exists."
else
    echo "Creating directory $OUTPUT_PARENTDIR..."
    hdfs dfs -mkdir "$OUTPUT_PARENTDIR"
fi


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

set -e
set -x

# PHASE 1: query this crawl's massive index with an SQL query that requests just the references to warc files
# for those crawled web pages where the content_languages field's primary language is MRI (the 3-letter code for Maori).
# The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of $OUTPUT_PARENTDIR.

#OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# https://www.patricia-anong.com/blog/2017/11/1/extend-vmdk-on-virtualbox

$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages = 'mri'" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR

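# At this point $OUTPUTDIR (cc-mri-csv) should contain the gzipped part* csv files plus
# Spark's _SUCCESS marker; a quick way to look (a sketch, not in the original script):
#   hdfs dfs -ls "$OUTPUTDIR"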

# The above generates ZIPPED part*.csv files in $OUTPUTDIR (folder cc-mri-csv).
# First create a folder and unzip into it:
#hdfs dfs -test -d $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
#if [ $? == 0 ]; then
if hdfs dfs -test -d "$OUTPUT_PARENTDIR/cc-mri-unzipped-csv"; then
    echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
    echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR/cc-mri-unzipped-csv."
else
    echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv

    echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
    hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
fi
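# Optional sanity check (a sketch, not part of the original workflow): see roughly how many
# MRI records the query matched by counting the lines of the unzipped csv (any csv header
# lines are included in the count):
#   hdfs dfs -cat $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv | wc -l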



# PHASE 2: use the index of MRI warc filenames and record offsets,
# stored in the now unzipped .csv file,
# to fetch all the WARC records at those offsets.
# This will get us all the MRI pages of the common crawl for CRAWL_ID.

# Change OUTPUTDIR
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"


# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
    --csv $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-${CRAWL_ID} \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR

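# A quick way to confirm Phase 2 produced its WARC output before Phase 3 runs
# (a sketch, not in the original script):
#   hdfs dfs -ls "$OUTPUTDIR"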

# PHASE 3: convert the warc files to wet (and wat) files and copy the wet files into the mounted shared area.

#hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
#if [ $? == 0 ]; then
if hdfs dfs -test -f "$OUTPUTDIR/_SUCCESS"; then
    # ia-hadoop-tools converts warc files into wet (and wat) files but expects a particular folder structure.
    # Create that expected folder structure: a "wet" and a "wat" folder should exist
    # at the same level as the "warc" folder ($OUTPUTDIR) that contains the warc gz files.

    echo "Creating wet (text) and wat (metadata) directories to put converted warc files into..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wet
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wat
    # Move the _SUCCESS file from /warc to its parent folder
    hdfs dfs -mv $OUTPUTDIR/_SUCCESS $OUTPUT_PARENTDIR/.

    # Move from the "/home/vagrant/cc-index-table" github project to the "ia-hadoop-tools" github project:
    cd /home/vagrant/ia-hadoop-tools
    # PWD is now the ia-hadoop-tools folder.
    # To run the following, guava.jar should be on the hadoop classpath.
    # This can be achieved with:
    #   vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
    $HADOOP_MAPRED_HOME/bin/hadoop jar $PWD/target/ia-hadoop-tools-jar-with-dependencies.jar WEATGenerator -strictMode -skipExisting batch-id-xyz $OUTPUTDIR/*.warc.gz

    # Now copy the zipped wet files into the mounted folder, so we can scp them from there to where WETProcessor.java is.
    echo "Copying and tarring up the wet files"
    mkdir /vagrant/${CRAWL_ID}-wet-files
    hdfs dfs -get $OUTPUT_PARENTDIR/wet/*.warc.wet.gz /vagrant/${CRAWL_ID}-wet-files/.
    cd /vagrant
    #tar -cvf ${CRAWL_ID}.tar /vagrant/${CRAWL_ID}-wet-files
    #rm -rf /vagrant/${CRAWL_ID}-wet-files
fi
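# The commented-out tar step above would leave /vagrant/${CRAWL_ID}.tar in the mounted shared folder.
# A hypothetical follow-up from the host side (hostname and destination path are placeholders,
# not part of the original workflow) to get the tarball to where WETProcessor.java lives:
#   scp ${CRAWL_ID}.tar user@analytics-host:/path/to/WETProcessor/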