#!/bin/bash

# This script is BASED ON the cc-index-table GitHub project's convert_url_index script at
# https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
# That script is described as
# "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
# into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
# If you want to run that script, modify its variables to the following values before running it,
# so that it works on our machine for doing analytics:
# EXECUTOR_MEM=3g
# EXECUTOR_CORES=2
# NUM_EXECUTORS=2
# DRIVER_MEM=3g
# Because that script was copied here wholesale, a number of its variables (such as the executor-
# and memory-related ones) are unused in this script. Such unused variables can probably be
# removed from this file.

# This script was modified to do the following:
# Phase 1: SQL-query Common Crawl's distributed cc-index table on Amazon S3 for the parameterised
# crawl timestamp and get all those records whose content_languages field has MRI (the 3-letter
# code for Maori) as the primary language.
# Only the WARC-related fields (url, filename, offset and length) of each record are requested.
# The matching records' fields are then written out as a distributed csv file on the local hdfs.
# Phase 2: request the WARC records at those filenames and offsets and download them onto the local hdfs.
# We still get zipped WARC files, but they only contain the pages of that crawl where the primary
# language was identified as MRI.
# Phase 3: convert those WARC files into WET (and WAT) files and copy the zipped WET files onto the
# mounted shared space on vagrant.

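# Example usage (the script filename below is a placeholder for whatever this file is saved as;
# the single argument is the crawl timestamp, validated further down):
#    ./export_mri_pages_from_cc.sh CC-MAIN-2019-26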

#---------------------------- START UNUSED VARIABLES ----------------------------#

# Table format configuration
FORMAT=${FORMAT:-parquet} # parquet, orc
NESTED="$NESTED" # "" (empty) or --useNestedSchema
COMPRS=${COMPRS:-gzip} # gzip, snappy, lzo, none
PARTITION_BY="crawl,subset"


# Spark configuration
SPARK_HOME="$SPARK_HOME"
# EXECUTOR_MEM=44g
# EXECUTOR_CORES=12
# NUM_EXECUTORS=4
# DRIVER_MEM=4g

#--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
# as follows in order to get that spark job to run. Not used in this script. ---#
EXECUTOR_MEM=3g
EXECUTOR_CORES=2
NUM_EXECUTORS=2
DRIVER_MEM=3g

#--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
SPARK_ON_YARN="--master yarn"
SPARK_EXTRA_OPTS=""

# source specific configuration file
## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh

#---------------------------- END UNUSED VARIABLES ----------------------------#


# The crawl timestamp, of the form CC-MAIN-2019-26
# Obtain one from http://index.commoncrawl.org/
CRAWL_ID=$1
if [ "x$CRAWL_ID" == "x" ]; then
    echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
    echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
    exit 1
fi
# Output directory
OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
# or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs:

# https://stackoverflow.com/questions/26513861/checking-if-directory-in-hdfs-already-exists-or-not
hdfs dfs -test -d $OUTPUT_PARENTDIR
if [ $? == 0 ]; then
    echo "Directory $OUTPUT_PARENTDIR already exists."
else
    echo "Creating directory $OUTPUT_PARENTDIR..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR
fi


_APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

##export LIBJARS=/home/vagrant/lib/hadoop-aws-2.7.6.jar
#export LIBJARS=/home/vagrant/lib/*
#export HADOOP_CLASSPATH=`echo ${LIBJARS} | sed s/,/:/g`

# Exit on the first failing command, and echo each command as it is run
set -e
set -x

# PHASE 1: query this crawl's massive index with an SQL query that requests just the references to the WARC
# files (filename, offset, length) for those crawled web pages whose content_languages field has MRI
# (the 3-letter code for Maori) as its primary language.
# The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of $OUTPUT_PARENTDIR.

#OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
# /home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar
# --jars file:/home/vagrant/aws-java-sdk-1.7.4.jar,file:/home/vagrant/lib/hadoop-aws-2.7.6.jar \
# --driver-class-path=/home/vagrant/lib/aws-java-sdk-1.7.4.jar:/home/vagrant/lib/hadoop-aws-2.7.6.jar \

# https://www.patricia-anong.com/blog/2017/11/1/extend-vmdk-on-virtualbox

$SPARK_HOME/bin/spark-submit \
    --jars file:/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar,file:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --driver-class-path=/home/vagrant/.m2/repository/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar:/home/vagrant/.m2/repository/org/apache/hadoop/hadoop-aws/2.7.6/hadoop-aws-2.7.6.jar \
    --conf spark.hadoop.parquet.enable.dictionary=true \
    --conf spark.hadoop.parquet.enable.summary-metadata=false \
    --conf spark.sql.hive.metastorePartitionPruning=true \
    --conf spark.sql.parquet.filterPushdown=true \
    --conf spark.sql.parquet.mergeSchema=true \
    --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length
             FROM ccindex
             WHERE crawl = '${CRAWL_ID}' AND subset = 'warc' AND content_languages = 'mri'" \
    --outputFormat csv \
    --numOutputPartitions 10 \
    --outputCompression gzip \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR
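
# A quick manual check of what PHASE 1 produced (illustrative commands, not run by this script):
#    hdfs dfs -ls $OUTPUTDIR
# should list gzipped part* csv files; each row holds the four columns selected above, i.e.
#    url,warc_filename,warc_record_offset,warc_record_length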


# The above generates ZIPPED part*.csv files in $OUTPUTDIR (folder cc-mri-csv).
# First create a folder and unzip the part files into a single cc-mri.csv inside it.
# (With 'set -e' in effect, 'hdfs dfs -test' must be run as the if-condition itself,
# otherwise its non-zero exit status would abort the whole script.)
if hdfs dfs -test -d $OUTPUT_PARENTDIR/cc-mri-unzipped-csv; then
    echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
    echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR/cc-mri-unzipped-csv"
else
    echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv

    echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
    hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
fi
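
# Optional sanity check (illustrative, run manually): count how many MRI records PHASE 1 found.
#    hdfs dfs -cat $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv | wc -l
# (The count may include csv header lines, if any were written by the export.)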


# PHASE 2: use the index of MRI WARC filenames and offsets, stored in the now-unzipped .csv file,
# to fetch all the WARC records it specifies at the specified warc offsets.
# This will get us all the MRI pages of the Common Crawl for CRAWL_ID.

# Change OUTPUTDIR to the warc subfolder for this phase
OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"


# --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
# --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \


# $SPARK_ON_YARN \
$SPARK_HOME/bin/spark-submit \
    --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
    --csv $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv \
    --numOutputPartitions 10 \
    --numRecordsPerWarcFile 5000 \
    --warcPrefix MAORI-${CRAWL_ID} \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    $OUTPUTDIR
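
# To confirm PHASE 2 wrote its WARC output (illustrative, run manually):
#    hdfs dfs -ls $OUTPUTDIR/*.warc.gz
# The gzipped WARC files should be named using the --warcPrefix given above, and a _SUCCESS marker
# appears in $OUTPUTDIR when the Spark job completes (PHASE 3 below checks for it).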


# PHASE 3: convert the WARC files to WET (and WAT) files and copy the WET files into the mounted shared area

if hdfs dfs -test -f $OUTPUTDIR/_SUCCESS; then
    # ia-hadoop-tools converts warc files into wet (and wat) files but expects a particular folder structure.
    # Create the expected folder structure: a "wet" and a "wat" folder should exist
    # at the same level as the "warc" folder ($OUTPUTDIR) that contains the warc gz files.

    echo "Creating wet (text) and wat (metadata) directories to put converted warc files into..."
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wet
    hdfs dfs -mkdir $OUTPUT_PARENTDIR/wat
    # move the _SUCCESS file from /warc to its parent folder
    hdfs dfs -mv $OUTPUTDIR/_SUCCESS $OUTPUT_PARENTDIR/.

    # Move from the "/home/vagrant/cc-index-table" github project to the "ia-hadoop-tools" github project:
    cd /home/vagrant/ia-hadoop-tools
    # PWD is now the ia-hadoop-tools folder
    # To run the following, guava.jar should be on the hadoop classpath.
    # This can be achieved with:
    # vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
    $HADOOP_MAPRED_HOME/bin/hadoop jar $PWD/target/ia-hadoop-tools-jar-with-dependencies.jar WEATGenerator -strictMode -skipExisting batch-id-xyz $OUTPUTDIR/*.warc.gz

    # Now copy the zipped wet files into the mounted folder, so we can scp them from there to where WETProcessor.java is
    echo "Copying and tarring up the wet files"
    mkdir /vagrant/${CRAWL_ID}-wet-files
    hdfs dfs -get $OUTPUT_PARENTDIR/wet/*.warc.wet.gz /vagrant/${CRAWL_ID}-wet-files/.
    cd /vagrant
    #tar -cvf ${CRAWL_ID}.tar /vagrant/${CRAWL_ID}-wet-files
    #rm -rf /vagrant/${CRAWL_ID}-wet-files
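
    # From the host side, the wet files can then be copied to wherever WETProcessor.java lives.
    # Illustrative command only (the hostname and destination path are placeholders, not real values):
    #    scp -r /vagrant/${CRAWL_ID}-wet-files user@analytics-machine:/path/to/wet-processing/input/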
fi