Changeset 33495

Timestamp:
22.09.2019 19:19:36
Author:
ak19
Message:

Pruned out unused commands, added comments, and marked unused variables to be removed in a future version of this script, once the full version has been tested on CC crawl 2019-26.

Files:
1 modified

Legend:

  (no prefix)  Unmodified
  +            Added
  -            Removed
  ...          unchanged lines skipped
  • gs3-extensions/maori-lang-detection/bin/script/get_maori_WET_records_for_crawl.sh

r33494 → r33495

  #!/bin/bash

- # convert URL index with Spark on Yarn
+ # This script is BASED ON the cc-index-table github project's convert_url_index script at
+ # https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
+ # That script is described as
+ # "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
+ # into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
+ # If you want to run that script, modify its variables to have the following values before
+ # running it, in order for it to work on our machine for doing analytics:
+ # EXECUTOR_MEM=3g
+ # EXECUTOR_CORES=2
+ # NUM_EXECUTORS=2
+ # DRIVER_MEM=3g
+ # Since that script was copied here, a lot of such variables (the executor- and memory-related
+ # ones, for instance) are unused here, as they were just copied directly across. Such unused
+ # variables can probably be removed from this file.
+
+ # This script was modified to do the following:
+ # Run an SQL query against CommonCrawl's distributed cc-index table on Amazon S3 for the
+ # parameterised crawl timestamp and get all those records for which the primary language in the
+ # content_languages field is MRI (the 3-letter code for Maori).
+ # Only the WARC-related fields (url, filename, offset and length) of each record are requested.
+ # The matching records' fields are then written into a distributed csv file on the local hdfs system.
+ # A second phase then requests the warc files at those offsets and downloads them onto the local hdfs.
+ # We still get zipped WARC files, but they only contain the pages of that crawl where the primary
+ # language was identified as MRI.
+ # A third phase converts those WARC files into WET (and WAT) files and copies these zipped files
+ # onto the mounted shared space on vagrant.
+
+
+ #---------------------------- START UNUSED VARIABLES---------------------------#

  # Table format configuration
  ...
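For context, the four values quoted in the header comment are the ones that feed spark-submit's resource options in the original convert_url_index.sh; the same flags appear in the commented-out spark-submit block removed further down in this diff. A minimal sketch of that mapping, shown purely as an illustration (class, jar and job arguments are elided):

    EXECUTOR_MEM=3g
    EXECUTOR_CORES=2
    NUM_EXECUTORS=2
    DRIVER_MEM=3g

    $SPARK_HOME/bin/spark-submit \
        --master yarn \
        --conf spark.driver.memory=$DRIVER_MEM \
        --num-executors $NUM_EXECUTORS \
        --executor-cores $EXECUTOR_CORES \
        --executor-memory $EXECUTOR_MEM \
        ...   # remaining --class, jar and job arguments as in the original script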
  PARTITION_BY="crawl,subset"

+
+ # Spark configuration
+ SPARK_HOME="$SPARK_HOME"
+ # EXECUTOR_MEM=44g
+ # EXECUTOR_CORES=12
+ # NUM_EXECUTORS=4
+ # DRIVER_MEM=4g
+
+ #--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
+ # as follows in order to get that spark job to run. Not used in this script. ---#
+ EXECUTOR_MEM=3g
+ EXECUTOR_CORES=2
+ NUM_EXECUTORS=2
+ DRIVER_MEM=3g
+
+ #--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
+ SPARK_ON_YARN="--master yarn"
+ SPARK_EXTRA_OPTS=""
+
+ # source specific configuration file
+ ## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh
+
+ #---------------------------- END UNUSED VARIABLES---------------------------#
+
+
+ # The crawl timestamp, of the form CC-MAIN-2019-26
+ # Obtain from http://index.commoncrawl.org/
+ CRAWL_ID=$1
+ if [ "x$CRAWL_ID" == "x" ]; then
+     echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
+     echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
+     exit
+ fi
  # Output directory
- CRAWL_ID=$1
  OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
      # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs:
  ...
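With the argument check above in place, a typical invocation of this script would look something like the line below. This is a hypothetical example (it assumes the script is run from a directory that also contains the target/cc-spark jar referenced further down as _APPJAR), not a command recorded in this changeset:

    ./get_maori_WET_records_for_crawl.sh CC-MAIN-2019-26    # crawl timestamp chosen from http://index.commoncrawl.org/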
  
  
- # Spark configuration
- SPARK_HOME="$SPARK_HOME"
- # EXECUTOR_MEM=44g
- # EXECUTOR_CORES=12
- # NUM_EXECUTORS=4
- # DRIVER_MEM=4g
-
- EXECUTOR_MEM=3g
- EXECUTOR_CORES=2
- NUM_EXECUTORS=2
- DRIVER_MEM=3g
-
-
- SPARK_ON_YARN="--master yarn"
- SPARK_EXTRA_OPTS=""
-
- # source specific configuration file
- ## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh
-
-
  _APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

  ...
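The jar assigned to _APPJAR above is the build artefact of the cc-index-table project rather than anything produced by this changeset. Assuming a standard Maven build of that project (an assumption for illustration; check that project's README for the authoritative steps), it would be obtained roughly like this:

    git clone https://github.com/commoncrawl/cc-index-table.git
    cd cc-index-table
    mvn package    # expected to leave target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar behind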
  set -x

- OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
+ # PHASE 1: querying this crawl's massive index with an SQL query that requests just the references to warc files
+ # for those crawled web pages where the content_languages field's primary language is MRI (3 letter code for Maori)
+ # The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of the $OUTPUT_PARENTDIR.
+
+ #OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
+ OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

  #   --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
  #   --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
- #    --conf spark.hadoop.fs.s3a.access.key=AKIA2EVQBWSTBJ2M4BLM \
- #    --conf spark.hadoop.fs.s3a.secret.key=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 \


  ...
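The spark-submit command that actually runs this phase sits in the unchanged lines skipped here, but the shape of its query follows from the PHASE 1 comment above: select only the WARC location fields for records whose primary content language is MRI. A sketch of such a query, assuming the standard column names of Common Crawl's cc-index table (an illustration, not the exact query string used by this script):

    SQL_QUERY="SELECT url, warc_filename, warc_record_offset, warc_record_length
               FROM ccindex
               WHERE crawl = '${CRAWL_ID}' AND subset = 'warc'
                 AND content_languages LIKE 'mri%'"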
  if [ $? == 0 ]; then
      echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
+     echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR"
  else
      echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
      hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
- fi
-
- echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
- hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
-
-
- # Now onto phase 2, which uses the index of MRI warc URLs and offsets,
+
+     echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
+     hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
+ fi
+
+
+
+ # PHASE 2, which uses the index of MRI warc URLs and offsets,
  # stored in the now unzipped .csv file,
  # to get all the WARC records it specifies at the specified warc offsets.
  ...
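Each row of the cc-mri.csv produced in phase 1 identifies one WARC file plus a record offset and length. To make "the specified warc offsets" concrete: a single such record could in principle be pulled down by hand with an HTTP range request, as sketched below with hypothetical values (the script itself does the bulk download through a Spark job, not through curl):

    # one hypothetical csv row, giving url, warc filename, record offset and record length
    filename="crawl-data/CC-MAIN-2019-26/segments/.../warc/CC-MAIN-....warc.gz"   # hypothetical path
    offset=1234567
    length=8910
    curl -s -r "${offset}-$((offset + length - 1))" \
        "https://commoncrawl.s3.amazonaws.com/${filename}" > one-record.warc.gz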
  OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"

- # $SPARK_HOME/bin/spark-submit \
- #     $SPARK_ON_YARN \
- #     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
- #     --conf spark.core.connection.ack.wait.timeout=600s \
- #     --conf spark.network.timeout=120s \
- #     --conf spark.task.maxFailures=20 \
- #     --conf spark.shuffle.io.maxRetries=20 \
- #     --conf spark.shuffle.io.retryWait=60s \
- #     --conf spark.driver.memory=$DRIVER_MEM \
- #     --conf spark.executor.memory=$EXECUTOR_MEM \
- #     $SPARK_EXTRA_OPTS \
- #     --num-executors $NUM_EXECUTORS \
- #     --executor-cores $EXECUTOR_CORES \
- #     --executor-memory $EXECUTOR_MEM \
- #     --conf spark.hadoop.parquet.enable.dictionary=true \
- #     --conf spark.sql.parquet.filterPushdown=true \
- #     --conf spark.sql.parquet.mergeSchema=false \
- #     --conf spark.sql.hive.metastorePartitionPruning=true \
- #     --conf spark.hadoop.parquet.enable.summary-metadata=false \
- #     --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
- #     --outputCompression=$COMPRS \
- #     --outputFormat=$FORMAT $NESTED \
- #     --partitionBy=$PARTITION_BY \
- #     "$DATA" "$OUTPUTDIR"

  #   --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
  ...
  
  
- # Phase 3: convert warc files to wet files and tar them up into the mounted shared area
+ # PHASE 3: convert warc files to wet files and copy the wet files into the mounted shared area

  hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
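The _SUCCESS file tested for here is the marker a Hadoop/Spark job writes once its output directory is complete, so phase 3 only proceeds when the WARC output of phase 2 is all there. The final step named in the PHASE 3 comment, copying the zipped WET files onto the vagrant shared space, amounts to an hdfs-to-local copy along these lines; both the wet output path and the /vagrant mount point below are assumptions for illustration, not paths recorded in this changeset:

    if hdfs dfs -test -f $OUTPUTDIR/_SUCCESS; then
        # hypothetical locations: adjust to wherever the WET files and the shared mount actually live
        hdfs dfs -get hdfs:///user/vagrant/${CRAWL_ID}/wet/*.warc.wet.gz /vagrant/${CRAWL_ID}/
    fi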