Changeset 33495


Timestamp:
2019-09-22T19:19:36+12:00
Author:
ak19
Message:

Pruned unused commands, added comments, and marked unused variables for removal in a future version of this script, once the full script has been tested on CC crawl 2019-26.

File:
1 edited

Legend:

  ' '  Unmodified
  '+'  Added
  '-'  Removed
  '…'  Unchanged lines elided
  • gs3-extensions/maori-lang-detection/bin/script/get_maori_WET_records_for_crawl.sh

--- r33494
+++ r33495

 #!/bin/bash

-# convert URL index with Spark on Yarn
+# This script is BASED ON the cc-index-table github project's convert_url_index script at
+# https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh
+# That script is described as
+# "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format)
+# into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)
+# If you want to run that script, modify its variables to have the following values before
+# running it, in order for it to work on our machine for doing analytics:
+# EXECUTOR_MEM=3g
+# EXECUTOR_CORES=2
+# NUM_EXECUTORS=2
+# DRIVER_MEM=3g
+# Since that script was copied here, many of its variables (such as the executor- and
+# memory-related ones) are unused in this script, as they were copied across directly.
+# Such unused variables can probably be removed from this file.
+# This script was modified to do the following:
+# SQL-query CommonCrawl's distributed cc-index table on Amazon S3 for the parameterised crawl timestamp
+# and get all those records for which the primary language in the content_languages field is MRI (Maori).
+# Only the WARC-related fields (url, filename, offset and length) of each record are requested.
+# The matching records' fields are then written out as a distributed csv file on the local hdfs system.
+# A second phase then requests the warc files at those offsets and downloads them onto the local hdfs.
+# We still get zipped WARC files, but they only contain the pages of that crawl where the primary language
+# was identified as MRI.
+# A third phase converts those WARC files into WET (and WAT) files and copies these zipped files onto the
+# mounted shared space on vagrant.
+
+
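To make the Phase 1 description concrete, here is a minimal sketch of what the SQL against the cc-index table could look like. The column names (url, warc_filename, warc_record_offset, warc_record_length, content_languages) come from the cc-index-table project's documented schema; the language predicate is an assumption, since the script's actual query falls outside the hunks shown in this changeset.

    # Hypothetical Phase 1 query -- not the script's verbatim SQL.
    # content_languages holds comma-separated ISO 639-3 codes with the primary
    # language first, so a prefix match approximates "primary language is Maori".
    QUERY="SELECT url, warc_filename, warc_record_offset, warc_record_length
           FROM ccindex
           WHERE crawl = '${CRAWL_ID}'
             AND subset = 'warc'
             AND content_languages LIKE 'mri%'"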
+#---------------------------- START UNUSED VARIABLES ---------------------------#

 # Table format configuration
…
 PARTITION_BY="crawl,subset"

+
+# Spark configuration
+SPARK_HOME="$SPARK_HOME"
+# EXECUTOR_MEM=44g
+# EXECUTOR_CORES=12
+# NUM_EXECUTORS=4
+# DRIVER_MEM=4g
+
+#--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh,
+# as follows in order to get that spark job to run. Not used in this script. ---#
+EXECUTOR_MEM=3g
+EXECUTOR_CORES=2
+NUM_EXECUTORS=2
+DRIVER_MEM=3g
+
+#--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---#
+SPARK_ON_YARN="--master yarn"
+SPARK_EXTRA_OPTS=""
+
+# source specific configuration file
+## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh
+
+#---------------------------- END UNUSED VARIABLES ---------------------------#
+
+
+# The crawl timestamp, of the form CC-MAIN-2019-26
+# Obtain from http://index.commoncrawl.org/
+CRAWL_ID=$1
+if [ "x$CRAWL_ID" == "x" ]; then
+    echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT."
+    echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/"
+    exit
+fi
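For illustration, the expected invocation (the script name is taken from the file path above, the crawl id from the error message):

    ./get_maori_WET_records_for_crawl.sh CC-MAIN-2019-26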
 # Output directory
-CRAWL_ID=$1
 OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID}
      # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs:
…


-# Spark configuration
-SPARK_HOME="$SPARK_HOME"
-# EXECUTOR_MEM=44g
-# EXECUTOR_CORES=12
-# NUM_EXECUTORS=4
-# DRIVER_MEM=4g
-
-EXECUTOR_MEM=3g
-EXECUTOR_CORES=2
-NUM_EXECUTORS=2
-DRIVER_MEM=3g
-
-
-SPARK_ON_YARN="--master yarn"
-SPARK_EXTRA_OPTS=""
-
-# source specific configuration file
-## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh
-
-
 _APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar

…
 set -x

-OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
+# PHASE 1: query this crawl's massive index with an SQL query that requests just the references to warc files
+# for those crawled web pages where the content_languages field's primary language is MRI (the 3-letter code for Maori).
+# The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of $OUTPUT_PARENTDIR.
+
+#OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv"
+OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv"

 #   --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
 #   --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \
-#    --conf spark.hadoop.fs.s3a.access.key=AKIA2EVQBWSTBJ2M4BLM \
-#    --conf spark.hadoop.fs.s3a.secret.key=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 \


…
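The Phase 1 spark-submit command itself lies in the elided region above. As a hedged sketch only: the cc-index-table project ships a CCIndexExport example job that takes such a query and writes the result out as csv, so the call presumably resembles the following. The class name and flags are from that project's README, not from this changeset, and the query variable is the guess sketched earlier.

    # Hypothetical sketch, not the script's verbatim command:
    $SPARK_HOME/bin/spark-submit \
        $SPARK_ON_YARN \
        --class org.commoncrawl.spark.examples.CCIndexExport $_APPJAR \
        --query "$QUERY" \
        --outputFormat csv \
        --outputCompression gzip \
        s3a://commoncrawl/cc-index/table/cc-main/warc/ \
        "$OUTPUTDIR"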
 if [ $? == 0 ]; then
     echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}."
+    echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR"
 else
     echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..."
     hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
-fi
-
-echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
-hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
-
-
-# Now onto phase 2, which uses the index of MRI warc URLs and offsets,
+
+    echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv"
+    hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv
+fi
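A note on the unzip pipeline above: concatenated gzip streams decompress as one stream, so `hdfs dfs -cat` over all the compressed part* files, piped through `gzip -d` and into `hdfs dfs -put -`, yields a single plain-text cc-mri.csv without staging anything on the local filesystem. A quick, illustrative sanity check afterwards:

    hdfs dfs -ls $OUTPUT_PARENTDIR/cc-mri-unzipped-csv
    hdfs dfs -cat $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv | head -5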
+
+
+
+# PHASE 2 uses the index of MRI warc URLs and offsets,
 # stored in the now unzipped .csv file,
 # to get all the WARC records it specifies at the specified warc offsets.
…
 OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc"

-# $SPARK_HOME/bin/spark-submit \
-#     $SPARK_ON_YARN \
-#     --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
-#     --conf spark.core.connection.ack.wait.timeout=600s \
-#     --conf spark.network.timeout=120s \
-#     --conf spark.task.maxFailures=20 \
-#     --conf spark.shuffle.io.maxRetries=20 \
-#     --conf spark.shuffle.io.retryWait=60s \
-#     --conf spark.driver.memory=$DRIVER_MEM \
-#     --conf spark.executor.memory=$EXECUTOR_MEM \
-#     $SPARK_EXTRA_OPTS \
-#     --num-executors $NUM_EXECUTORS \
-#     --executor-cores $EXECUTOR_CORES \
-#     --executor-memory $EXECUTOR_MEM \
-#     --conf spark.hadoop.parquet.enable.dictionary=true \
-#     --conf spark.sql.parquet.filterPushdown=true \
-#     --conf spark.sql.parquet.mergeSchema=false \
-#     --conf spark.sql.hive.metastorePartitionPruning=true \
-#     --conf spark.hadoop.parquet.enable.summary-metadata=false \
-#     --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \
-#     --outputCompression=$COMPRS \
-#     --outputFormat=$FORMAT $NESTED \
-#     --partitionBy=$PARTITION_BY \
-#     "$DATA" "$OUTPUTDIR"

 #   --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \
…


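As with Phase 1, the Phase 2 spark-submit call falls in an elided region. The cc-index-table project documents a companion CCIndexWarcExport job that reads a csv of (url, filename, offset, length) records and fetches the corresponding WARC records from S3; a sketch under that assumption, with all flags and values below illustrative and taken from that project's README rather than this changeset:

    # Hypothetical sketch, not the script's verbatim command:
    $SPARK_HOME/bin/spark-submit \
        $SPARK_ON_YARN \
        --class org.commoncrawl.spark.examples.CCIndexWarcExport $_APPJAR \
        --csv $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv \
        --numOutputPartitions 10 \
        --numRecordsPerWarcFile 5000 \
        --warcPrefix MAORI-${CRAWL_ID} \
        s3://commoncrawl/cc-index/table/cc-main/warc/ \
        "$OUTPUTDIR"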
-# Phase 3: convert warc files to wet files and tar them up into the mounted shared area
+# PHASE 3: convert warc files to wet files and copy the wet files into the mounted shared area

 hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
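The excerpt ends at the `_SUCCESS` test. `hdfs dfs -test -f` only sets the shell's exit status, so Phase 3 presumably branches on `$?` in the same style as the cc-mri-unzipped-csv check earlier; an illustrative continuation (the actual Phase 3 commands are not shown in this changeset):

    hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
    if [ $? == 0 ]; then
        echo "WARC download phase succeeded; converting WARC files to WET/WAT..."
        # ...WARC-to-WET conversion and the copy to the mounted vagrant share would go here...
    else
        echo "No _SUCCESS marker in $OUTPUTDIR; not generating WET files."
    fi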