Changeset 33495
- Timestamp:
- 2019-09-22T19:19:36+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/bin/script/get_maori_WET_records_for_crawl.sh
r33494 r33495 1 1 #!/bin/bash 2 2 3 # convert URL index with Spark on Yarn 3 # This script is BASED ON the cc-index-table github project's convert_url_index script at 4 # https://github.com/commoncrawl/cc-index-table/blob/master/src/script/convert_url_index.sh 5 # That script is described as 6 # "A Spark job converts the Common Crawl URL index files (a sharded gzipped index in CDXJ format) 7 # into a table in Parquet or ORC format." (https://github.com/commoncrawl/cc-index-table)." 8 # If you want to run that script, then modify its variables to have the following values before 9 # running it, in order for it to work on our machine for doing analytics: 10 # EXECUTOR_MEM=3g 11 # EXECUTOR_CORES=2 12 # NUM_EXECUTORS=2 13 # DRIVER_MEM=3g 14 # Since that script was copied here, as a result, a lot of such variables (like executor 15 # and memory related) are unused here, as they were just copied directly across. Such unused 16 # variables can probably be removed from this file. 17 18 # This script was modified to do the following: 19 # SQL query CommonCrawl's distributed cc-index table on Amazon S3 for the parameterised crawl timestamp 20 # and get all those records for which the primary language in the content_languages field is MRI for Maori. 21 # Only the WARC related fields (url, filename, offset and length fields) of each record are requested. 22 # The matching records' fields are then constructed into a distributed csv file on the local hdfs system 23 # A second phase then requests the warc files at those offsets and downloads them onto the local hdfs. 24 # We still get zipped WARC files, but they only contain the pages of that crawl where the primary language 25 # was identified as MRI. 26 # A third phase converts those WARC files into WET (and WAT) files and copies these zipped files onto the 27 # mounted shared space on vagrant. 
28 29 30 #---------------------------- START UNUSED VARIABLES---------------------------# 4 31 5 32 # Table format configuration … … 9 36 PARTITION_BY="crawl,subset" 10 37 38 39 # Spark configuration 40 SPARK_HOME="$SPARK_HOME" 41 # EXECUTOR_MEM=44g 42 # EXECUTOR_CORES=12 43 # NUM_EXECUTORS=4 44 # DRIVER_MEM=4g 45 46 #--- Dr Bainbridge modified the above variables in the original script, convert_url_index.sh, 47 # as follows in order to get that spark job to run. Not used in this script. ---# 48 EXECUTOR_MEM=3g 49 EXECUTOR_CORES=2 50 NUM_EXECUTORS=2 51 DRIVER_MEM=3g 52 53 #--- VARIABLES PROBABLY ALSO NOT OF USE IN THIS SCRIPT ---# 54 SPARK_ON_YARN="--master yarn" 55 SPARK_EXTRA_OPTS="" 56 57 # source specific configuration file 58 ## test -e $(dirname $0)/convert_url_index_conf.sh && . $(dirname $0)/convert_url_index_conf.sh 59 60 #---------------------------- END UNUSED VARIABLES---------------------------# 61 62 63 # The crawl timestamp, of the form CC-MAIN-2019-26 64 # Obtain from http://index.commoncrawl.org/ 65 CRAWL_ID=$1 66 if [ "x$CRAWL_ID" == "x" ]; then 67 echo "No crawl timestamp provided. Should be of the form CC-MAIN-YYYY-COUNT." 68 echo "e.g. CC-MAIN-2019-26. Choose a crawl timestamp from http://index.commoncrawl.org/" 69 exit 70 fi 11 71 # Output directory 12 CRAWL_ID=$113 72 OUTPUT_PARENTDIR=hdfs:///user/vagrant/${CRAWL_ID} 14 73 # or just OUTPUT_PARENTDIR=/user/vagrant/${CRAWL_ID}, since /user/vagrant is on hdfs: … … 24 83 25 84 26 # Spark configuration27 SPARK_HOME="$SPARK_HOME"28 # EXECUTOR_MEM=44g29 # EXECUTOR_CORES=1230 # NUM_EXECUTORS=431 # DRIVER_MEM=4g32 33 EXECUTOR_MEM=3g34 EXECUTOR_CORES=235 NUM_EXECUTORS=236 DRIVER_MEM=3g37 38 39 SPARK_ON_YARN="--master yarn"40 SPARK_EXTRA_OPTS=""41 42 # source specific configuration file43 ## test -e $(dirname $0)/convert_url_index_conf.sh && . 
$(dirname $0)/convert_url_index_conf.sh44 45 46 85 _APPJAR=$PWD/target/cc-spark-0.2-SNAPSHOT-jar-with-dependencies.jar 47 86 … … 53 92 set -x 54 93 55 OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv" 94 # PHASE 1: querying this crawl's massive index with an SQL query that requests just the references to warc files 95 # for those crawled web pages where the content_languages field's primary language is MRI (3 letter code for Maori) 96 # The output is a distributed .csv file which will be stored in a "cc-mri-csv" subfolder of the $OUTPUT_PARENTDIR. 97 98 #OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/cc-mri-csv" 99 OUTPUTDIR="${OUTPUT_PARENTDIR}/cc-mri-csv" 56 100 57 101 # --conf spark.hadoop.fs.s3a.access.key=ACCESSKEY \ 58 102 # --conf spark.hadoop.fs.s3a.secret.key=SECRETKEY \ 59 # --conf spark.hadoop.fs.s3a.access.key=AKIA2EVQBWSTBJ2M4BLM \60 # --conf spark.hadoop.fs.s3a.secret.key=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 \61 103 62 104 … … 92 134 if [ $? == 0 ]; then 93 135 echo "Directory cc-mri-unzipped-csv already exists for crawl ${CRAWL_ID}." 136 echo "Assuming cc-mri.csv also exists inside $OUTPUT_PARENTDIR" 94 137 else 95 138 echo "Creating directory $OUTPUT_PARENTDIR/cc-mri-unzipped-csv..." 
96 139 hdfs dfs -mkdir $OUTPUT_PARENTDIR/cc-mri-unzipped-csv 97 fi 98 99 echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv" 100 hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv 101 102 103 # Now onto phase 2, which uses the index of MRI warc URLs and offsets, 140 141 echo "Unzipping ${OUTPUTDIR}/part files into $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv" 142 hdfs dfs -cat $OUTPUTDIR/part* | gzip -d | hdfs dfs -put - $OUTPUT_PARENTDIR/cc-mri-unzipped-csv/cc-mri.csv 143 fi 144 145 146 147 # PHASE 2, which uses the index of MRI warc URLs and offsets, 104 148 # stored in the now unzipped .csv file, 105 149 # to get all the WARC records it specifies at the specified warc offsets. … … 109 153 OUTPUTDIR="hdfs:///user/vagrant/${CRAWL_ID}/warc" 110 154 111 # $SPARK_HOME/bin/spark-submit \112 # $SPARK_ON_YARN \113 # --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \114 # --conf spark.core.connection.ack.wait.timeout=600s \115 # --conf spark.network.timeout=120s \116 # --conf spark.task.maxFailures=20 \117 # --conf spark.shuffle.io.maxRetries=20 \118 # --conf spark.shuffle.io.retryWait=60s \119 # --conf spark.driver.memory=$DRIVER_MEM \120 # --conf spark.executor.memory=$EXECUTOR_MEM \121 # $SPARK_EXTRA_OPTS \122 # --num-executors $NUM_EXECUTORS \123 # --executor-cores $EXECUTOR_CORES \124 # --executor-memory $EXECUTOR_MEM \125 # --conf spark.hadoop.parquet.enable.dictionary=true \126 # --conf spark.sql.parquet.filterPushdown=true \127 # --conf spark.sql.parquet.mergeSchema=false \128 # --conf spark.sql.hive.metastorePartitionPruning=true \129 # --conf spark.hadoop.parquet.enable.summary-metadata=false \130 # --class org.commoncrawl.spark.CCIndex2Table $_APPJAR \131 # --outputCompression=$COMPRS \132 # --outputFormat=$FORMAT $NESTED \133 # --partitionBy=$PARTITION_BY \134 # "$DATA" "$OUTPUTDIR"135 155 136 156 # --conf 
spark.hadoop.fs.s3a.access.key=ACCESSKEY \ … … 150 170 151 171 152 # Phase 3: convert warc files to wet files and tar them up into the mounted shared area 172 # PHASE 3: convert warc files to wet files and copy the wet files into the mounted shared area 153 173 154 174 hdfs dfs -test -f $OUTPUTDIR/_SUCCESS
Note:
See TracChangeset
for help on using the changeset viewer.