#!/bin/bash
#
# Driver script: runs get_maori_WET_records_for_crawl.sh once for each
# CommonCrawl crawl id listed below, stopping at the first failure.
#
# This script should be put into the cc-index-table (github project) toplevel
# folder. It works with the get_maori_WET_records_for_crawl.sh script, which
# has to be placed in cc-index-table/src/script.
#
# NOTE: the helper script is invoked via a path relative to the current
# working directory, so run this script from the cc-index-table toplevel
# folder.
#
# Exit status: 0 if every crawl id was processed successfully; otherwise the
# exit status of the first failing helper invocation.

# crawl_ids are from http://index.commoncrawl.org/
# We only want the crawl_ids from Sep 2018 and onwards, as that's when the
# content_languages field was included in CommonCrawl's columnar index.
#
# Array looping reference: https://www.cyberciti.biz/faq/bash-for-loop-array/
# (Newlines are allowed inside an array literal, so no backslash line
# continuations are needed. For wrapping other long commands, see
# https://superuser.com/questions/237072/wrapping-long-bash-commands-in-script-files)
crawl_ids=(
  "CC-MAIN-2019-35" "CC-MAIN-2019-30" "CC-MAIN-2019-26"
  "CC-MAIN-2019-22" "CC-MAIN-2019-18" "CC-MAIN-2019-13"
  "CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51"
  "CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39"
)

for crawl_id in "${crawl_ids[@]}"; do
  echo "About to start off index and WARC download process for CRAWL ID: $crawl_id"

  # Capture the helper's exit status explicitly; it stays 0 on success.
  result=0
  ./src/script/get_maori_WET_records_for_crawl.sh "$crawl_id" || result=$?

  # Compare the exit status numerically and abort on the first failure so
  # that later crawls are not attempted after an error.
  if (( result != 0 )); then
    # Diagnostics go to stderr so they are not mixed into progress output.
    echo "Processing common-crawl $crawl_id failed with exit value: $result" >&2
    echo "Will cease to process remaining cc crawls. Exitting..." >&2
    exit "$result"
  fi
done

# Important note from
# https://www.tldp.org/LDP/abs/html/exit-status.html
# "When a script ends with an exit that has no parameter, the exit
# status of the script is the exit status of the last command executed
# in the script (previous to the exit)."