Changeset 33526 for gs3-extensions
- Timestamp:
- 2019-09-26T20:38:14+12:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 2 deleted
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/hdfs-instructions/scripts/get_Maori_WET_records_from_CCSep2018_on.sh
--- r33525
+++ r33526
@@ -3,14 +3,29 @@
 # crawl_ids are from http://index.commoncrawl.org/
 # We only want the crawl_ids from Sep 2018 and onwards as that's when
-# the content_languages field was included in CommonCrawl's index
+# the content_languages field was included in CommonCrawl's columnar index
 
 # https://www.cyberciti.biz/faq/bash-for-loop-array/
-#crawl_ids=( "CC-MAIN-2019-35" "CC-MAIN-2019-30" "CC-MAIN-2019-26" "CC-MAIN-2019-22" "CC-MAIN-2019-18" "CC-MAIN-2019-13" "CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51" "CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39" )
-
-crawl_ids=( "CC-MAIN-2019-18" "CC-MAIN-2019-13" "CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51" "CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39" )
+# (else chain commands as at https://superuser.com/questions/237072/wrapping-long-bash-commands-in-script-files)
+crawl_ids=( "CC-MAIN-2019-35" "CC-MAIN-2019-30" "CC-MAIN-2019-26" \
+"CC-MAIN-2019-22" "CC-MAIN-2019-18" "CC-MAIN-2019-13" \
+"CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51" \
+"CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39" )
 
 for crawl_id in "${crawl_ids[@]}"
 do
 echo "About to start off index and WARC download process for CRAWL ID: $crawl_id"
-./src/script/get_maori_WET_records_for_crawl.sh $crawl_id
+./cc-index-table/src/script/get_maori_WET_records_for_crawl.sh $crawl_id
+result=$?
+if [ $result != 0 ]; then
+echo "Processing common-crawl $crawl_id failed with exit value: $result"
+echo "Will cease to process remaining cc crawls. Exitting..."
+exit $result
+fi
 done
+
+# Important note from
+# https://www.tldp.org/LDP/abs/html/exit-status.html
+# "When a script ends with an exit that has no parameter, the exit
+# status of the script is the exit status of the last command executed
+# in the script (previous to the exit)."
+
Note: See TracChangeset for help on using the changeset viewer.