#!/bin/bash

# Runs get_maori_WET_records_for_crawl.sh once for each CommonCrawl crawl id
# listed below, stopping at the first crawl whose processing fails.
#
# This script should be put into the cc-index-table (GitHub project) top-level
# folder. It works with the get_maori_WET_records_for_crawl.sh script, which
# has to be placed in cc-index-table/src/script.
# The helper is invoked through the relative path ./src/script/..., so this
# script has to be started from that top-level folder.
#
# Exit status: non-zero (the helper's own exit value) as soon as one crawl
# fails; otherwise see the note at the end of the file.

# crawl_ids are from http://index.commoncrawl.org/
# We only want the crawl_ids from Sep 2018 and onwards as that's when
# the content_languages field was included in CommonCrawl's columnar index.

# Looping over a bash array: https://www.cyberciti.biz/faq/bash-for-loop-array/
# (Newlines are allowed inside the array parentheses, so no backslash
# continuations are needed here. For wrapping other long commands see
# https://superuser.com/questions/237072/wrapping-long-bash-commands-in-script-files)
crawl_ids=(
  "CC-MAIN-2019-35" "CC-MAIN-2019-30" "CC-MAIN-2019-26"
  "CC-MAIN-2019-22" "CC-MAIN-2019-18" "CC-MAIN-2019-13"
  "CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51"
  "CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39"
)

for crawl_id in "${crawl_ids[@]}"; do
  echo "About to start off index and WARC download process for CRAWL ID: $crawl_id"

  # Quote the argument so the crawl id always reaches the helper as one word.
  ./src/script/get_maori_WET_records_for_crawl.sh "$crawl_id"

  # Capture the helper's status immediately; any later command overwrites $?.
  result=$?

  # Exit statuses are integers, so compare arithmetically rather than as text.
  if (( result != 0 )); then
    echo "Processing common-crawl $crawl_id failed with exit value: $result"
    echo "Will cease to process remaining cc crawls. Exitting..."
    # Propagate the helper's exit value to whoever called this script.
    exit "$result"
  fi
done

# Important note from
# https://www.tldp.org/LDP/abs/html/exit-status.html
# "When a script ends with an exit that has no parameter, the exit
# status of the script is the exit status of the last command executed
# in the script (previous to the exit)."
# This script has no explicit exit after the loop, so when every crawl
# succeeds its exit status is that of the last command executed above.