root/gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/get_Maori_WET_records_from_CCSep2018_on.sh @ 33534

Revision 33534, 1.4 KB (checked in by ak19, 7 weeks ago)

Correction: the toplevel script has to be placed inside cc-index-table, not in its parent directory outside of it.

  • Property svn:executable set to *
Line 
#!/bin/bash

# Driver script: runs get_maori_WET_records_for_crawl.sh once for each
# CommonCrawl crawl id listed below, in order, and stops at the first
# crawl whose processing fails.
#
# This script should be put into cc-index-table (github project) toplevel folder
# and run from that folder, because the per-crawl script is invoked through
# a path relative to the current working directory.
# It works with the get_maori_WET_records_for_crawl.sh script, which has to be
# placed in cc-index-table/src/script
#
# Exit status: that of the last command executed when every crawl succeeds
# (see the note at the end of this file); otherwise the non-zero exit
# value of the first per-crawl run that failed.

# crawl_ids are from http://index.commoncrawl.org/
# We only want the crawl_ids from Sep 2018 and onwards as that's when
# the content_languages field was included in CommonCrawl's columnar index

# Looping over a bash array: https://www.cyberciti.biz/faq/bash-for-loop-array/
# (else chain commands as at https://superuser.com/questions/237072/wrapping-long-bash-commands-in-script-files)
# Newlines are allowed inside an array literal, so no backslash
# continuations are needed here.
crawl_ids=(
  "CC-MAIN-2019-35" "CC-MAIN-2019-30" "CC-MAIN-2019-26"
  "CC-MAIN-2019-22" "CC-MAIN-2019-18" "CC-MAIN-2019-13"
  "CC-MAIN-2019-09" "CC-MAIN-2019-04" "CC-MAIN-2018-51"
  "CC-MAIN-2018-47" "CC-MAIN-2018-43" "CC-MAIN-2018-39"
)

for crawl_id in "${crawl_ids[@]}"; do
  echo "About to start off index and WARC download process for CRAWL ID: $crawl_id"

  # Quote the argument so the crawl id always reaches the helper as
  # exactly one word.
  ./src/script/get_maori_WET_records_for_crawl.sh "$crawl_id"
  result=$?

  # Compare the exit status numerically, and report failures on stderr so
  # they are not mixed into the normal progress output.
  if (( result != 0 )); then
    echo "Processing common-crawl $crawl_id failed with exit value: $result" >&2
    echo "Will cease to process remaining cc crawls. Exitting..." >&2
    exit "$result"
  fi
done

# Important note from
# https://www.tldp.org/LDP/abs/html/exit-status.html
# "When a script ends with an exit that has no parameter, the exit
# status of the script is the exit status of the last command executed
# in the script (previous to the exit)."
Note: See TracBrowser for help on using the browser.