Changeset 33566 for gs3-extensions


Ignore:
Timestamp:
2019-10-14T22:07:45+13:00 (5 years ago)
Author:
ak19
Message:

batchcrawl.sh script now supports taking a comma or space separated list of crawl folder ids of the form 00000%d (e.g. 00311) and/or ranges of such crawl folder ids. Script is now inactive. Next commit should support the all parameter to crawl all subfolders (and perhaps a range with no final termination value).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33564 r33566  
    2222    crawlId=$2
    2323
    24     echo "processing site $siteDir with crawlId: $crawlId"
     24    echo "processing site $siteDir"
     25
     26    #echo "processing site $siteDir with crawlId: $crawlId"
    2527
    2628    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
     
    7577}
    7678
    77 
    78 # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    79 for siteDir in $sitesDir/*/; do
    80 
    81     # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
    82     # Remove the $sitesDir prefix of to_crawl/sites followed by /,
    83     # Next remove the / suffix that remains
    84     crawlId=${siteDir#"$sitesDir/"}
    85     crawlId=${crawlId%/}
     79function prepareForCrawl() {
     80    siteDir=$1
     81    crawlId=$2
    8682
    8783    echo "Processing crawlId: $crawlId"
     
    9995    echo "--------------------------------------------------"
    10096   
    101     break
    102 done
     97
     98   
     99}
     100
     101function crawlAll() {
     102
     103    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
     104    for siteDir in $sitesDir/*/; do
     105   
     106    # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
     107    # Remove the $sitesDir prefix of to_crawl/sites followed by /,
     108    # Next remove the / suffix that remains
     109    crawlId=${siteDir#"$sitesDir/"}
     110    crawlId=${crawlId%/}
     111   
     112    prepareForCrawl $siteDir $crawlId
     113    break
     114    done
     115}
     116
     117function crawlRange() {
     118   
     119    startCrawlId=$1
     120    endCrawlId=$2
     121
     122    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
     123
     124    # sadly, the numeric value is in octal in both the following
     125    #end=$(($endCrawlId+0))   
     126    #printf -v end '%d\n' $endCrawlId 2>/dev/null
     127    # removes a single 0 prefix
     128    #end=${endCrawlId##+(0)}
     129    #${old##+(0)}
     130    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
     131    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
     132    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
     133
     134    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"
     135
     136    # Generate a range of numbers between start and end
     137    #https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
     138    for (( COUNTER=$start; $COUNTER<=$end; COUNTER=$COUNTER+1 )); do
     139    # Now pad back with zeroes to get crawlId
     140    # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
     141    crawlId=`printf %05d $COUNTER`
     142    #echo $COUNTER - $crawlId
     143    siteDir=$sitesDir/$crawlId
     144    echo "siteDir $siteDir (crawlId $crawlId)"
     145    done
     146   
     147}
     148
     149function parseArgs() {
     150    # for i in $*; do
     151    #   echo "Pinky"
     152    #   echo $i
     153    # done
     154
     155    args="$1"
     156    echo "Got arg string: $args"
     157
     158    # works - split args on comma
     159    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
     160    IFS=', ' read -ra IDS <<< "$args"
     161    for id in "${IDS[@]}"; do
     162
     163    if [[ $id == *"-"* ]]; then
     164        echo "$id is a range"
     165        startCrawlId=${id%%-*}
     166        endCrawlId=${id##*-}
     167        echo "crawlRange $startCrawlId $endCrawlId"
     168        crawlRange $startCrawlId $endCrawlId
     169    else
     170        echo "$id is singleton"
     171        crawlId=$id
     172        siteDir=$sitesDir/$crawlId
     173        echo "prepareForCrawl $siteDir $crawlId"
     174        #prepareForCrawl $siteDir $crawlId     
     175    fi
     176    done
     177
     178
     179
     180}
     181
     182#crawlRange "00010" "00015"
     183#parseArgs "00008-00022,00025,00026,00028-00034"
     184parseArgs "$*"
     185# can only have 9 args without using shift, see
     186# https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
     187#parseArgs horse cow dog cat
Note: See TracChangeset for help on using the changeset viewer.