Changeset 33567


Ignore:
Timestamp:
2019-10-14T22:40:22+13:00 (5 years ago)
Author:
ak19
Message:

batchcrawl.sh now supports the -all flag (and prints usage when run with 0 args). The script is active again, with some minor testing done. Site 00001 (a page off http://00.gs) downloads successfully again, this time passed as a single site argument to the script. Commented out the break statement in the crawlAll() function.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33566 r33567  
    8181    crawlId=$2
    8282
    83     echo "Processing crawlId: $crawlId"
     83    echo "Processing siteDir $siteDir with crawlId: $crawlId"
    8484   
    8585    if [ -d "$outputDir/$crawlId" ]; then
     
    111111   
    112112    prepareForCrawl $siteDir $crawlId
    113     break
     113    #break
    114114    done
    115115}
     
    141141    crawlId=`printf %05d $COUNTER`
    142142    #echo $COUNTER - $crawlId
    143     siteDir=$sitesDir/$crawlId
    144     echo "siteDir $siteDir (crawlId $crawlId)"
     143    # $siteDir needs / at end to work with existing code
     144    siteDir=$sitesDir/$crawlId/
     145    #echo "siteDir $siteDir (crawlId $crawlId)"
     146    prepareForCrawl $siteDir $crawlId
    145147    done
    146148   
    147149}
    148150
     151function printUsage() {
     152    echo "Run as:"
     153    echo "  $0 -all|<ids>"
     154    echo "    where an id is a folder name in to_crawl/sites"
     155    echo "    and ids can be a comma separated list of"
     156    echo "    individual ids and/or ranges"
     157    echo "  Examples:"
     158    echo "    $0 00008-00022,00025,00026,00028-00034"
     159    echo "    $0 -all"
     160    echo "    $0 00312"
     161   
     162}
     163
    149164function parseArgs() {
     165   
    150166    # for i in $*; do
    151167    #   echo "Pinky"
     
    154170
    155171    args="$1"
    156     echo "Got arg string: $args"
    157 
     172    #echo "Got arg string: $args"
     173
     174    if [ "x$args" = "x" ]; then
     175    printUsage
     176    exit 0
     177    fi
     178   
    158179    # works - split args on comma
    159180    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    160181    IFS=', ' read -ra IDS <<< "$args"
    161182    for id in "${IDS[@]}"; do
    162 
    163     if [[ $id == *"-"* ]]; then
     183    echo "id: |$id|"
     184    if [ "x$id" = "x-all" ]; then
     185        echo "crawlAll"
     186        crawlAll
     187    elif [[ $id == *"-"* ]]; then
     188        # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
    164189        echo "$id is a range"
    165190        startCrawlId=${id%%-*}
    166191        endCrawlId=${id##*-}
    167192        echo "crawlRange $startCrawlId $endCrawlId"
    168         crawlRange $startCrawlId $endCrawlId
     193        crawlRange $startCrawlId $endCrawlId   
     194       
    169195    else
    170196        echo "$id is singleton"
    171197        crawlId=$id
    172         siteDir=$sitesDir/$crawlId
     198        # $siteDir needs / at end to work with existing code
     199        siteDir=$sitesDir/$crawlId/
    173200        echo "prepareForCrawl $siteDir $crawlId"
    174         #prepareForCrawl $siteDir $crawlId     
     201        prepareForCrawl $siteDir $crawlId       
    175202    fi
    176203    done
    177 
    178 
    179 
    180 }
    181 
     204}
     205
     206
     207
     208# Passing as string instead of individual arguments
     209# Because one can only have 9 individual args without using shift, see
     210# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
     211
     212parseArgs "$*"
     213
     214# old. testing
    182215#crawlRange "00010" "00015"
    183216#parseArgs "00008-00022,00025,00026,00028-00034"
    184 parseArgs "$*"
    185 # can only have 9 args without using shift, see
    186 # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
    187 #parseArgs horse cow dog cat
Note: See TracChangeset for help on using the changeset viewer.