Changeset 33567

Show
Ignore:
Timestamp:
14.10.2019 22:40:22 (4 weeks ago)
Author:
ak19
Message:

batchcrawl.sh now supports the -all flag (and prints usage when run with 0 args). The script is active again, with some minor testing done. Site 00001 (a page off http://00.gs) downloads successfully again, this time as a single site argument to the script. Commented out the break statement in the crawlAll() function.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33566 r33567  
    8181    crawlId=$2 
    8282 
    83     echo "Processing crawlId: $crawlId" 
     83    echo "Processing siteDir $siteDir with crawlId: $crawlId" 
    8484     
    8585    if [ -d "$outputDir/$crawlId" ]; then 
     
    111111     
    112112    prepareForCrawl $siteDir $crawlId 
    113     break 
     113    #break 
    114114    done 
    115115} 
     
    141141    crawlId=`printf %05d $COUNTER` 
    142142    #echo $COUNTER - $crawlId 
    143     siteDir=$sitesDir/$crawlId 
    144     echo "siteDir $siteDir (crawlId $crawlId)" 
     143    # $siteDir needs / at end to work with existing code 
     144    siteDir=$sitesDir/$crawlId/ 
     145    #echo "siteDir $siteDir (crawlId $crawlId)" 
     146    prepareForCrawl $siteDir $crawlId 
    145147    done 
    146148     
    147149} 
    148150 
     151function printUsage() { 
     152    echo "Run as:" 
     153    echo "  $0 -all|<ids>" 
     154    echo "    where an id is a folder name in to_crawl/sites" 
     155    echo "    and ids can be a comma separated list of" 
     156    echo "    individual ids and/or ranges" 
     157    echo "  Examples:" 
     158    echo "    $0 00008-00022,00025,00026,00028-00034" 
     159    echo "    $0 -all" 
     160    echo "    $0 00312" 
     161     
     162} 
     163 
    149164function parseArgs() { 
     165     
    150166    # for i in $*; do 
    151167    #   echo "Pinky" 
     
    154170 
    155171    args="$1" 
    156     echo "Got arg string: $args" 
    157  
     172    #echo "Got arg string: $args" 
     173 
     174    if [ "x$args" = "x" ]; then  
     175    printUsage 
     176    exit 0 
     177    fi 
     178     
    158179    # works - split args on comma 
    159180    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash 
    160181    IFS=', ' read -ra IDS <<< "$args" 
    161182    for id in "${IDS[@]}"; do 
    162  
    163     if [[ $id == *"-"* ]]; then 
     183    echo "id: |$id|" 
     184    if [ "x$id" = "x-all" ]; then 
     185        echo "crawlAll" 
     186        crawlAll 
     187    elif [[ $id == *"-"* ]]; then 
     188        # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash 
    164189        echo "$id is a range" 
    165190        startCrawlId=${id%%-*} 
    166191        endCrawlId=${id##*-} 
    167192        echo "crawlRange $startCrawlId $endCrawlId" 
    168         crawlRange $startCrawlId $endCrawlId 
     193        crawlRange $startCrawlId $endCrawlId     
     194         
    169195    else 
    170196        echo "$id is singleton" 
    171197        crawlId=$id 
    172         siteDir=$sitesDir/$crawlId 
     198        # $siteDir needs / at end to work with existing code 
     199        siteDir=$sitesDir/$crawlId/ 
    173200        echo "prepareForCrawl $siteDir $crawlId" 
    174         #prepareForCrawl $siteDir $crawlId       
     201        prepareForCrawl $siteDir $crawlId        
    175202    fi 
    176203    done 
    177  
    178  
    179  
    180 } 
    181  
     204} 
     205 
     206 
     207 
     208# Passing as string instead of individual arguments 
     209# Because one can only have 9 individual args without using shift, see 
     210# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html 
     211 
     212parseArgs "$*" 
     213 
     214# old. testing 
    182215#crawlRange "00010" "00015" 
    183216#parseArgs "00008-00022,00025,00026,00028-00034" 
    184 parseArgs "$*" 
    185 # can only have 9 args without using shift, see 
    186 # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash 
    187 #parseArgs horse cow dog cat