Changeset 33566

Show
Ignore:
Timestamp:
14.10.2019 22:07:45 (4 weeks ago)
Author:
ak19
Message:

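The batchcrawl.sh script now accepts a comma- or space-separated list of crawl folder ids, given as zero-padded five-digit numbers (printf format %05d, e.g. 00311), and/or ranges of such crawl folder ids (e.g. 00008-00022). The script is currently inactive: the call that prepares a crawl is commented out, so it only echoes what it would process. The next commit should support an "all" parameter to crawl all subfolders (and perhaps a range with no final termination value).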

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33564 r33566  
    2222    crawlId=$2 
    2323 
    24     echo "processing site $siteDir with crawlId: $crawlId" 
     24    echo "processing site $siteDir" 
     25 
     26    #echo "processing site $siteDir with crawlId: $crawlId" 
    2527 
    2628    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file" 
     
    7577} 
    7678 
    77  
    78 # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash 
    79 for siteDir in $sitesDir/*/; do 
    80  
    81     # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/ 
    82     # Remove the $sitesDir prefix of to_crawl/sites followed by /, 
    83     # Next remove the / suffix that remains 
    84     crawlId=${siteDir#"$sitesDir/"} 
    85     crawlId=${crawlId%/} 
     79function prepareForCrawl() { 
     80    siteDir=$1 
     81    crawlId=$2 
    8682 
    8783    echo "Processing crawlId: $crawlId" 
     
    9995    echo "--------------------------------------------------" 
    10096     
    101     break 
    102 done 
     97 
     98     
     99} 
     100 
     101function crawlAll() { 
     102 
     103    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash 
     104    for siteDir in $sitesDir/*/; do 
     105     
     106    # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/ 
     107    # Remove the $sitesDir prefix of to_crawl/sites followed by /, 
     108    # Next remove the / suffix that remains 
     109    crawlId=${siteDir#"$sitesDir/"} 
     110    crawlId=${crawlId%/} 
     111     
     112    prepareForCrawl $siteDir $crawlId 
     113    break 
     114    done 
     115} 
     116 
     117function crawlRange() { 
     118     
     119    startCrawlId=$1 
     120    endCrawlId=$2 
     121 
     122    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285 
     123 
     124    # sadly, the numeric value is in octal in both the following 
     125    #end=$(($endCrawlId+0))     
     126    #printf -v end '%d\n' $endCrawlId 2>/dev/null 
     127    # removes a single 0 prefix 
     128    #end=${endCrawlId##+(0)} 
     129    #${old##+(0)} 
     130    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007 
     131    start=${startCrawlId#"${startCrawlId%%[!0]*}"} 
     132    end=${endCrawlId#"${endCrawlId%%[!0]*}"} 
     133 
     134    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)" 
     135 
     136    # Generate a range of numbers between start and end 
     137    #https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026 
     138    for (( COUNTER=$start; $COUNTER<=$end; COUNTER=$COUNTER+1 )); do 
     139    # Now pad back with zeroes to get crawlId 
     140    # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string  
     141    crawlId=`printf %05d $COUNTER` 
     142    #echo $COUNTER - $crawlId 
     143    siteDir=$sitesDir/$crawlId 
     144    echo "siteDir $siteDir (crawlId $crawlId)" 
     145    done 
     146     
     147} 
     148 
     149function parseArgs() { 
     150    # for i in $*; do 
     151    #   echo "Pinky" 
     152    #   echo $i 
     153    # done 
     154 
     155    args="$1" 
     156    echo "Got arg string: $args" 
     157 
     158    # works - split args on comma 
     159    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash 
     160    IFS=', ' read -ra IDS <<< "$args" 
     161    for id in "${IDS[@]}"; do 
     162 
     163    if [[ $id == *"-"* ]]; then 
     164        echo "$id is a range" 
     165        startCrawlId=${id%%-*} 
     166        endCrawlId=${id##*-} 
     167        echo "crawlRange $startCrawlId $endCrawlId" 
     168        crawlRange $startCrawlId $endCrawlId 
     169    else 
     170        echo "$id is singleton" 
     171        crawlId=$id 
     172        siteDir=$sitesDir/$crawlId 
     173        echo "prepareForCrawl $siteDir $crawlId" 
     174        #prepareForCrawl $siteDir $crawlId       
     175    fi 
     176    done 
     177 
     178 
     179 
     180} 
     181 
     182#crawlRange "00010" "00015" 
     183#parseArgs "00008-00022,00025,00026,00028-00034" 
     184parseArgs "$*" 
     185# positional parameters beyond $9 need braces (e.g. ${10}) or shift, see 
     186# https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash 
     187#parseArgs horse cow dog cat