Changeset 33566 for gs3-extensions/maori-lang-detection/hdfs-cc-work
- Timestamp:
- 2019-10-14T22:07:45+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh
r33564 r33566 22 22 crawlId=$2 23 23 24 echo "processing site $siteDir with crawlId: $crawlId" 24 echo "processing site $siteDir" 25 26 #echo "processing site $siteDir with crawlId: $crawlId" 25 27 26 28 echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file" … … 75 77 } 76 78 77 78 # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash 79 for siteDir in $sitesDir/*/; do 80 81 # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/ 82 # Remove the $sitesDir prefix of to_crawl/sites followed by /, 83 # Next remove the / suffix that remains 84 crawlId=${siteDir#"$sitesDir/"} 85 crawlId=${crawlId%/} 79 function prepareForCrawl() { 80 siteDir=$1 81 crawlId=$2 86 82 87 83 echo "Processing crawlId: $crawlId" … … 99 95 echo "--------------------------------------------------" 100 96 101 break 102 done 97 98 99 } 100 101 function crawlAll() { 102 103 # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash 104 for siteDir in $sitesDir/*/; do 105 106 # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/ 107 # Remove the $sitesDir prefix of to_crawl/sites followed by /, 108 # Next remove the / suffix that remains 109 crawlId=${siteDir#"$sitesDir/"} 110 crawlId=${crawlId%/} 111 112 prepareForCrawl $siteDir $crawlId 113 break 114 done 115 } 116 117 function crawlRange() { 118 119 startCrawlId=$1 120 endCrawlId=$2 121 122 # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285 123 124 # sadly, the numeric value is in octal in both the following 125 #end=$(($endCrawlId+0)) 126 #printf -v end '%d\n' $endCrawlId 2>/dev/null 127 # removes a single 0 prefix 128 #end=${endCrawlId##+(0)} 129 #${old##+(0)} 130 # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007 131 start=${startCrawlId#"${startCrawlId%%[!0]*}"} 132 end=${endCrawlId#"${endCrawlId%%[!0]*}"} 133 134 echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)" 135 136 # Generate a range of numbers between start and end 137 #https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026 138 for (( COUNTER=$start; $COUNTER<=$end; COUNTER=$COUNTER+1 )); do 139 # Now pad back with zeroes to get crawlId 140 # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string 141 crawlId=`printf %05d $COUNTER` 142 #echo $COUNTER - $crawlId 143 siteDir=$sitesDir/$crawlId 144 echo "siteDir $siteDir (crawlId $crawlId)" 145 done 146 147 } 148 149 function parseArgs() { 150 # for i in $*; do 151 # echo "Pinky" 152 # echo $i 153 # done 154 155 args="$1" 156 echo "Got arg string: $args" 157 158 # works - split args on comma 159 # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash 160 IFS=', ' read -ra IDS <<< "$args" 161 for id in "${IDS[@]}"; do 162 163 if [[ $id == *"-"* ]]; then 164 echo "$id is a range" 165 startCrawlId=${id%%-*} 166 endCrawlId=${id##*-} 167 echo "crawlRange $startCrawlId $endCrawlId" 168 crawlRange $startCrawlId $endCrawlId 169 else 170 echo "$id is singleton" 171 crawlId=$id 172 siteDir=$sitesDir/$crawlId 173 echo "prepareForCrawl $siteDir $crawlId" 174 #prepareForCrawl $siteDir $crawlId 175 fi 176 done 177 178 179 180 } 181 182 #crawlRange "00010" "00015" 183 #parseArgs "00008-00022,00025,00026,00028-00034" 184 parseArgs "$*" 185 # can only have 9 args without using shift, see 186 # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash 187 #parseArgs horse cow dog cat
Note:
See TracChangeset
for help on using the changeset viewer.