source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33573

Last change on this file since 33573 was 33573, checked in by ak19, 5 years ago

Forgot to document that spaces are also allowed as a separator in the list of crawl-site ids passed as input to the batchcrawl.sh script

  • Property svn:executable set to *
File size: 7.6 KB
Line 
#!/bin/bash

# batchcrawl.sh — drive Nutch crawls over per-site folders.
# Input:  $sitesDir/<crawlId>/ folders, each containing seedURLs.txt and a
#         regex-urlfilter.txt fragment for that site.
# Output: $outputDir/<crawlId>/ with the crawl dump, stats and log.out.

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"


# Locations inside the local (non-distributed) Nutch 2.3.1 installation.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Pristine template; a per-site live copy is regenerated for every crawl.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse/update rounds attempted per site.
CRAWL_ITERATIONS=10
# Crawl a single site with Nutch and dump the crawl results.
# Arguments:
#   $1 - siteDir: folder holding seedURLs.txt and regex-urlfilter.txt;
#        must end in "/" since it is used as a bare path prefix below.
#   $2 - crawlId: id for this crawl (also the output folder name), e.g. 00001
# Side effects: rewrites the live Nutch url-filter file, writes
#   ${siteDir}log.out (and UNFINISHED if more iterations are needed), then
#   moves both into $outputDir/$crawlId and appends to siteIDsCrawled.txt.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    # Start from the pristine template so filters from a previous site
    # don't leak into this crawl.
    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create log.out file
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
    # append to log.out file hereafter
    echo " $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    # BUGFIX: after a pipeline, $? is the exit status of the LAST command
    # (tee, which almost always succeeds), so crawl failures went
    # undetected. PIPESTATUS[0] is the crawl command's own exit status.
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
        # If not, create file UNFINISHED to indicate a more thorough crawl needed
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
            echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
            # BUGFIX: plain echo does not interpret "\n", so the old command
            # was written with a literal backslash-n; emit it on its own line.
            echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
        fi

        # outputDir/$crawlId should not yet exist
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    fi


    # move the peripheral crawl products (the log.out and optional UNFINISHED file)
    # from the input to the output folder. This way we can re-run the crawl and
    # these files will still have been preserved as long as the output folder
    # isn't deleted
    mv ${siteDir}log.out $outputDir/$crawlId/log.out
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo $crawlId >> $outputDir/siteIDsCrawled.txt
}
90
# Crawl one site unless its output folder already exists.
# Arguments:
#   $1 - siteDir: site input folder (trailing "/" expected)
#   $2 - crawlId: id for this crawl / name of its output folder
# A pre-existing $outputDir/$crawlId marks the site as done; we then only
# append a skip notice to its log.out instead of re-crawling.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ ! -d "$outputDir/$crawlId" ]; then
        # Not yet crawled: do the real work.
        crawlSite $siteDir $crawlId
    else
        # Already processed earlier; *append* the notice to log.out.
        echo "" 2>&1 | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out
    fi

    echo "--------------------------------------------------"
}
112
# Crawl every site folder found directly under $sitesDir.
function crawlAll() {

    # Iterate over each subdirectory; the trailing "/" in the glob limits
    # matches to directories and keeps the "/" suffix downstream code wants.
    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteFolder in $sitesDir/*/; do

        # Derive the crawl id (e.g. 00001) from a path like
        # to_crawl/sites/00001/: drop the "$sitesDir/" prefix, then the
        # trailing "/".
        id=${siteFolder#"$sitesDir/"}
        id=${id%/}

        prepareForCrawl $siteFolder $id
        #break
    done
}
128
# Crawl an inclusive range of zero-padded crawl ids.
# Arguments:
#   $1 - startCrawlId: first id, e.g. 00010
#   $2 - endCrawlId:   last id, e.g. 00015
function crawlRange() {

    startCrawlId=$1
    endCrawlId=$2

    # Strip leading zeros so the ids can be used as decimal loop bounds
    # (a bare $((...)) would treat 00010 as octal).
    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Walk the numeric range, re-padding each number back to a 5-digit id.
    # https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
    # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
    for (( n = start; n <= end; n++ )); do
        crawlId=$(printf %05d $n)
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        prepareForCrawl $siteDir $crawlId
    done

}
162
# Print this script's usage/help text to stdout.
function printUsage() {
    cat <<EOF
Run as:
 $0 -all|<ids>
 where an id is a folder name in to_crawl/sites
 and ids can be a comma or space separated list of
 individual ids and/or ranges
 Examples:
 $0 00008-00022,00025,00026,00028-00034 00050
 $0 -all
 $0 00312
EOF
}
175
# Parse the single argument string and dispatch each id token.
# Arguments:
#   $1 - all crawl-site ids as ONE string: "-all", or a comma/space
#        separated mix of single ids (00312) and ranges (00008-00022).
# With no argument, prints usage and exits 0.
function parseArgs() {

    args="$1"
    #echo "Got arg string: $args"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split the argument string on commas or spaces.
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        # Dispatch: the -all case must be tested before the range pattern,
        # since "-all" itself contains a dash.
        case $id in
            -all)
                echo "crawlAll"
                crawlAll
                ;;
            *-*)
                # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                # $siteDir needs / at end to work with existing code
                siteDir=$sitesDir/$crawlId/
                echo "prepareForCrawl $siteDir $crawlId"
                prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done
}
217
218
219
220# Passing as string instead of individual arguments
221# Because one can only have 9 individual args without using shift, see
222# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
223
224parseArgs "$*"
225
226# old. testing
227#crawlRange "00010" "00015"
228#parseArgs "00008-00022,00025,00026,00028-00034"
Note: See TracBrowser for help on using the repository browser.