#!/bin/bash

# Crawl configuration: input sites live under $sitesDir, one folder per
# site id; crawl products are written under $outputDir, one folder per id.
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"

# Nutch installation layout (local runtime).
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Pristine template; copied over the live url-filter file before each crawl,
# then the per-site filter fragment is appended to the live file.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse/update rounds Nutch runs per site.
CRAWL_ITERATIONS=10
# Crawl one site with Nutch and collect the crawl products.
#   $1 - siteDir: input folder (with trailing /) holding seedURLs.txt and a
#        per-site regex-urlfilter.txt fragment
#   $2 - crawlId: id used for both the Nutch crawl and the output folder
# Side effects: rewrites the live Nutch url-filter file, creates
# $outputDir/$crawlId/, appends $crawlId to $outputDir/siteIDsCrawled.txt,
# and moves log.out (and optional UNFINISHED marker) into the output folder.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    # $siteDir parameter is the folder containing seedURLs.txt.
    # NOTE: deliberately expanded unquoted below so the string splits into
    # command + arguments.
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee "${siteDir}log.out"
    # append to log.out file hereafter
    echo " $crawl_cmd" | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" | tee -a "${siteDir}log.out"

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    # BUGFIX: "$?" after a pipeline is the LAST stage's (tee's) status, so a
    # failed crawl previously looked successful. PIPESTATUS[0] is the exit
    # status of the first stage (the crawl itself).
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create file UNFINISHED to indicate a more thorough crawl needed
        tail -10 "${siteDir}log.out" | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" | tee "${siteDir}UNFINISHED"
            echo "" | tee -a "${siteDir}UNFINISHED"
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" | tee -a "${siteDir}UNFINISHED"
            echo "1. delete $outputDir/$crawlId" | tee -a "${siteDir}UNFINISHED"
            echo "2. copy the regex-urlfilter file:" | tee -a "${siteDir}UNFINISHED"
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" | tee -a "${siteDir}UNFINISHED"
            # BUGFIX: echo without -e printed a literal "\n"; printf expands it.
            printf '3. Adjust # crawl iterations in old crawl command:\n%s\n' "$crawl_cmd" | tee -a "${siteDir}UNFINISHED"
        fi

        # $outputDir/$crawlId should not yet exist; readdb -dump creates it
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId/part-r-00000" > "$outputDir/$crawlId/dump.txt"
    else
        # appending to log.out
        echo "CRAWL FAILED." | tee -a "${siteDir}log.out"
    fi

    # move the peripheral crawl products (the log.out and optional UNFINISHED
    # file) from the input to the output folder. This way we can re-run the
    # crawl and these files will still have been preserved as long as the
    # output folder isn't deleted
    mv "${siteDir}log.out" "$outputDir/$crawlId/log.out"
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv "${siteDir}UNFINISHED" "$outputDir/$crawlId/UNFINISHED"
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo "$crawlId" >> "$outputDir/siteIDsCrawled.txt"
}
# Decide whether a site still needs crawling.
#   $1 - siteDir: input folder (with trailing /)
#   $2 - crawlId: id whose output folder marks the site as done
# Skips (and logs the skip) if $outputDir/$crawlId already exists,
# otherwise delegates to crawlSite.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" | tee -a "${siteDir}log.out"
        echo "**** $siteDir already processed. Skipping...." | tee -a "${siteDir}log.out"
        echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a "${siteDir}log.out"
    else
        # Quoted so siteDirs containing spaces are passed through intact.
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"
}
# Crawl every site folder found under $sitesDir.
function crawlAll() {
    for siteDir in "$sitesDir"/*/; do
        # ROBUSTNESS: with no match the glob stays a literal string;
        # skip anything that is not actually a directory.
        [ -d "$siteDir" ] || continue

        # Derive crawlId like 00001 from siteDir like to_crawl/sites/00001/:
        # remove the "$sitesDir/" prefix, then the trailing "/".
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Crawl every site whose zero-padded numeric id lies in an inclusive range.
#   $1 - startCrawlId: e.g. 00008
#   $2 - endCrawlId:   e.g. 00022
function crawlRange() {
    startCrawlId=$1
    endCrawlId=$2

    # Strip leading zeroes so bash arithmetic doesn't parse the ids as octal.
    # ${var%%[!0]*} is the run of leading zeroes; removing it as a prefix
    # leaves the significant digits.
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
    # BUGFIX: an all-zero id (e.g. 00000) strips to the empty string,
    # which breaks the arithmetic loop below; treat it as 0.
    start=${start:-0}
    end=${end:-0}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    for (( COUNTER=start; COUNTER<=end; COUNTER++ )); do
        # Pad back to 5 digits to recover the folder-style crawlId.
        crawlId=$(printf %05d "$COUNTER")
        # siteDir needs the trailing / to work with the existing code.
        siteDir=$sitesDir/$crawlId/
        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Print command-line help for this script to stdout.
function printUsage() {
    # Heredoc instead of an echo chain; $0 and spacing expand identically.
    cat <<EOF
Run as:
 $0 -all|<ids>
 where an id is a folder name in to_crawl/sites
 and ids can be a comma or space separated list of
 individual ids and/or ranges
 Examples:
 $0 00008-00022,00025,00026,00028-00034 00050
 $0 -all
 $0 00312
EOF
}
# Parse the single argument string handed in by the caller. It may hold
# "-all", individual ids (00025), and/or ranges (00008-00022), separated
# by commas or spaces. Dispatches each token to the matching crawl helper.
function parseArgs() {
    args="$1"

    # No arguments at all: show help and stop.
    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split the argument string on commas or spaces.
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        case "$id" in
            -all)
                echo "crawlAll"
                crawlAll
                ;;
            *-*)
                # Token contains a dash: treat as an inclusive id range.
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                # siteDir needs / at end to work with existing code
                siteDir=$sitesDir/$crawlId/
                echo "prepareForCrawl $siteDir $crawlId"
                prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done
}
# Passing as string instead of individual arguments
# Because one can only have 9 individual args without using shift, see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
# NOTE: "$*" joins all CLI arguments into one space-separated string;
# parseArgs re-splits that string on commas and spaces.
parseArgs "$*"

# old. testing
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"
---|