#!/bin/bash

# Crawl every site listed under $sitesDir with Apache Nutch 2.3.1 and
# collect the dumped pages, stats and logs under $outputDir.

# Input: one sub-folder per site, each containing seedURLs.txt and a
# per-site regex-urlfilter.txt fragment.
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output: one sub-folder per crawlId holding dump.txt, stats and log.out.
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"

# Nutch installation layout (local runtime).
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of fetch/parse rounds per site; if a site is not fully crawled
# within this many iterations an UNFINISHED marker file is produced.
CRAWL_ITERATIONS=10
# Crawl one site from scratch with Nutch and dump the results.
#   $1 - siteDir: input folder (with trailing /) holding seedURLs.txt and
#        a regex-urlfilter.txt fragment
#   $2 - crawlId: id used for the Nutch crawl and the output sub-folder
# Side effects: rewrites $NUTCH_URLFILTER_FILE, writes $outputDir/$crawlId/,
# moves log.out (and optional UNFINISHED marker) there, and appends the
# crawlId to $outputDir/siteIDsCrawled.txt.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    # $siteDir parameter is the folder containing seedURLs.txt
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out.
    # Logging to terminal and log file simultaneously:
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee "${siteDir}log.out"
    # append to log.out file hereafter
    echo " $crawl_cmd" 2>&1 | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" 2>&1 | tee -a "${siteDir}log.out"

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    # BUGFIX: plain $? after a pipeline reports tee's status (almost always
    # 0), which made the failure branch below unreachable. PIPESTATUS[0]
    # holds the crawl command's own exit code.
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create file UNFINISHED to indicate a more thorough crawl needed.
        if ! tail -10 "${siteDir}log.out" | grep -q "no more URLs to fetch now"; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee "${siteDir}UNFINISHED"
            echo "" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo " $crawl_cmd" 2>&1 | tee -a "${siteDir}UNFINISHED"
        fi

        # $outputDir/$crawlId should not yet exist; readdb -dump creates it.
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId"/part-r-* > "$outputDir/$crawlId/dump.txt"
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a "${siteDir}log.out"
    fi

    # move the peripheral crawl products (the log.out and optional UNFINISHED file)
    # from the input to the output folder. This way we can re-run the crawl and
    # these files will still have been preserved as long as the output folder
    # isn't deleted
    mv "${siteDir}log.out" "$outputDir/$crawlId/log.out"
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv "${siteDir}UNFINISHED" "$outputDir/$crawlId/UNFINISHED"
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo "$crawlId" >> "$outputDir/siteIDsCrawled.txt"
}
# Crawl one site unless its output folder already exists.
#   $1 - siteDir: input folder for the site (with trailing /)
#   $2 - crawlId: id for this crawl
# Skips (and logs the skip) when $outputDir/$crawlId is already present.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" 2>&1 | tee -a "${siteDir}log.out"
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a "${siteDir}log.out"
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a "${siteDir}log.out"
    else
        # Quote the args so a siteDir containing spaces stays one argument.
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"
}
# Crawl every site folder found under $sitesDir, deriving each crawlId
# from the folder name.
# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
function crawlAll() {
    for siteDir in "$sitesDir"/*/; do
        # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
        # Remove the $sitesDir prefix of to_crawl/sites followed by /,
        # Next remove the / suffix that remains
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Crawl every site whose zero-padded id lies in the inclusive range
#   $1 - startCrawlId (e.g. 00008)
#   $2 - endCrawlId   (e.g. 00022)
function crawlRange() {
    startCrawlId=$1
    endCrawlId=$2

    # Interpret the zero-padded ids as decimal numbers. The 10# base prefix
    # stops bash from reading a leading 0 as octal, and unlike the previous
    # leading-zero-stripping trick it also handles an all-zero id like
    # 00000 (which stripped down to the empty string) correctly.
    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    start=$((10#$startCrawlId))
    end=$((10#$endCrawlId))

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Visit every id in [start, end].
    for (( COUNTER=start; COUNTER<=end; COUNTER++ )); do
        # Pad back with zeroes to rebuild the 5-digit crawlId.
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf %05d "$COUNTER")
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Print usage help for this script on stdout.
function printUsage() {
    cat <<EOF
Run as:
 $0 -all|<ids>
 where an id is a folder name in to_crawl/sites
 and ids can be a comma or space separated list of
 individual ids and/or ranges
 Examples:
 $0 00008-00022,00025,00026,00028-00034 00050
 $0 -all
 $0 00312
EOF
}
# Parse the id-list argument string and dispatch each entry.
#   $1 - a comma- or space-separated list of: "-all", single ids
#        (e.g. 00312), and/or ranges (e.g. 00008-00022)
# With no argument, prints usage and exits 0.
function parseArgs() {
    args="$1"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split args on comma or space:
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        if [ "x$id" = "x-all" ]; then
            echo "crawlAll"
            crawlAll
        elif [[ $id == *"-"* ]]; then
            # An id containing '-' is a range like 00008-00022.
            # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
            echo "$id is a range"
            startCrawlId=${id%%-*}
            endCrawlId=${id##*-}
            echo "crawlRange $startCrawlId $endCrawlId"
            crawlRange "$startCrawlId" "$endCrawlId"
        else
            echo "$id is singleton"
            crawlId=$id
            # $siteDir needs / at end to work with existing code
            siteDir=$sitesDir/$crawlId/
            echo "prepareForCrawl $siteDir $crawlId"
            prepareForCrawl "$siteDir" "$crawlId"
        fi
    done
}
# Entry point.
# Passing as string instead of individual arguments
# Because one can only have 9 individual args without using shift, see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
# NOTE(review): "$*" joins all CLI args with single spaces, and parseArgs
# re-splits on comma/space, so ids may be passed separated either way.

parseArgs "$*"

# old. testing
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"