#!/bin/bash

# Crawl each site listed under $sitesDir with Apache Nutch and collect
# the results (dumps, stats) under $outputDir.

# Input: one sub-directory per site (seed URLs, per-site regex-urlfilter).
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output: crawl dumps and stats are collected here, one dir per crawlId.
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"

# Apache Nutch installation layout.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Pristine template; the live url-filter file is regenerated from it per site.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse/update rounds Nutch runs per site.
CRAWL_ITERATIONS=10
# crawlSite <siteDir> <crawlId>
#   Crawl a single site: rebuild the live regex-urlfilter file from the
#   template plus the site's own filter rules, run the Nutch crawl, and dump
#   the results into $outputDir/<crawlId>.  All output is mirrored to
#   <siteDir>log.out.  If the crawl did not finish within $CRAWL_ITERATIONS
#   rounds, a file <siteDir>UNFINISHED is written with re-run instructions.
#   NOTE: <siteDir> is expected to end with a trailing "/" (it comes from a
#   glob like $sitesDir/*/), since file names are appended directly to it.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out ...
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee "${siteDir}log.out"
    # ... and append to log.out hereafter.
    echo " $crawl_cmd" | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" | tee -a "${siteDir}log.out"

    # Run the crawl, mirroring stdout+stderr of $crawl_cmd into log.out.
    # BUG FIX: $? after a pipeline reports tee's status, not the crawl's;
    # PIPESTATUS[0] holds the exit status of the first pipeline stage.
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # Nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create file UNFINISHED to indicate a more thorough crawl is needed.
        if ! tail -10 "${siteDir}log.out" | grep -q "no more URLs to fetch now"; then
            {
                echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId"
                echo ""
                echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:"
                echo "1. delete $outputDir/$crawlId"
                echo "2. copy the regex-urlfilter file:"
                echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE"
                # BUG FIX: plain echo does not expand "\n"; print a real newline.
                printf '3. Adjust # crawl iterations in old crawl command:\n%s\n' "$crawl_cmd"
            } | tee "${siteDir}UNFINISHED"
        fi

        # $outputDir/$crawlId should not yet exist; readdb -dump creates it.
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId/part-r-00000" > "$outputDir/$crawlId/dump.txt"
    else
        # Crawl failed; append the failure marker to log.out.
        echo "CRAWL FAILED." | tee -a "${siteDir}log.out"
    fi
}
# prepareForCrawl <siteDir> <crawlId>
#   Crawl the site unless $outputDir/<crawlId> already exists, in which case
#   the site is skipped (delete that directory to reprocess it).  Skip
#   messages are appended to <siteDir>log.out as well as shown on stdout.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out.
        echo "" | tee -a "${siteDir}log.out"
        echo "**** $siteDir already processed. Skipping...." | tee -a "${siteDir}log.out"
        echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a "${siteDir}log.out"
    else
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"
}
# crawlAll
#   Visit every site directory beneath $sitesDir and hand it to
#   prepareForCrawl with its derived crawlId.
#   NOTE(review): the "break" below stops after the FIRST site directory —
#   this looks like a debugging leftover; confirm before removing it.
function crawlAll() {
    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in "$sitesDir"/*/; do
        # Derive a crawlId such as 00001 from a $siteDir like
        # to_crawl/sites/00001/: first drop the "$sitesDir/" prefix,
        # then drop the remaining trailing "/".
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl "$siteDir" "$crawlId"
        break
    done
}
# crawlRange <startCrawlId> <endCrawlId>
#   Enumerate the crawlIds in the inclusive range startCrawlId..endCrawlId
#   (zero-padded 5-digit ids, e.g. 00010..00015) and report the site
#   directory for each.
#   NOTE(review): this currently only echoes each siteDir/crawlId pair; the
#   actual per-site crawl call appears not to be wired in yet.
function crawlRange() {
    startCrawlId=$1
    endCrawlId=$2

    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    # Strip leading zeros so bash arithmetic does not treat the ids as octal:
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
    # BUG FIX: an all-zero id (e.g. "00000") strips down to an empty string,
    # which breaks the arithmetic below; treat empty as 0.
    start=${start:-0}
    end=${end:-0}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Generate the range of numbers between start and end, inclusive.
    for (( COUNTER = start; COUNTER <= end; COUNTER++ )); do
        # Pad back with zeros to rebuild the 5-digit crawlId.
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf '%05d' "$COUNTER")
        siteDir=$sitesDir/$crawlId
        echo "siteDir $siteDir (crawlId $crawlId)"
    done
}
# parseArgs <arg-string>
#   Parse a comma-separated list of crawl ids and id ranges, e.g.
#   "00008-00022,00025,00026,00028-00034".  Each range is handed to
#   crawlRange; each single id is currently only echoed (the
#   prepareForCrawl call is commented out).
function parseArgs() {
    args="$1"
    echo "Got arg string: $args"

    # Split $args on commas (and spaces):
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"

    for id in "${IDS[@]}"; do
        case "$id" in
            *-*)
                # An id containing "-" is a range such as 00008-00022.
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                siteDir=$sitesDir/$crawlId
                echo "prepareForCrawl $siteDir $crawlId"
                #prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done
}
# Entry point: forward all command-line arguments, joined into one string,
# to parseArgs (e.g.  ./script.sh 00008-00022,00025).
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"
parseArgs "$*"
# Only 9 positional args are addressable without shift, see
# https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
#parseArgs horse cow dog cat