source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33569

Last change on this file since 33569 was 33569, checked in by ak19, 5 years ago
  1. batchcrawl.sh now does what it should have from the start, which is to move the log.out and UNFINISHED files into the output folder instead of leaving them in the input folder, since the input to_crawl folder can and does get replaced every time I regenerate it after black/white/greylisting more URLs.
  2. Blacklisted more adult sites; greylisted more product sites and .ru, .pl and .tk domains, with whitelisting in the whitelist file.
  3. CCWETProcessor now looks out for additional adult sites based on URL, adds them to its blacklist in memory (not the file), and logs the domain for checking and manually adding to the blacklist file.
  • Property svn:executable set to *
File size: 7.4 KB
#!/bin/bash

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p $outputDir
echo "OUTPUT DIR: $outputDir"


NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

CRAWL_ITERATIONS=10
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
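    # For illustration (hypothetical example values): with siteDir=to_crawl/sites/00001/
    # and crawlId=00001, crawl_cmd expands to:
    #   ./apache-nutch-2.3.1/runtime/local/bin/crawl to_crawl/sites/00001/ 00001 10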

    # Since we're going to crawl from scratch, create the log.out file
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee ${siteDir}log.out
    # append to log.out file hereafter
    echo " $crawl_cmd" | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" | tee -a ${siteDir}log.out

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    # $? would give tee's exit status here; PIPESTATUS[0] holds the crawl command's
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check whether the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create the file UNFINISHED to indicate a more thorough crawl is needed.
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" | tee ${siteDir}UNFINISHED
            echo "" | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" | tee -a ${siteDir}UNFINISHED
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" | tee -a ${siteDir}UNFINISHED
            # plain echo doesn't interpret \n, so print the old crawl command on its own line
            echo "3. Adjust # crawl iterations in old crawl command:" | tee -a ${siteDir}UNFINISHED
            echo " $crawl_cmd" | tee -a ${siteDir}UNFINISHED
        fi

        # $outputDir/$crawlId should not yet exist
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
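        # readdb -dump writes its text output as Hadoop-style part files
        # (hence the part-r-00000 below), which we collect into a single dump.txt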
        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." | tee -a ${siteDir}log.out
    fi


    # Move the peripheral crawl products (the log.out and UNFINISHED files)
    # from the input to the output folder. This way we can re-run the crawl and
    # the original output will still have been preserved.
    # The output folder won't exist yet if the crawl failed, so create it first;
    # UNFINISHED only exists for incomplete crawls, so test for it before moving.
    mkdir -p $outputDir/$crawlId
    mv ${siteDir}log.out $outputDir/$crawlId/log.out
    if [ -f ${siteDir}UNFINISHED ]; then
        mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    fi
}

function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a ${siteDir}log.out
    else
        crawlSite $siteDir $crawlId
    fi
    echo "--------------------------------------------------"
}

function crawlAll() {

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in $sitesDir/*/; do

        # To get a crawlId like 00001 from a $siteDir like to_crawl/sites/00001/:
        # first remove the $sitesDir prefix of to_crawl/sites followed by /,
        # then remove the / suffix that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl $siteDir $crawlId
        #break
    done
}

function crawlRange() {

    startCrawlId=$1
    endCrawlId=$2

    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285

    # Sadly, the numeric value is treated as octal in both of the following:
    #end=$(($endCrawlId+0))
    #printf -v end '%d\n' $endCrawlId 2>/dev/null
    # And this removes only a single 0 prefix:
    #end=${endCrawlId##+(0)}
    # So strip all leading zeros with parameter expansion instead:
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
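    # For illustration: with startCrawlId=00010, ${startCrawlId%%[!0]*} evaluates to
    # the run of leading zeros ("000"), which is then stripped from the front,
    # leaving start=10 and avoiding the octal interpretation of 00010.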

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Generate a range of numbers between start and end:
    # https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
    for (( COUNTER=start; COUNTER<=end; COUNTER++ )); do
        # Now pad back with zeroes to get the crawlId
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf %05d $COUNTER)
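        # For illustration: printf %05d 8 yields 00008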
        #echo $COUNTER - $crawlId
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        #echo "siteDir $siteDir (crawlId $crawlId)"
        prepareForCrawl $siteDir $crawlId
    done

}

function printUsage() {
    echo "Run as:"
    echo " $0 -all|<ids>"
    echo " where an id is a folder name in to_crawl/sites"
    echo " and ids can be a comma-separated list of"
    echo " individual ids and/or ranges"
    echo " Examples:"
    echo " $0 00008-00022,00025,00026,00028-00034"
    echo " $0 -all"
    echo " $0 00312"

}

function parseArgs() {

    # for i in $*; do
    #    echo "Pinky"
    #    echo $i
    # done

    args="$1"
    #echo "Got arg string: $args"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split args on comma:
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
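    # For illustration: args="00008-00022,00025" produces IDS=(00008-00022 00025).
    # Since IFS=', ' splits on comma or space, space-separated ids also work.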
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        if [ "x$id" = "x-all" ]; then
            echo "crawlAll"
            crawlAll
        elif [[ $id == *"-"* ]]; then
            # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
            echo "$id is a range"
            startCrawlId=${id%%-*}
            endCrawlId=${id##*-}
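            # For illustration: id=00008-00022 gives startCrawlId=00008 (%%-* strips
            # from the first - onwards) and endCrawlId=00022 (##*- strips up to the last -)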
            echo "crawlRange $startCrawlId $endCrawlId"
            crawlRange $startCrawlId $endCrawlId
        else
            echo "$id is a singleton"
            crawlId=$id
            # $siteDir needs / at end to work with existing code
            siteDir=$sitesDir/$crawlId/
            echo "prepareForCrawl $siteDir $crawlId"
            prepareForCrawl $siteDir $crawlId
        fi
    done
}


# Passing the ids as a single string instead of as individual arguments,
# because one can only have 9 individual args without using shift; see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html

parseArgs "$*"

# old, for testing:
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"