source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33566

Last change on this file since 33566 was 33566, checked in by ak19, 5 years ago

batchcrawl.sh script now supports taking a comma- or space-separated list of crawl folder ids — zero-padded five-digit numbers (e.g. 00311) — and/or ranges of such crawl folder ids. The script is currently inactive. The next commit should support an "all" parameter to crawl all subfolders (and perhaps an open-ended range with no final termination value).

  • Property svn:executable set to *
File size: 6.2 KB
Line 
#!/bin/bash

# Batch driver for Nutch crawls: each subfolder of $sitesDir holds one site's
# seedURLs.txt + regex-urlfilter.txt; results land under $outputDir/<crawlId>.
# Run from the directory that contains to_crawl/ and apache-nutch-2.3.1/.

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
# Quote the expansion (SC2086) so an edited value with spaces can't word-split.
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"


# Nutch installation layout: the live url-filter file is rebuilt per site
# from the GS_TEMPLATE copy, then the site's own patterns are appended.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse rounds per site; crawlSite flags the site
# as UNFINISHED when this many rounds was not enough.
CRAWL_ITERATIONS=10
function crawlSite() {
    # Crawl a single site with Nutch and dump the resulting db.
    #   $1 - siteDir: folder containing seedURLs.txt and regex-urlfilter.txt;
    #        expected to end in "/" (paths below are built as ${siteDir}file).
    #   $2 - crawlId: Nutch crawl id, e.g. 00311.
    # Writes: ${siteDir}log.out, optionally ${siteDir}UNFINISHED,
    #         and $outputDir/$crawlId/{stats,dump.txt,...} on success.
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    # Start from the pristine template each time so url-filter patterns from
    # the previously crawled site don't leak into this one.
    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    # NOTE: $crawl_cmd is expanded unquoted below on purpose, so it
    # word-splits back into the command and its arguments.
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee "${siteDir}log.out"
    # append to log.out file hereafter
    echo "    $crawl_cmd" | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" | tee -a "${siteDir}log.out"

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    # BUG FIX: after a pipeline, $? is the exit status of the LAST stage
    # (tee), which nearly always succeeds — so the failure branch below could
    # never trigger. Take the crawl command's own status instead.
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
        # If not, create file UNFINISHED to indicate a more thorough crawl needed
        tail -10 "${siteDir}log.out" | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" | tee "${siteDir}UNFINISHED"
            echo "" | tee -a "${siteDir}UNFINISHED"
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" | tee -a "${siteDir}UNFINISHED"
            echo "1. delete $outputDir/$crawlId" | tee -a "${siteDir}UNFINISHED"
            echo "2. copy the regex-urlfilter file:" | tee -a "${siteDir}UNFINISHED"
            echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" | tee -a "${siteDir}UNFINISHED"
            # BUG FIX: plain echo does not interpret "\n"; use printf so the
            # old crawl command really lands on its own line.
            printf '%s\n%s\n' "3. Adjust # crawl iterations in old crawl command:" "$crawl_cmd" | tee -a "${siteDir}UNFINISHED"
        fi

        # outputDir/$crawlId should not yet exist; presumably readdb -dump
        # creates it before the stats redirection needs it — TODO confirm
        # against Nutch 2.x readdb behaviour.
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId/part-r-00000" > "$outputDir/$crawlId/dump.txt"
    else
        # appending to log.out
        echo "CRAWL FAILED." | tee -a "${siteDir}log.out"
    fi

}
78
function prepareForCrawl() {
    # Dispatch one crawl, skipping any crawlId whose output folder exists.
    #   $1 - site folder (with trailing slash), $2 - crawl id
    siteDir=$1
    crawlId=$2

    echo "Processing crawlId: $crawlId"

    if [ ! -d "$outputDir/$crawlId" ]; then
        # Not crawled before: do the real work.
        crawlSite $siteDir $crawlId

    else
        # Already processed earlier — *append* a skip notice to log.out
        echo "" 2>&1 | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out

    fi
    echo "--------------------------------------------------"
}
100
function crawlAll() {
    # Walk every site subfolder of $sitesDir and hand it to prepareForCrawl.
    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in $sitesDir/*/; do

        # Derive crawlId (e.g. 00001) from a $siteDir like to_crawl/sites/00001/:
        # drop the trailing slash, then keep only the last path component.
        crawlId=${siteDir%/}
        crawlId=${crawlId##*/}

        prepareForCrawl $siteDir $crawlId
        # Stops after the first site — the script is deliberately inactive
        # while batch mode is still under development.
        break
    done
}
116
function crawlRange() {
    # Enumerate the crawl ids between two zero-padded endpoints,
    # e.g. crawlRange 00010 00015. Currently only prints each siteDir
    # (the script is inactive; no crawl is launched from here yet).
    startCrawlId=$1
    endCrawlId=$2

    # Strip leading zeros first: bash arithmetic would otherwise read the
    # ids as octal (and 00018 is not even valid octal).
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Walk the numeric range inclusive of both endpoints.
    for (( counter = start; counter <= end; counter++ )); do
        # Re-pad with zeroes to the 5-digit crawlId form.
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf %05d "$counter")
        siteDir=$sitesDir/$crawlId
        echo "siteDir $siteDir (crawlId $crawlId)"
    done

}
148
function parseArgs() {
    # Parse the single argument string: a comma- and/or space-separated
    # mix of singleton crawl ids (00025) and ranges (00008-00022).
    args="$1"
    echo "Got arg string: $args"

    # Split on comma and/or space into the IDS array.
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do

        case "$id" in
            *-*)
                # Range: take everything before the first '-' and after the last.
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                siteDir=$sitesDir/$crawlId
                echo "prepareForCrawl $siteDir $crawlId"
                # Dispatch disabled while the script is inactive:
                #prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done

}
181
182#crawlRange "00010" "00015"
183#parseArgs "00008-00022,00025,00026,00028-00034"
184parseArgs "$*"
185# can only have 9 args without using shift, see
186# https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
187#parseArgs horse cow dog cat
Note: See TracBrowser for help on using the repository browser.