source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33608

Last change on this file since 33608 was 33608, checked in by ak19, 4 years ago
  1. New script to export from HBase, so that we could in theory reimport into HBase. I've not tried the reimport out, but I followed instructions to export and I got a non-zero output file, so I am assuming it worked.
  2. Committing today's new crawls in crawledNode4.tar. Each crawled site's folder inside it now includes a file called part-m-* that is the exported HBase table on that node VM.
  3. Updated the hdfs-related GS_README.txt with instructions on viewing the contents of a table in HBase and a link on exporting/importing from HBase (a rough sketch of those commands is given below).
  4. Minor changes, e.g. the tar files shouldn't be called tar.gz.
  • Property svn:executable set to *
File size: 7.7 KB
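As a rough sketch of the HBase operations the commit message refers to (viewing a table's contents, and exporting/importing a table with the stock HBase MapReduce tooling): the table name and HDFS path below are illustrative placeholders only, and the authoritative steps are the ones recorded in the updated GS_README.txt.

   # view the contents of a table from the HBase shell (table name is an example)
   hbase shell
   hbase> scan '00001_webpage'

   # export a table to sequence files on HDFS (this is what produces the part-m-* files),
   # and, in theory, import them back again; the output path is an example
   hbase org.apache.hadoop.hbase.mapreduce.Export 00001_webpage /user/me/00001_webpage_export
   hbase org.apache.hadoop.hbase.mapreduce.Import 00001_webpage /user/me/00001_webpage_export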
#!/bin/bash

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p $outputDir
echo "OUTPUT DIR: $outputDir"


NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

CRAWL_ITERATIONS=10

function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create the log.out file.
    # Logging to terminal and log file simultaneously:
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
    # append to log.out file hereafter
    echo " $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    # $? would report tee's exit status here, so take the crawl command's status from PIPESTATUS
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check whether the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create a file named UNFINISHED to indicate that a more thorough crawl is needed.
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
            echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "     cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "     $crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
        fi

        # outputDir/$crawlId should not yet exist; the readdb -dump step below creates it
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
        cat $outputDir/$crawlId/part-r-* > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    fi


    # Move the peripheral crawl products (the log.out and optional UNFINISHED file)
    # from the input to the output folder. This way we can re-run the crawl and
    # these files will still have been preserved, as long as the output folder
    # isn't deleted.
    mv ${siteDir}log.out $outputDir/$crawlId/log.out
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo $crawlId >> $outputDir/siteIDsCrawled.txt
}

function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" 2>&1 | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out

    else
        crawlSite $siteDir $crawlId

    fi
    echo "--------------------------------------------------"


}

function crawlAll() {

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in $sitesDir/*/; do

        # To get a crawl_id like 00001 from $siteDir like to_crawl/sites/00001/:
        # remove the $sitesDir prefix of to_crawl/sites followed by /,
        # then remove the / suffix that remains
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl $siteDir $crawlId
        #break
    done
}

function crawlRange() {

    startCrawlId=$1
    endCrawlId=$2

    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285

    # Sadly, both of the following treat the zero-padded value as octal:
    #end=$(($endCrawlId+0))
    #printf -v end '%d\n' $endCrawlId 2>/dev/null
    # and this only removes a single 0 prefix:
    #end=${endCrawlId##+(0)}
    #${old##+(0)}
    # So strip all leading zeros with substring removal instead:
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
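    # (Aside, not used here: arithmetic expansion can also be forced to base 10,
    # e.g. end=$((10#$endCrawlId)), which likewise avoids the octal interpretation
    # of the leading zeros.)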

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Generate a range of numbers between start and end
    # https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
    for (( COUNTER=$start; $COUNTER<=$end; COUNTER=$COUNTER+1 )); do
        # Now pad back with zeroes to get crawlId
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=`printf %05d $COUNTER`
        #echo $COUNTER - $crawlId
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        #echo "siteDir $siteDir (crawlId $crawlId)"
        prepareForCrawl $siteDir $crawlId
    done

}

function printUsage() {
    echo "Run as:"
    echo "    $0 -all|<ids>"
    echo "    where an id is a folder name in to_crawl/sites"
    echo "    and ids can be a comma or space separated list of"
    echo "    individual ids and/or ranges"
    echo "    Examples:"
    echo "      $0 00008-00022,00025,00026,00028-00034 00050"
    echo "      $0 -all"
    echo "      $0 00312"

}

function parseArgs() {

    # for i in $*; do
    #     echo "Pinky"
    #     echo $i
    # done

    args="$1"
    #echo "Got arg string: $args"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # works - split args on comma or space
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
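    # e.g. (illustrative values) args="00008-00022,00025 00050" splits into
    # IDS=(00008-00022 00025 00050); ranges are then expanded by crawlRange in the loop below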
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        if [ "x$id" = "x-all" ]; then
            echo "crawlAll"
            crawlAll
        elif [[ $id == *"-"* ]]; then
            # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
            echo "$id is a range"
            startCrawlId=${id%%-*}
            endCrawlId=${id##*-}
            echo "crawlRange $startCrawlId $endCrawlId"
            crawlRange $startCrawlId $endCrawlId

        else
            echo "$id is singleton"
            crawlId=$id
            # $siteDir needs / at end to work with existing code
            siteDir=$sitesDir/$crawlId/
            echo "prepareForCrawl $siteDir $crawlId"
            prepareForCrawl $siteDir $crawlId
        fi
    done
}


# Passing all args as a single string instead of as individual arguments,
# because only $1..$9 can be referenced directly without using shift, see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html

parseArgs "$*"

# old. testing
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"