source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33573

Last change on this file since 33573 was 33573, checked in by ak19, 5 years ago

Forgot to document that spaces are also allowed as a separator in the list of crawl-site ids passed as input to the batchcrawl.sh script

  • Property svn:executable set to *
File size: 7.6 KB
Line 
#!/bin/bash

# batchcrawl.sh — drive Nutch crawls over per-site folders.
# Input:  $sitesDir/<crawlId>/ folders, each containing seedURLs.txt and a
#         regex-urlfilter.txt fragment for that site.
# Output: $outputDir/<crawlId>/ with the crawl dump, stats and log.out.

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"


# Locations inside the local (non-distributed) Nutch 2.3.1 installation.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Pristine template; a per-site live copy is regenerated for every crawl.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse/update rounds attempted per site.
CRAWL_ITERATIONS=10
# Crawl a single site with Nutch and dump the crawl results.
# Arguments:
#   $1 - siteDir: folder holding seedURLs.txt and regex-urlfilter.txt;
#        must end in "/" since it is used as a bare path prefix below.
#   $2 - crawlId: id for this crawl (also the output folder name), e.g. 00001
# Side effects: rewrites the live Nutch url-filter file, writes
#   ${siteDir}log.out (and UNFINISHED if more iterations are needed), then
#   moves both into $outputDir/$crawlId and appends to siteIDsCrawled.txt.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    # Start from the pristine template so filters from a previous site
    # don't leak into this crawl.
    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create log.out file
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
    # append to log.out file hereafter
    echo " $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    # BUGFIX: after a pipeline, $? is the exit status of the LAST command
    # (tee, which almost always succeeds), so crawl failures went
    # undetected. PIPESTATUS[0] is the crawl command's own exit status.
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
        # If not, create file UNFINISHED to indicate a more thorough crawl needed
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
            echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
            # BUGFIX: plain echo does not interpret "\n", so the old command
            # was written with a literal backslash-n; emit it on its own line.
            echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
        fi

        # outputDir/$crawlId should not yet exist
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    fi


    # move the peripheral crawl products (the log.out and optional UNFINISHED file)
    # from the input to the output folder. This way we can re-run the crawl and
    # these files will still have been preserved as long as the output folder
    # isn't deleted
    mv ${siteDir}log.out $outputDir/$crawlId/log.out
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo $crawlId >> $outputDir/siteIDsCrawled.txt
}
90
# Crawl one site unless its output folder already exists.
# Arguments:
#   $1 - siteDir: site input folder (trailing "/" expected)
#   $2 - crawlId: id for this crawl / name of its output folder
# A pre-existing $outputDir/$crawlId marks the site as done; we then only
# append a skip notice to its log.out instead of re-crawling.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ ! -d "$outputDir/$crawlId" ]; then
        # Not yet crawled: do the real work.
        crawlSite $siteDir $crawlId
    else
        # Already processed earlier; *append* the notice to log.out.
        echo "" 2>&1 | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out
    fi

    echo "--------------------------------------------------"
}
112
# Crawl every site folder found directly under $sitesDir.
function crawlAll() {

    # Iterate over each subdirectory; the trailing "/" in the glob limits
    # matches to directories and keeps the "/" suffix downstream code wants.
    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteFolder in $sitesDir/*/; do

        # Derive the crawl id (e.g. 00001) from a path like
        # to_crawl/sites/00001/: drop the "$sitesDir/" prefix, then the
        # trailing "/".
        id=${siteFolder#"$sitesDir/"}
        id=${id%/}

        prepareForCrawl $siteFolder $id
        #break
    done
}
128
# Crawl an inclusive range of zero-padded crawl ids.
# Arguments:
#   $1 - startCrawlId: first id, e.g. 00010
#   $2 - endCrawlId:   last id, e.g. 00015
function crawlRange() {

    startCrawlId=$1
    endCrawlId=$2

    # Strip leading zeros so the ids can be used as decimal loop bounds
    # (a bare $((...)) would treat 00010 as octal).
    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Walk the numeric range, re-padding each number back to a 5-digit id.
    # https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
    # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
    for (( n = start; n <= end; n++ )); do
        crawlId=$(printf %05d $n)
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        prepareForCrawl $siteDir $crawlId
    done

}
162
# Print this script's usage/help text to stdout.
function printUsage() {
    cat <<EOF
Run as:
 $0 -all|<ids>
 where an id is a folder name in to_crawl/sites
 and ids can be a comma or space separated list of
 individual ids and/or ranges
 Examples:
 $0 00008-00022,00025,00026,00028-00034 00050
 $0 -all
 $0 00312
EOF
}
175
# Parse the single argument string and dispatch each id token.
# Arguments:
#   $1 - all crawl-site ids as ONE string: "-all", or a comma/space
#        separated mix of single ids (00312) and ranges (00008-00022).
# With no argument, prints usage and exits 0.
function parseArgs() {

    args="$1"
    #echo "Got arg string: $args"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split the argument string on commas or spaces.
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        # Dispatch: the -all case must be tested before the range pattern,
        # since "-all" itself contains a dash.
        case $id in
            -all)
                echo "crawlAll"
                crawlAll
                ;;
            *-*)
                # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                # $siteDir needs / at end to work with existing code
                siteDir=$sitesDir/$crawlId/
                echo "prepareForCrawl $siteDir $crawlId"
                prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done
}
217
218
219
220# Passing as string instead of individual arguments
221# Because one can only have 9 individual args without using shift, see
222# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
223
224parseArgs "$*"
225
226# old. testing
227#crawlRange "00010" "00015"
228#parseArgs "00008-00022,00025,00026,00028-00034"
Note: See TracBrowser for help on using the repository browser.