#!/bin/bash

# Crawl each site listed under $sitesDir with Apache Nutch and collect
# the results (dumps, stats) under $outputDir.

# Input: one sub-directory per site (seed URLs, per-site regex-urlfilter).
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output: crawl dumps and stats are collected here, one dir per crawlId.
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"

# Apache Nutch installation layout.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Pristine template; the live url-filter file is regenerated from it per site.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of generate/fetch/parse/update rounds Nutch runs per site.
CRAWL_ITERATIONS=10
# crawlSite <siteDir> <crawlId>
#   Crawl a single site: rebuild the live regex-urlfilter file from the
#   template plus the site's own filter rules, run the Nutch crawl, and dump
#   the results into $outputDir/<crawlId>.  All output is mirrored to
#   <siteDir>log.out.  If the crawl did not finish within $CRAWL_ITERATIONS
#   rounds, a file <siteDir>UNFINISHED is written with re-run instructions.
#   NOTE: <siteDir> is expected to end with a trailing "/" (it comes from a
#   glob like $sitesDir/*/), since file names are appended directly to it.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out ...
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee "${siteDir}log.out"
    # ... and append to log.out hereafter.
    echo " $crawl_cmd" | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" | tee -a "${siteDir}log.out"

    # Run the crawl, mirroring stdout+stderr of $crawl_cmd into log.out.
    # BUG FIX: $? after a pipeline reports tee's status, not the crawl's;
    # PIPESTATUS[0] holds the exit status of the first pipeline stage.
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # Nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create file UNFINISHED to indicate a more thorough crawl is needed.
        if ! tail -10 "${siteDir}log.out" | grep -q "no more URLs to fetch now"; then
            {
                echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId"
                echo ""
                echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:"
                echo "1. delete $outputDir/$crawlId"
                echo "2. copy the regex-urlfilter file:"
                echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE"
                # BUG FIX: plain echo does not expand "\n"; print a real newline.
                printf '3. Adjust # crawl iterations in old crawl command:\n%s\n' "$crawl_cmd"
            } | tee "${siteDir}UNFINISHED"
        fi

        # $outputDir/$crawlId should not yet exist; readdb -dump creates it.
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId/part-r-00000" > "$outputDir/$crawlId/dump.txt"
    else
        # Crawl failed; append the failure marker to log.out.
        echo "CRAWL FAILED." | tee -a "${siteDir}log.out"
    fi
}
# prepareForCrawl <siteDir> <crawlId>
#   Crawl the site unless $outputDir/<crawlId> already exists, in which case
#   the site is skipped (delete that directory to reprocess it).  Skip
#   messages are appended to <siteDir>log.out as well as shown on stdout.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out.
        echo "" | tee -a "${siteDir}log.out"
        echo "**** $siteDir already processed. Skipping...." | tee -a "${siteDir}log.out"
        echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a "${siteDir}log.out"
    else
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"
}
# crawlAll
#   Visit every site directory beneath $sitesDir and hand it to
#   prepareForCrawl with its derived crawlId.
#   NOTE(review): the "break" below stops after the FIRST site directory —
#   this looks like a debugging leftover; confirm before removing it.
function crawlAll() {
    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in "$sitesDir"/*/; do
        # Derive a crawlId such as 00001 from a $siteDir like
        # to_crawl/sites/00001/: first drop the "$sitesDir/" prefix,
        # then drop the remaining trailing "/".
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl "$siteDir" "$crawlId"
        break
    done
}
# crawlRange <startCrawlId> <endCrawlId>
#   Enumerate the crawlIds in the inclusive range startCrawlId..endCrawlId
#   (zero-padded 5-digit ids, e.g. 00010..00015) and report the site
#   directory for each.
#   NOTE(review): this currently only echoes each siteDir/crawlId pair; the
#   actual per-site crawl call appears not to be wired in yet.
function crawlRange() {
    startCrawlId=$1
    endCrawlId=$2

    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    # Strip leading zeros so bash arithmetic does not treat the ids as octal:
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
    # BUG FIX: an all-zero id (e.g. "00000") strips down to an empty string,
    # which breaks the arithmetic below; treat empty as 0.
    start=${start:-0}
    end=${end:-0}

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Generate the range of numbers between start and end, inclusive.
    for (( COUNTER = start; COUNTER <= end; COUNTER++ )); do
        # Pad back with zeros to rebuild the 5-digit crawlId.
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf '%05d' "$COUNTER")
        siteDir=$sitesDir/$crawlId
        echo "siteDir $siteDir (crawlId $crawlId)"
    done
}
# parseArgs <arg-string>
#   Parse a comma-separated list of crawl ids and id ranges, e.g.
#   "00008-00022,00025,00026,00028-00034".  Each range is handed to
#   crawlRange; each single id is currently only echoed (the
#   prepareForCrawl call is commented out).
function parseArgs() {
    args="$1"
    echo "Got arg string: $args"

    # Split $args on commas (and spaces):
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"

    for id in "${IDS[@]}"; do
        case "$id" in
            *-*)
                # An id containing "-" is a range such as 00008-00022.
                echo "$id is a range"
                startCrawlId=${id%%-*}
                endCrawlId=${id##*-}
                echo "crawlRange $startCrawlId $endCrawlId"
                crawlRange $startCrawlId $endCrawlId
                ;;
            *)
                echo "$id is singleton"
                crawlId=$id
                siteDir=$sitesDir/$crawlId
                echo "prepareForCrawl $siteDir $crawlId"
                #prepareForCrawl $siteDir $crawlId
                ;;
        esac
    done
}
# Entry point: forward all command-line arguments, joined into one string,
# to parseArgs (e.g.  ./script.sh 00008-00022,00025).
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"
parseArgs "$*"
# Only 9 positional args are addressable without shift, see
# https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
#parseArgs horse cow dog cat