source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33569

Last change on this file since 33569 was 33569, checked in by ak19, 5 years ago
  1. batchcrawl.sh now does what it should have from the start, which is to move the log.out and UNFINISHED files into the output folder instead of leaving them in the input folder, since the input to_crawl folder can and does get replaced every time I regenerate it after black/white/greylisting more URLs.
  2. Blacklisted more adult sites; greylisted more product sites and .ru, .pl and .tk domains, with whitelisting in the whitelist file.
  3. CCWETProcessor now looks out for additional adult sites based on URL, adds them to its blacklist in memory (not the file), and logs the domain for checking and manually adding to the blacklist file.
  • Property svn:executable set to *
File size: 7.4 KB
#!/bin/bash

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p $outputDir
echo "OUTPUT DIR: $outputDir"


NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

CRAWL_ITERATIONS=10
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    #echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
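    # For illustration (hypothetical example values): with siteDir=to_crawl/sites/00001/
    # and crawlId=00001, crawl_cmd expands to:
    #   ./apache-nutch-2.3.1/runtime/local/bin/crawl to_crawl/sites/00001/ 00001 10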

    # Since we're going to crawl from scratch, create the log.out file
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee ${siteDir}log.out
    # append to log.out file hereafter
    echo " $crawl_cmd" | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" | tee -a ${siteDir}log.out

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    # $? would give tee's exit status here; PIPESTATUS[0] holds the crawl command's
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check whether the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create the file UNFINISHED to indicate a more thorough crawl is needed.
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" | tee ${siteDir}UNFINISHED
            echo "" | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" | tee -a ${siteDir}UNFINISHED
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" | tee -a ${siteDir}UNFINISHED
            # plain echo doesn't interpret \n, so print the old crawl command on its own line
            echo "3. Adjust # crawl iterations in old crawl command:" | tee -a ${siteDir}UNFINISHED
            echo " $crawl_cmd" | tee -a ${siteDir}UNFINISHED
        fi

        # $outputDir/$crawlId should not yet exist
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
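        # readdb -dump writes its text output as Hadoop-style part files
        # (hence the part-r-00000 below), which we collect into a single dump.txt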
        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." | tee -a ${siteDir}log.out
    fi


    # Move the peripheral crawl products (the log.out and UNFINISHED files)
    # from the input to the output folder. This way we can re-run the crawl and
    # the original output will still have been preserved.
    # The output folder won't exist yet if the crawl failed, so create it first;
    # UNFINISHED only exists for incomplete crawls, so test for it before moving.
    mkdir -p $outputDir/$crawlId
    mv ${siteDir}log.out $outputDir/$crawlId/log.out
    if [ -f ${siteDir}UNFINISHED ]; then
        mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    fi
}

function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a ${siteDir}log.out
    else
        crawlSite $siteDir $crawlId
    fi
    echo "--------------------------------------------------"
}

function crawlAll() {

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in $sitesDir/*/; do

        # To get a crawlId like 00001 from a $siteDir like to_crawl/sites/00001/:
        # first remove the $sitesDir prefix of to_crawl/sites followed by /,
        # then remove the / suffix that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl $siteDir $crawlId
        #break
    done
}

function crawlRange() {

    startCrawlId=$1
    endCrawlId=$2

    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285

    # Sadly, the numeric value is treated as octal in both of the following:
    #end=$(($endCrawlId+0))
    #printf -v end '%d\n' $endCrawlId 2>/dev/null
    # And this removes only a single 0 prefix:
    #end=${endCrawlId##+(0)}
    # So strip all leading zeros with parameter expansion instead:
    # https://stackoverflow.com/questions/19861394/stripping-leading-zeros-using-bash-substring-removal-only/51196007
    start=${startCrawlId#"${startCrawlId%%[!0]*}"}
    end=${endCrawlId#"${endCrawlId%%[!0]*}"}
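    # For illustration: with startCrawlId=00010, ${startCrawlId%%[!0]*} evaluates to
    # the run of leading zeros ("000"), which is then stripped from the front,
    # leaving start=10 and avoiding the octal interpretation of 00010.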

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Generate a range of numbers between start and end:
    # https://stackoverflow.com/questions/966020/how-to-produce-a-range-with-step-n-in-bash-generate-a-sequence-of-numbers-with/966026
    for (( COUNTER=start; COUNTER<=end; COUNTER++ )); do
        # Now pad back with zeroes to get the crawlId
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf %05d $COUNTER)
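        # For illustration: printf %05d 8 yields 00008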
        #echo $COUNTER - $crawlId
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        #echo "siteDir $siteDir (crawlId $crawlId)"
        prepareForCrawl $siteDir $crawlId
    done

}

function printUsage() {
    echo "Run as:"
    echo " $0 -all|<ids>"
    echo " where an id is a folder name in to_crawl/sites"
    echo " and ids can be a comma-separated list of"
    echo " individual ids and/or ranges"
    echo " Examples:"
    echo " $0 00008-00022,00025,00026,00028-00034"
    echo " $0 -all"
    echo " $0 00312"

}

function parseArgs() {

    # for i in $*; do
    #    echo "Pinky"
    #    echo $i
    # done

    args="$1"
    #echo "Got arg string: $args"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split args on comma:
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
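    # For illustration: args="00008-00022,00025" produces IDS=(00008-00022 00025).
    # Since IFS=', ' splits on comma or space, space-separated ids also work.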
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        if [ "x$id" = "x-all" ]; then
            echo "crawlAll"
            crawlAll
        elif [[ $id == *"-"* ]]; then
            # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
            echo "$id is a range"
            startCrawlId=${id%%-*}
            endCrawlId=${id##*-}
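            # For illustration: id=00008-00022 gives startCrawlId=00008 (%%-* strips
            # from the first - onwards) and endCrawlId=00022 (##*- strips up to the last -)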
            echo "crawlRange $startCrawlId $endCrawlId"
            crawlRange $startCrawlId $endCrawlId
        else
            echo "$id is a singleton"
            crawlId=$id
            # $siteDir needs / at end to work with existing code
            siteDir=$sitesDir/$crawlId/
            echo "prepareForCrawl $siteDir $crawlId"
            prepareForCrawl $siteDir $crawlId
        fi
    done
}


# Passing the ids as a single string instead of as individual arguments,
# because one can only have 9 individual args without using shift; see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html

parseArgs "$*"

# old, for testing:
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"