#!/bin/bash

# Crawl every site listed under $sitesDir with Apache Nutch 2.3.1 and
# collect the dumped pages, stats and logs under $outputDir.

# Input: one sub-folder per site, each containing seedURLs.txt and a
# per-site regex-urlfilter.txt fragment.
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output: one sub-folder per crawlId holding dump.txt, stats and log.out.
outputDir=crawled
mkdir -p "$outputDir"
echo "OUTPUT DIR: $outputDir"

# Nutch installation layout (local runtime).
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of fetch/parse rounds per site; if a site is not fully crawled
# within this many iterations an UNFINISHED marker file is produced.
CRAWL_ITERATIONS=10
# Crawl one site from scratch with Nutch and dump the results.
#   $1 - siteDir: input folder (with trailing /) holding seedURLs.txt and
#        a regex-urlfilter.txt fragment
#   $2 - crawlId: id used for the Nutch crawl and the output sub-folder
# Side effects: rewrites $NUTCH_URLFILTER_FILE, writes $outputDir/$crawlId/,
# moves log.out (and optional UNFINISHED marker) there, and appends the
# crawlId to $outputDir/siteIDsCrawled.txt.
function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

    # $siteDir parameter is the folder containing seedURLs.txt
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create (truncate) log.out.
    # Logging to terminal and log file simultaneously:
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee "${siteDir}log.out"
    # append to log.out file hereafter
    echo " $crawl_cmd" 2>&1 | tee -a "${siteDir}log.out"
    echo "--------------------------------------------------" 2>&1 | tee -a "${siteDir}log.out"

    # append output of $crawl_cmd to log.out
    $crawl_cmd 2>&1 | tee -a "${siteDir}log.out"
    # BUGFIX: plain $? after a pipeline reports tee's status (almost always
    # 0), which made the failure branch below unreachable. PIPESTATUS[0]
    # holds the crawl command's own exit code.
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create file UNFINISHED to indicate a more thorough crawl needed.
        if ! tail -10 "${siteDir}log.out" | grep -q "no more URLs to fetch now"; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee "${siteDir}UNFINISHED"
            echo "" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a "${siteDir}UNFINISHED"
            echo " $crawl_cmd" 2>&1 | tee -a "${siteDir}UNFINISHED"
        fi

        # $outputDir/$crawlId should not yet exist; readdb -dump creates it.
        ./$NUTCH_COMMAND readdb -dump "$outputDir/$crawlId" -text -crawlId "$crawlId"
        ./$NUTCH_COMMAND readdb -stats -crawlId "$crawlId" > "$outputDir/$crawlId/stats"
        cat "$outputDir/$crawlId"/part-r-* > "$outputDir/$crawlId/dump.txt"
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a "${siteDir}log.out"
    fi

    # move the peripheral crawl products (the log.out and optional UNFINISHED file)
    # from the input to the output folder. This way we can re-run the crawl and
    # these files will still have been preserved as long as the output folder
    # isn't deleted
    mv "${siteDir}log.out" "$outputDir/$crawlId/log.out"
    if [ -e "${siteDir}UNFINISHED" ]; then
        mv "${siteDir}UNFINISHED" "$outputDir/$crawlId/UNFINISHED"
    fi

    # finally append the current crawlId to siteIDsCrawled.txt
    echo "$crawlId" >> "$outputDir/siteIDsCrawled.txt"
}
# Crawl one site unless its output folder already exists.
#   $1 - siteDir: input folder for the site (with trailing /)
#   $2 - crawlId: id for this crawl
# Skips (and logs the skip) when $outputDir/$crawlId is already present.
function prepareForCrawl() {
    siteDir=$1
    crawlId=$2

    echo "Processing siteDir $siteDir with crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip site already processed. *Append* this msg to log.out
        echo "" 2>&1 | tee -a "${siteDir}log.out"
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a "${siteDir}log.out"
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a "${siteDir}log.out"
    else
        # Quote the args so a siteDir containing spaces stays one argument.
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"
}
# Crawl every site folder found under $sitesDir, deriving each crawlId
# from the folder name.
# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
function crawlAll() {
    for siteDir in "$sitesDir"/*/; do
        # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
        # Remove the $sitesDir prefix of to_crawl/sites followed by /,
        # Next remove the / suffix that remains
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Crawl every site whose zero-padded id lies in the inclusive range
#   $1 - startCrawlId (e.g. 00008)
#   $2 - endCrawlId   (e.g. 00022)
function crawlRange() {
    startCrawlId=$1
    endCrawlId=$2

    # Interpret the zero-padded ids as decimal numbers. The 10# base prefix
    # stops bash from reading a leading 0 as octal, and unlike the previous
    # leading-zero-stripping trick it also handles an all-zero id like
    # 00000 (which stripped down to the empty string) correctly.
    # https://unix.stackexchange.com/questions/232384/argument-string-to-integer-in-bash/301285
    start=$((10#$startCrawlId))
    end=$((10#$endCrawlId))

    echo "Start: $start ($startCrawlId), End: $end ($endCrawlId)"

    # Visit every id in [start, end].
    for (( COUNTER=start; COUNTER<=end; COUNTER++ )); do
        # Pad back with zeroes to rebuild the 5-digit crawlId.
        # https://stackoverflow.com/questions/1117134/padding-zeros-in-a-string
        crawlId=$(printf %05d "$COUNTER")
        # $siteDir needs / at end to work with existing code
        siteDir=$sitesDir/$crawlId/
        prepareForCrawl "$siteDir" "$crawlId"
    done
}
# Print usage help for this script on stdout.
function printUsage() {
    cat <<EOF
Run as:
 $0 -all|<ids>
 where an id is a folder name in to_crawl/sites
 and ids can be a comma or space separated list of
 individual ids and/or ranges
 Examples:
 $0 00008-00022,00025,00026,00028-00034 00050
 $0 -all
 $0 00312
EOF
}
# Parse the id-list argument string and dispatch each entry.
#   $1 - a comma- or space-separated list of: "-all", single ids
#        (e.g. 00312), and/or ranges (e.g. 00008-00022)
# With no argument, prints usage and exits 0.
function parseArgs() {
    args="$1"

    if [ "x$args" = "x" ]; then
        printUsage
        exit 0
    fi

    # Split args on comma or space:
    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    IFS=', ' read -ra IDS <<< "$args"
    for id in "${IDS[@]}"; do
        echo "id: |$id|"
        if [ "x$id" = "x-all" ]; then
            echo "crawlAll"
            crawlAll
        elif [[ $id == *"-"* ]]; then
            # An id containing '-' is a range like 00008-00022.
            # https://stackoverflow.com/questions/229551/how-to-check-if-a-string-contains-a-substring-in-bash
            echo "$id is a range"
            startCrawlId=${id%%-*}
            endCrawlId=${id##*-}
            echo "crawlRange $startCrawlId $endCrawlId"
            crawlRange "$startCrawlId" "$endCrawlId"
        else
            echo "$id is singleton"
            crawlId=$id
            # $siteDir needs / at end to work with existing code
            siteDir=$sitesDir/$crawlId/
            echo "prepareForCrawl $siteDir $crawlId"
            prepareForCrawl "$siteDir" "$crawlId"
        fi
    done
}
# Entry point.
# Passing as string instead of individual arguments
# Because one can only have 9 individual args without using shift, see
# https://www.unix.com/shell-programming-and-scripting/57225-there-limit-no-arguments-shell-script.html
# NOTE(review): "$*" joins all CLI args with single spaces, and parseArgs
# re-splits on comma/space, so ids may be passed separated either way.

parseArgs "$*"

# old. testing
#crawlRange "00010" "00015"
#parseArgs "00008-00022,00025,00026,00028-00034"