Context Navigation

source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh@ 33564

Last change on this file since 33564 was 33564, checked in by ak19, 5 years ago
batchcrawl.sh now runs the crawl and logs its output, dumps the text and stats resulting from the crawl into an output folder, and creates an UNFINISHED file (with instructions and the old crawl command) if the crawl did not terminate within the specified number of iterations. At present there is still a break statement that stops the script after the first site has been processed.
Property svn:executable set to *
File size: 3.9 KB

Line
#!/bin/bash

# Batch-crawl configuration. All paths are relative to the directory the
# script is run from.

# Input folder: the main loop visits each sub-folder of this directory.
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output folder: crawl dumps and stats are written to $outputDir/<crawlId>.
outputDir=crawled
if ! mkdir -p "$outputDir"; then
  # Without the output folder nothing useful can be produced; stop early.
  echo "Unable to create output dir: $outputDir" >&2
  exit 1
fi
echo "OUTPUT DIR: $outputDir"


NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# The template is copied over the live url-filter file before each crawl.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of iterations passed to the crawl command for every site.
CRAWL_ITERATIONS=10
19
#######################################
# Crawl a single site with Nutch, log the run, and export the crawl db.
#
# Globals (read):
#   NUTCH_URLFILTER_TEMPLATE, NUTCH_URLFILTER_FILE, CRAWL_COMMAND,
#   NUTCH_COMMAND, CRAWL_ITERATIONS, outputDir
# Arguments:
#   $1 - site folder WITH trailing slash (e.g. to_crawl/sites/00001/);
#        it is the folder containing seedURLs.txt and regex-urlfilter.txt
#   $2 - crawl id passed to Nutch and used as the output sub-folder name
# Outputs:
#   ${1}log.out      - (re)created; receives the crawl command and its output
#   ${1}UNFINISHED   - written when the crawl did not report
#                      "no more URLs to fetch now" in its last 10 log lines
#   $outputDir/$2/   - dump, stats and dump.txt of the crawl db
# Returns:
#   the crawl command's exit status if the crawl failed, 1 if the dump
#   folder was not produced, otherwise the status of the last export step.
#######################################
function crawlSite() {
  local siteDir=$1
  local crawlId=$2
  local logFile="${siteDir}log.out"
  local unfinishedFile="${siteDir}UNFINISHED"
  local siteOutputDir="$outputDir/$crawlId"
  # The command is kept twice: as an array for execution (no word-splitting
  # surprises) and as a string for the log and the UNFINISHED instructions.
  local -a crawl_args=("./$CRAWL_COMMAND" "$siteDir" "$crawlId" "$CRAWL_ITERATIONS")
  local crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
  local result

  echo "processing site $siteDir with crawlId: $crawlId"

  echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
  cp "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"

  echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
  cat "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"

  #echo "Contents of seedURLs.txt file for site:"
  #cat ${siteDir}seedURLs.txt

  # Since we're going to crawl from scratch, create log.out file
  # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
  echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" | tee "$logFile"
  # append to log.out file hereafter
  echo " $crawl_cmd" | tee -a "$logFile"
  echo "--------------------------------------------------" | tee -a "$logFile"

  # Append output of the crawl to log.out. $? after a pipeline is tee's
  # status, so take the crawl's own status from PIPESTATUS instead.
  "${crawl_args[@]}" 2>&1 | tee -a "$logFile"
  result=${PIPESTATUS[0]}

  if [[ "$result" -ne 0 ]]; then
    # appending to log.out
    echo "CRAWL FAILED." | tee -a "$logFile"
    return "$result"
  fi

  # nutch finished crawling successfully.
  # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
  # If not, create file UNFINISHED to indicate a more thorough crawl is needed.
  if tail -n 10 "$logFile" | grep -q "no more URLs to fetch now"; then
    # A marker left by an earlier, insufficient crawl no longer applies.
    rm -f "$unfinishedFile"
  else
    {
      echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId"
      echo ""
      echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:"
      echo "1. delete $outputDir/$crawlId"
      echo "2. copy the regex-urlfilter file:"
      echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE"
      # printf, because bash's builtin echo prints "\n" literally by default.
      printf '%s\n%s\n' "3. Adjust # crawl iterations in old crawl command:" "$crawl_cmd"
    } | tee "$unfinishedFile"
  fi

  # outputDir/$crawlId should not yet exist
  "./$NUTCH_COMMAND" readdb -dump "$siteOutputDir" -text -crawlId "$crawlId"
  if [[ ! -d "$siteOutputDir" ]]; then
    # The redirects below need the dump folder; report instead of failing
    # with a confusing "No such file or directory".
    echo "CRAWL DUMP FAILED: $siteOutputDir was not created." | tee -a "$logFile"
    return 1
  fi
  "./$NUTCH_COMMAND" readdb -stats -crawlId "$crawlId" > "$siteOutputDir/stats"
  cat "$siteOutputDir/part-r-00000" > "$siteOutputDir/dump.txt"
}
76
77
# Both folders are assigned at the top of the script. Refuse to continue if
# either is missing, since an empty $sitesDir would make the glob start at "/".
: "${sitesDir:?sitesDir must be set}" "${outputDir:?outputDir must be set}"

# Visit each site folder under $sitesDir and crawl it unless its output
# folder already exists.
# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
for siteDir in "$sitesDir"/*/; do

  # When there are no site folders the glob is left unexpanded; skip the
  # literal pattern instead of treating it as a site.
  [[ -d "$siteDir" ]] || continue

  # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
  # Remove the $sitesDir prefix of to_crawl/sites followed by /,
  # Next remove the / suffix that remains
  crawlId=${siteDir#"$sitesDir/"}
  crawlId=${crawlId%/}

  echo "Processing crawlId: $crawlId"

  if [[ -d "$outputDir/$crawlId" ]]; then
    # Skip site already processed. Append this msg to log.out
    echo "" | tee -a "${siteDir}log.out"
    echo "**** $siteDir already processed. Skipping...." | tee -a "${siteDir}log.out"
    echo "Delete $outputDir/$crawlId if you want to reprocess it." | tee -a "${siteDir}log.out"
  else
    crawlSite "$siteDir" "$crawlId"
  fi
  echo "--------------------------------------------------"

  # TODO: temporary - stops after the first site has been processed.
  # Remove this break to process every site folder.
  break
done

Note: See TracBrowser for help on using the repository browser.

Download in other formats: