#!/bin/bash
#
# Settings for preparing Apache Nutch crawls of the site folders found
# under $sitesDir.
#
# All paths below are relative.
# NOTE(review): presumably the script is meant to be run from the directory
# that contains both apache-nutch-2.3.1/ and to_crawl/ — confirm.

echo "Hello world!"

# Parent folder holding one sub-folder per site to crawl.
readonly sitesDir=to_crawl/sites
echo "SITESDIR: $sitesDir"

# Nutch installation and its configuration folder.
readonly NUTCH_HOME=apache-nutch-2.3.1
readonly NUTCH_CONF_DIR=$NUTCH_HOME/conf

# The template is copied over the live URL-filter file before each site's
# own filter rules are appended to it.
readonly NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
readonly NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

# Nutch crawl script, relative to the current directory.
readonly CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl

# Third argument placed on the crawl command line.
readonly CRAWL_ITERATIONS=10

#######################################
# Rebuild the live Nutch URL-filter file for one site and print the crawl
# command for it.
#
# Copies the URL-filter template over the live filter file, appends the
# site's own regex-urlfilter.txt to it, then echoes the crawl command line.
# The command is only printed here; this function does not execute it.
#
# Globals (read):
#   NUTCH_URLFILTER_TEMPLATE, NUTCH_URLFILTER_FILE,
#   CRAWL_COMMAND, CRAWL_ITERATIONS
# Arguments:
#   $1 - site folder INCLUDING its trailing slash
#        (e.g. to_crawl/sites/00001/); it holds regex-urlfilter.txt and is
#        the folder containing seedURLs.txt
#   $2 - crawl id for the site (e.g. 00001)
# Outputs:
#   Progress messages and the crawl command on stdout; errors on stderr.
# Returns:
#   0 on success, 1 if the live URL-filter file could not be rebuilt.
#######################################
prepareSite() {
  # Function-local so the caller's variables are never clobbered.
  local siteDir=$1
  local crawlId=$2
  local crawl_cmd

  echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
  if ! cp -- "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"; then
    echo "ERROR: could not copy $NUTCH_URLFILTER_TEMPLATE to $NUTCH_URLFILTER_FILE" >&2
    return 1
  fi

  echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
  # $siteDir already ends in '/', so the file name is appended directly.
  if ! cat -- "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"; then
    echo "ERROR: could not append ${siteDir}regex-urlfilter.txt to $NUTCH_URLFILTER_FILE" >&2
    return 1
  fi

  # $siteDir parameter is the folder containing seedURLs.txt
  crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

  echo "Going to run nutch crawl command:"
  echo " $crawl_cmd"
}

# Visit each site folder directly under $sitesDir and prepare it for crawling.
# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
for siteDir in "$sitesDir"/*/; do
  # When there are no site folders the glob is left as the literal pattern;
  # skip it rather than handing a non-existent path to prepareSite.
  [[ -d "$siteDir" ]] || continue

  # To get a crawl id like 00001 from $siteDir like to_crawl/sites/00001/:
  # remove the "$sitesDir/" prefix, then remove the "/" suffix that remains.
  crawlId=${siteDir#"$sitesDir/"}
  crawlId=${crawlId%/}

  prepareSite "$siteDir" "$crawlId"

  # NOTE(review): the loop stops after the first site folder. This looks like
  # a testing leftover — confirm before removing it.
  break
done