#!/bin/bash
#
# Batch driver that crawls each site directory found under $sitesDir with
# Apache Nutch and writes a text dump of every crawl under $outputDir.
#
# All paths below are relative, so the script has to be started from the
# directory that contains both to_crawl/ and the Nutch installation.

# Input: one sub-directory per site to crawl.
sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"

# Output: one sub-directory per crawlId. Its existence marks a site as done.
outputDir=crawled
if ! mkdir -p -- "$outputDir"; then
    echo "ERROR: unable to create output directory $outputDir" >&2
    exit 1
fi
echo "OUTPUT DIR: $outputDir"


# Location of the Nutch installation and of the URL filter files.
NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
# Template copied over the live filter file before site-specific rules
# are appended to it.
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

# Number of iterations passed to the crawl command for every site.
CRAWL_ITERATIONS=10

#######################################
# Crawl one site with Nutch, log the run, and dump the crawl db as text.
#
# Globals (read):
#   NUTCH_URLFILTER_TEMPLATE, NUTCH_URLFILTER_FILE, CRAWL_COMMAND,
#   NUTCH_COMMAND, CRAWL_ITERATIONS, outputDir
# Arguments:
#   $1 - site directory INCLUDING its trailing slash (file names are
#        appended to it directly); must contain regex-urlfilter.txt and is
#        passed to the crawl command as the seed folder
#   $2 - crawl id; also the name of the sub-directory created in $outputDir
# Outputs:
#   ${1}log.out     - header plus the crawl command's stdout/stderr
#   ${1}UNFINISHED  - written when the crawl did not report
#                     "no more URLs to fetch now" in the last 10 log lines
#   $outputDir/$2/  - readdb dump, stats and dump.txt (on success only)
# Returns:
#   0 on success; non-zero if the URL filter could not be prepared, the
#   crawl command failed, or the crawl db could not be dumped.
#######################################
function crawlSite() {
    local siteDir=$1
    local crawlId=$2
    local logFile="${siteDir}log.out"
    local unfinishedFile="${siteDir}UNFINISHED"
    local dumpDir="$outputDir/$crawlId"
    local result

    # Keep the command as an array so arguments survive intact; the joined
    # string is only used for log messages.
    local -a crawl_cmd=("./$CRAWL_COMMAND" "$siteDir" "$crawlId" "$CRAWL_ITERATIONS")
    local crawl_cmd_str="${crawl_cmd[*]}"

    echo "processing site $siteDir with crawlId: $crawlId"

    # Rebuild the live URL filter: template first, then the site's own rules.
    # Abort on failure, otherwise the crawl would run with whatever filter
    # a previous site left behind.
    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    if ! cp -- "$NUTCH_URLFILTER_TEMPLATE" "$NUTCH_URLFILTER_FILE"; then
        echo "ERROR: could not copy $NUTCH_URLFILTER_TEMPLATE to $NUTCH_URLFILTER_FILE" >&2
        return 1
    fi

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    if ! cat -- "${siteDir}regex-urlfilter.txt" >> "$NUTCH_URLFILTER_FILE"; then
        echo "ERROR: could not append ${siteDir}regex-urlfilter.txt to $NUTCH_URLFILTER_FILE" >&2
        return 1
    fi

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # Crawling from scratch: start a fresh log.out (tee without -a truncates).
    {
        echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):"
        echo " $crawl_cmd_str"
        echo "--------------------------------------------------"
    } | tee "$logFile"

    # Append the crawl output to log.out. $? would be tee's status here, so
    # take the crawl command's own status from PIPESTATUS.
    "${crawl_cmd[@]}" 2>&1 | tee -a "$logFile"
    result=${PIPESTATUS[0]}

    if [ "$result" -ne 0 ]; then
        echo "CRAWL FAILED." | tee -a "$logFile"
        return "$result"
    fi

    # Nutch finished without error, but the site may not have been crawled
    # exhaustively within $CRAWL_ITERATIONS. If so, leave an UNFINISHED file
    # explaining how to redo the crawl more thoroughly.
    if ! tail -n 10 "$logFile" | grep -q -F "no more URLs to fetch now"; then
        {
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId"
            echo ""
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:"
            echo "1. delete $outputDir/$crawlId"
            echo "2. copy the regex-urlfilter file:"
            echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE"
            echo "3. Adjust # crawl iterations in old crawl command:"
            echo "$crawl_cmd_str"
        } | tee "$unfinishedFile"
    fi

    # $outputDir/$crawlId should not yet exist; the dump is expected to
    # create it, and the stats redirect below relies on that.
    if ! "./$NUTCH_COMMAND" readdb -dump "$dumpDir" -text -crawlId "$crawlId"; then
        echo "ERROR: readdb -dump failed for crawlId $crawlId" >&2
        return 1
    fi
    "./$NUTCH_COMMAND" readdb -stats -crawlId "$crawlId" > "$dumpDir/stats"
    cp -- "$dumpDir/part-r-00000" "$dumpDir/dump.txt"
}

# Walk the site directories and crawl every one that has no output yet.
# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
# The :? guards stop the script if the variables are unset, instead of
# letting the glob and paths resolve against the filesystem root.
for siteDir in "${sitesDir:?sitesDir must be set}"/*/; do

    # Without nullglob an unmatched pattern is left as a literal string;
    # skip it rather than crawl a non-existent site called "*".
    [ -d "$siteDir" ] || continue

    # To get a crawlId like 00001 from a $siteDir like to_crawl/sites/00001/:
    # remove the $sitesDir prefix (to_crawl/sites) together with its /,
    # then remove the / suffix that remains.
    crawlId=${siteDir#"$sitesDir/"}
    crawlId=${crawlId%/}

    echo "Processing crawlId: $crawlId"

    if [ -d "${outputDir:?outputDir must be set}/$crawlId" ]; then
        # Site already processed: skip it and *append* this message to log.out.
        {
            echo ""
            echo "**** $siteDir already processed. Skipping...."
            echo "Delete $outputDir/$crawlId if you want to reprocess it."
        } | tee -a "${siteDir}log.out"
    else
        crawlSite "$siteDir" "$crawlId"
    fi
    echo "--------------------------------------------------"

    # NOTE(review): this break ends the loop after the first site directory,
    # so only one site is handled per run. It looks like a testing leftover;
    # confirm and remove it if every site should be processed.
    break
done