source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh @ 33564

Last change on this file since 33564 was 33564, checked in by ak19, 5 years ago

batchcrawl.sh now runs the crawl and logs the crawl's output, dumps the text and stats resulting from the crawl into an output folder, and creates an UNFINISHED file with instructions and the old crawl command if the crawl did not terminate within the specified number of iterations. At present there is still a break statement to stop after the first site has been processed.

  • Property svn:executable set to *
File size: 3.9 KB
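The script below assumes a particular working-directory layout: each site to crawl is prepared as a numbered subfolder of to_crawl/sites holding that site's seed URLs and regex-urlfilter fragment, and the script is run from the directory that also contains the local Nutch 2.3.1 install, since all paths in it are relative. A rough sketch of that layout (only the names hard-coded in the script are authoritative; the rest is illustrative):

  to_crawl/sites/00001/seedURLs.txt          seed URLs for the site (the folder is passed to bin/crawl)
  to_crawl/sites/00001/regex-urlfilter.txt   per-site filter rules, appended to the live regex-urlfilter.txt
  to_crawl/sites/00002/ ...                  one numbered folder per site
  apache-nutch-2.3.1/runtime/local/          local Nutch install ($NUTCH_HOME)
  crawled/                                   output folder, created by the script

  ./batchcrawl.sh                            invoked from the folder containing the above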
#!/bin/bash

sitesDir=to_crawl/sites
echo "SITES DIR (INPUT): $sitesDir"
outputDir=crawled
mkdir -p $outputDir
echo "OUTPUT DIR: $outputDir"


NUTCH_HOME=apache-nutch-2.3.1
NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt

CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch

CRAWL_ITERATIONS=10

function crawlSite() {
    siteDir=$1
    crawlId=$2

    echo "processing site $siteDir with crawlId: $crawlId"

    echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE

    echo "Appending contents of regex-urlfilter file for site $siteDir to url-filter file:"
    cat ${siteDir}regex-urlfilter.txt >> $NUTCH_URLFILTER_FILE

    #echo "Contents of seedURLs.txt file for site:"
    #cat ${siteDir}seedURLs.txt

    # The $siteDir parameter is the folder containing seedURLs.txt
    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"

    # Since we're going to crawl from scratch, create the log.out file (tee without -a truncates it)
    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
    # Append to the log.out file hereafter
    echo "    $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out

    # Append the output of $crawl_cmd to log.out.
    # Because of the pipe into tee, $? would give tee's exit status, so take the
    # crawl command's exit status from bash's PIPESTATUS array instead.
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    result=${PIPESTATUS[0]}

    if [ "x$result" = "x0" ]; then
        # nutch finished crawling successfully.

        # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS.
        # If not, create the file UNFINISHED to indicate that a more thorough crawl is needed.
        tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
        result=$?
        if [ "x$result" != "x0" ]; then
            echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
            echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
            echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
            # echo -e so that the \n in the string is printed as a newline rather than literally
            echo -e "3. Adjust # crawl iterations in old crawl command:\n$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
        fi

        # $outputDir/$crawlId should not yet exist: readdb -dump writes its text output there
        # (the part-r-00000 file concatenated into dump.txt below), and readdb -stats records
        # the crawl statistics
        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    else
        # appending to log.out
        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    fi

}


# https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
for siteDir in $sitesDir/*/; do

    # To get a crawl id like 00001 from a $siteDir like to_crawl/sites/00001/:
    # remove the $sitesDir prefix of to_crawl/sites followed by /,
    # then remove the / suffix that remains
    crawlId=${siteDir#"$sitesDir/"}
    crawlId=${crawlId%/}

    echo "Processing crawlId: $crawlId"

    if [ -d "$outputDir/$crawlId" ]; then
        # Skip a site that has already been processed. *Append* this msg to log.out
        echo "" 2>&1 | tee -a ${siteDir}log.out
        echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
        echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out

    else
        crawlSite $siteDir $crawlId

    fi
    echo "--------------------------------------------------"

    # NOTE: deliberately stop after the first site for now (see the change description above);
    # remove this break to process every site under $sitesDir
    break
done
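After a successful run, the per-site output left behind by the commands above should look roughly like this (a sketch based on the paths the script uses, assuming a crawlId of 00001):

  to_crawl/sites/00001/log.out      output of the crawl, captured via tee
  to_crawl/sites/00001/UNFINISHED   present only if the crawl did not finish within CRAWL_ITERATIONS iterations
  crawled/00001/part-r-00000        text dump of the crawl db from 'nutch readdb -dump'
  crawled/00001/dump.txt            copy of part-r-00000
  crawled/00001/stats               output of 'nutch readdb -stats'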