Changeset 33564
- Timestamp:
- 2019-10-14T21:01:17+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh
r33563 r33564
     1      1   #!/bin/bash
     2        - echo "Hello world!"
     3      2
     4      3   sitesDir=to_crawl/sites
     5        - echo "SITESDIR: $sitesDir"
            4 + echo "SITES DIR (INPUT): $sitesDir"
            5 + outputDir=crawled
            6 + mkdir -p $outputDir
            7 + echo "OUTPUT DIR: $outputDir"
            8 +
     6      9
     7     10   NUTCH_HOME=apache-nutch-2.3.1
     8        - NUTCH_CONF_DIR=$NUTCH_HOME/conf
           11 + NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
     9     12   NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
    10     13   NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt
    11     14
    12     15   CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
           16 + NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch
    13     17
    14     18   CRAWL_ITERATIONS=10
    15     19
    16        - function prepareSite() {
           20 + function crawlSite() {
    17     21   siteDir=$1
    18     22   crawlId=$2
    19     23
    20        - #echo "processing site $siteDir"
    21        -
    22        - #echo "processing site $siteDir with crawlId: $crawlId"
           24 + echo "processing site $siteDir with crawlId: $crawlId"
    23     25
    24     26   echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
     …      …
    32     34
    33     35   # $siteDir parameter is the folder containing seedURLs.txt
           36 + # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
    34     37   crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
    35     38
    36        - echo "Going to run nutch crawl command:"
    37        - echo " $crawl_cmd"
           39 + # Since we're going to crawl from scratch, create log.out file
           40 + echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
           41 + # append to log.out file hereafter
           42 + echo " $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
           43 + echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out
    38     44
           45 + # append output of $crawl_cmd to log.out
           46 + $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
           47 + result=$?
           48 +
           49 + if [ "x$result" = "x0" ]; then
           50 + # nutch finished crawling successfully.
           51 +
           52 + # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
           53 + # If not, create file UNFINISHED to indicate a more thorough crawl needed
           54 + tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
           55 + result=$?
           56 + if [ "x$result" != "x0" ]; then
           57 + echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
           58 + echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
           59 + echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
           60 + echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
           61 + echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
           62 + echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
           63 + echo "3. Adjust # crawl iterations in old crawl command:\n$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
           64 + fi
           65 +
           66 + # outputDir/$crawlId should not yet exist
           67 + ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
           68 + ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
           69 + cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
           70 + else
           71 + # appending to log.out
           72 + echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
           73 + fi
    39     74
    40     75   }
     …      …
    43     78   # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    44     79   for siteDir in $sitesDir/*/; do
    45        - #echo "$siteDir"
           80 +
    46     81   # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
    47     82   # Remove the $sitesDir prefix of to_crawl/sites followed by /,
     …      …
    50     85   crawlId=${crawlId%/}
    51     86
    52        - #echo "crawlId: $crawlId"
    53        - prepareSite $siteDir $crawlId
           87 + echo "Processing crawlId: $crawlId"
           88 +
           89 + if [ -d "$outputDir/$crawlId" ]; then
           90 + # Skip site already processed. *Append* this msg to log.out
           91 + echo "" 2>&1 | tee -a ${siteDir}log.out
           92 + echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
           93 + echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out
           94 +
           95 + else
           96 + crawlSite $siteDir $crawlId
           97 +
           98 + fi
           99 + echo "--------------------------------------------------"
          100 +
    54    101   break
    55    102   done
Note:
See TracChangeset
for help on using the changeset viewer.