Changeset 33564

Timestamp:
14.10.2019 21:01:17
Author:
ak19
Message:

batchcrawl.sh now performs the crawl and logs the crawl's output, dumps the text and stats resulting from the crawl into an output folder, and creates an UNFINISHED file with instructions and the old crawl command if the crawl did not terminate within the specified number of iterations. At present there is still a break statement to stop after the first site has been processed.
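A minimal sketch of the expected layout and invocation (the 00001/00002 folder names are illustrative; the to_crawl/sites and crawled/ paths and the per-site files come from the script in the diff below):

    # one folder per site under to_crawl/sites, named by crawlId:
    #   to_crawl/sites/00001/seedURLs.txt
    #   to_crawl/sites/00002/seedURLs.txt
    ./batchcrawl.sh
    # per-site crawl output is teed to to_crawl/sites/<crawlId>/log.out;
    # dump.txt and stats land in crawled/<crawlId>/; an UNFINISHED file is
    # written into the site folder if $CRAWL_ITERATIONS was not enough.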

Files:
1 modified

Legend:

  + added line
  - removed line
  all other lines are unchanged context; … marks lines elided from the diff
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33563 → r33564

     #!/bin/bash
    -echo "Hello world!"
     
     sitesDir=to_crawl/sites
    -echo "SITESDIR: $sitesDir"
    +echo "SITES DIR (INPUT): $sitesDir"
    +outputDir=crawled
    +mkdir -p $outputDir
    +echo "OUTPUT DIR: $outputDir"
    +
     
     NUTCH_HOME=apache-nutch-2.3.1
    -NUTCH_CONF_DIR=$NUTCH_HOME/conf
    +NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
     NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
     NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt
     
     CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
    +NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch
     
     CRAWL_ITERATIONS=10
     
    -function prepareSite() {
    +function crawlSite() {
         siteDir=$1
         crawlId=$2
     
    -    #echo "processing site $siteDir"
    -
    -    #echo "processing site $siteDir with crawlId: $crawlId"
    +    echo "processing site $siteDir with crawlId: $crawlId"
     
         echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
    …
     
         # $siteDir parameter is the folder containing seedURLs.txt
    +    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
         crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
     
    -    echo "Going to run nutch crawl command:"
    -    echo "  $crawl_cmd"
    +    # Since we're going to crawl from scratch, create log.out file
    +    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
    +    # append to log.out file hereafter
    +    echo "  $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
    +    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out
     
    +    # append output of $crawl_cmd to log.out
    +    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    +    result=$?
    +
    +    if [ "x$result" = "x0" ]; then
    +    # nutch finished crawling successfully.
    +
    +    # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
    +    # If not, create file UNFINISHED to indicate a more thorough crawl needed
    +    tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
    +    result=$?
    +    if [ "x$result" != "x0" ]; then
    +        echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
    +        echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
    +        echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
    +        echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
    +        echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
    +        echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
    +        echo "3. Adjust # crawl iterations in old crawl command:\n$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
    +    fi
    +
    +    # outputDir/$crawlId should not yet exist
    +        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
    +        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
    +        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
    +    else
    +    # appending to log.out
    +        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    +    fi
     
     }
    …
     # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
     for siteDir in $sitesDir/*/; do
    -    #echo "$siteDir"
    +
         # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
         # Remove the $sitesDir prefix of to_crawl/sites followed by /,
    …
         crawlId=${crawlId%/}
     
    -    #echo "crawlId: $crawlId"
    -    prepareSite $siteDir $crawlId
    +    echo "Processing crawlId: $crawlId"
    +
    +    if [ -d "$outputDir/$crawlId" ]; then
    +    # Skip site already processed. *Append* this msg to log.out
    +    echo "" 2>&1 | tee -a ${siteDir}log.out
    +    echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
    +    echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out
    +
    +    else
    +    crawlSite $siteDir $crawlId
    +
    +    fi
    +    echo "--------------------------------------------------"
    +
         break
     done
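A side note on the status check above: because $crawl_cmd is piped through tee, result=$? picks up tee's exit status rather than the crawl's. A minimal sketch of one way to capture the crawl's own status (an alternative, not what this changeset commits):

    # bash keeps the exit status of each pipeline stage in PIPESTATUS
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    result=${PIPESTATUS[0]}    # exit status of $crawl_cmd, not of tee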