Changeset 33564 for gs3-extensions


Timestamp:
2019-10-14T21:01:17+13:00
Author:
ak19
Message:

batchcrawl.sh now runs the crawl and logs the crawl's output, dumps the text and stats resulting from the crawl into an output folder, and creates an UNFINISHED file with instructions and the old crawl command if the crawl did not terminate within the specified number of iterations. At present there is still a break statement that stops after the first site has been processed.
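
To make the message concrete, here is an illustrative sketch (not part of the changeset) of what one pass over a site produces, assuming the directory conventions visible in the diff below; the site id 00001 is hypothetical:

    # Run the batch crawler from the directory containing to_crawl/ and
    # apache-nutch-2.3.1/ (the script uses relative paths to both):
    ./batchcrawl.sh
    #
    # For a site folder to_crawl/sites/00001/ (holding seedURLs.txt), one pass yields:
    #   to_crawl/sites/00001/log.out      - full crawl output, captured via tee
    #   to_crawl/sites/00001/UNFINISHED   - written only if $CRAWL_ITERATIONS ran out
    #   crawled/00001/stats               - output of nutch readdb -stats
    #   crawled/00001/dump.txt            - text dump of the crawled pages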

File:
1 edited

Legend:

  ' ' Unmodified
  '+' Added
  '-' Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

--- gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh (r33563)
+++ gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh (r33564)
@@ -1,24 +1,26 @@
 #!/bin/bash
-echo "Hello world!"
 
 sitesDir=to_crawl/sites
-echo "SITESDIR: $sitesDir"
+echo "SITES DIR (INPUT): $sitesDir"
+outputDir=crawled
+mkdir -p $outputDir
+echo "OUTPUT DIR: $outputDir"
+
 
 NUTCH_HOME=apache-nutch-2.3.1
-NUTCH_CONF_DIR=$NUTCH_HOME/conf
+NUTCH_CONF_DIR=$NUTCH_HOME/runtime/local/conf
 NUTCH_URLFILTER_TEMPLATE=$NUTCH_CONF_DIR/regex-urlfilter.GS_TEMPLATE
 NUTCH_URLFILTER_FILE=$NUTCH_CONF_DIR/regex-urlfilter.txt
 
 CRAWL_COMMAND=$NUTCH_HOME/runtime/local/bin/crawl
+NUTCH_COMMAND=$NUTCH_HOME/runtime/local/bin/nutch
 
 CRAWL_ITERATIONS=10
 
-function prepareSite() {
+function crawlSite() {
     siteDir=$1
     crawlId=$2
 
-    #echo "processing site $siteDir"
-
-    #echo "processing site $siteDir with crawlId: $crawlId"
+    echo "processing site $siteDir with crawlId: $crawlId"
 
     echo "Copying over template $NUTCH_URLFILTER_TEMPLATE to live version of file"
@@ -32,9 +34,42 @@
 
     # $siteDir parameter is the folder containing seedURLs.txt
+    # https://stackoverflow.com/questions/418896/how-to-redirect-output-to-a-file-and-stdout
     crawl_cmd="./$CRAWL_COMMAND $siteDir $crawlId $CRAWL_ITERATIONS"
 
-    echo "Going to run nutch crawl command:"
-    echo "  $crawl_cmd"
+    # Since we're going to crawl from scratch, create log.out file
+    echo "Going to run nutch crawl command (and copy output to ${siteDir}log.out):" 2>&1 | tee ${siteDir}log.out
+    # append to log.out file hereafter
+    echo "  $crawl_cmd" 2>&1 | tee -a ${siteDir}log.out
+    echo "--------------------------------------------------" 2>&1 | tee -a ${siteDir}log.out
 
+    # append output of $crawl_cmd to log.out
+    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
+    result=$?
+
+    if [ "x$result" = "x0" ]; then
+    # nutch finished crawling successfully.
+
+    # But check if the site was crawled thoroughly within $CRAWL_ITERATIONS
+    # If not, create file UNFINISHED to indicate a more thorough crawl needed
+    tail -10 ${siteDir}log.out | grep "no more URLs to fetch now" > /dev/null
+    result=$?
+    if [ "x$result" != "x0" ]; then
+        echo "A crawl of $CRAWL_ITERATIONS iterations was insufficient for crawlId $crawlId" 2>&1 | tee ${siteDir}UNFINISHED
+        echo "" 2>&1 | tee -a ${siteDir}UNFINISHED
+        echo "To re-run crawl of site with crawlId $crawlId with a larger number of iterations:" 2>&1 | tee -a ${siteDir}UNFINISHED
+        echo "1. delete $outputDir/$crawlId" 2>&1 | tee -a ${siteDir}UNFINISHED
+        echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED
+        echo "   cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED
+        echo "3. Adjust # crawl iterations in old crawl command:\n$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED
+    fi
+
+    # outputDir/$crawlId should not yet exist
+        ./$NUTCH_COMMAND readdb -dump $outputDir/$crawlId -text -crawlId $crawlId
+        ./$NUTCH_COMMAND readdb -stats -crawlId $crawlId > $outputDir/$crawlId/stats
+        cat $outputDir/$crawlId/part-r-00000 > $outputDir/$crawlId/dump.txt
+    else
+    # appending to log.out
+        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
+    fi
 
 }
@@ -43,5 +78,5 @@
 # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
 for siteDir in $sitesDir/*/; do
-    #echo "$siteDir"
+
     # to get crawl_id like 00001 from $siteDir like to_crawl/sites/00001/
     # Remove the $sitesDir prefix of to_crawl/sites followed by /,
@@ -50,6 +85,18 @@
     crawlId=${crawlId%/}
 
-    #echo "crawlId: $crawlId"
-    prepareSite $siteDir $crawlId
+    echo "Processing crawlId: $crawlId"
+
+    if [ -d "$outputDir/$crawlId" ]; then
+    # Skip site already processed. *Append* this msg to log.out
+    echo "" 2>&1 | tee -a ${siteDir}log.out
+    echo "**** $siteDir already processed. Skipping...." 2>&1 | tee -a ${siteDir}log.out
+    echo "Delete $outputDir/$crawlId if you want to reprocess it." 2>&1 | tee -a ${siteDir}log.out
+
+    else
+    crawlSite $siteDir $crawlId
+
+    fi
+    echo "--------------------------------------------------"
+
     break
 done
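
A note on the exit-status checks in the new code: in bash, $? after a pipeline such as the tee'd crawl invocation reports the exit status of the last command in the pipeline (tee), not of $crawl_cmd itself. A minimal sketch of how the crawl's own status could be captured instead, offered as an illustration rather than as part of this changeset:

    # ${PIPESTATUS[0]} is the exit status of the first command in the most
    # recent pipeline, i.e. of $crawl_cmd rather than of tee.
    $crawl_cmd 2>&1 | tee -a ${siteDir}log.out
    result=${PIPESTATUS[0]}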