Changeset 33573 for gs3-extensions


Ignore:
Timestamp:
2019-10-16T21:39:56+13:00 (5 years ago)
Author:
ak19
Message:

Forgot to document that spaces are also allowed as a separator in the list of crawl-site ids passed to the batchcrawl.sh script.

Location:
gs3-extensions/maori-lang-detection
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33571 r33573  
    165165    echo "  $0 -all|<ids>"
    166166    echo "    where an id is a folder name in to_crawl/sites"
    167     echo "    and ids can be a comma separated list of"
     167    echo "    and ids can be a comma or space separated list of"
    168168    echo "    individual ids and/or ranges"
    169169    echo "  Examples:"
    170     echo "    $0 00008-00022,00025,00026,00028-00034"
     170    echo "    $0 00008-00022,00025,00026,00028-00034 00050"
    171171    echo "    $0 -all"
    172172    echo "    $0 00312"
     
    189189    fi
    190190   
    191     # works - split args on comma
     191    # works - split args on comma or space
    192192    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash
    193193    IFS=', ' read -ra IDS <<< "$args"
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33569 r33573  
    761761        // Finally, we can process this WETFile's records into the keep and discard pile
    762762        wetFileCount++;
    763         logger.debug("Off to process " + WETFile);
     763        debug("Off to process " + WETFile);
    764764        String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
    765765        crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##     
     
    794794    public static void error(String msg, Exception e) {
    795795    logger.error(msg, e);
    796     System.err.println(msg);
     796    System.err.println("\n"+msg);
    797797    e.printStackTrace();
    798798    }
     
    810810    public boolean accept(File dir, String name) {
    811811        if(name.endsWith(".warc.wet")) {
    812         logger.debug("Will include " + name + " for processing.");
     812        debug("Will include " + name + " for processing.");
    813813        return true;
    814814        }
     
    818818        File unzippedVersion = new File(dir, nameWithoutGZext);
    819819        if(unzippedVersion.exists()) {
    820             logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
    821             logger.debug("Skipping " + name);
     820            debug("--- Unzipped version " + unzippedVersion + " exists.");
     821            debug("Skipping " + name);
    822822            return false; // don't count gzipped version if unzipped version exists.
    823823        }
    824824        else {
    825             logger.debug("Only zipped version " + name + " exists.");
     825            debug("Only zipped version " + name + " exists.");
    826826            return true; // No unzipped version, so have to work with gzipped version
    827827        }
     
    829829
    830830        // we're not even interested in any other file extensions
    831         logger.debug("Not a WET file. Skipping " + name);
     831        debug("Not a WET file. Skipping " + name);
    832832        return false;
    833833    }
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33552 r33573  
    7777
    7878    String fileID = inFile.getName();
    79     //System.err.println("*** Processing wetfile: " + fileID);         
     79    //debug("*** Processing wetfile: " + fileID);           
    8080    fileID = fileID.substring(fileID.lastIndexOf("0")+1);
    8181    if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
     
    195195                  String recordURI, String record)
    196196    {
    197     System.err.println("CrawlID: CC-MAIN-" + this.crawlID
     197    info("CrawlID: CC-MAIN-" + this.crawlID
    198198               + " WET #" + this.WETFileID
    199199               + " record #" + recordID
    200200               + " - contentLength: " + contentLength
    201201               + " - lineCount: " + lineCount);
    202     System.err.println("URI: " + recordURI);
    203     //System.err.println(record);
    204     //System.err.println("--------------------------");
     202    info("URI: " + recordURI);
     203    //debug(record);
     204    //info("--------------------------");
    205205
    206206    File parentFolder = null;
     
    215215        else if(batchProcessor.isGreylisted(recordURI)) {
    216216        parentFolder = batchProcessor.greyListedFolder;
    217         System.err.println("@@@GREYLISTED");
     217        debug("@@@GREYLISTED");
    218218        }
    219219        else { // url was only blacklisted
    220220        parentFolder = batchProcessor.discardFolder;
    221         System.err.println("@@@DISCARDING - blacklisted");
     221        debug("@@@DISCARDING - blacklisted");
    222222        }
    223223    }
     
    229229        else {
    230230        parentFolder = batchProcessor.greyListedFolder;
    231         System.err.println("@@@GREYLISTED");
     231        debug("@@@GREYLISTED");
    232232        }
    233233    }
     
    274274        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
    275275        parentFolder = batchProcessor.discardFolder;
    276         System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
     276        debug("@@@DISCARDING - CAMELCASED CONTENTS");
    277277        }
    278278        else*/
     
    282282        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
    283283        parentFolder = batchProcessor.keepFolder;
    284         System.err.println("@@@KEEPING");
     284        debug("@@@KEEPING");
    285285        }
    286286    }
     
    289289    if(parentFolder == null) {
    290290        parentFolder = batchProcessor.discardFolder;
    291         System.err.println("@@@DISCARDING");
     291        debug("@@@DISCARDING");
    292292    }
    293293
     
    301301        }
    302302    } catch(Exception e) {
    303         System.err.println("Unable to write URL");
     303        debug("Unable to write URL");
    304304        e.printStackTrace();
    305305    }
    306306   
    307     System.err.println("--------------------------");
     307    debug("--------------------------");
    308308
    309309    // outFilename will look something like YYYY-##-####
     
    319319    } catch(IOException ioe) {
    320320        ioe.printStackTrace();
    321         System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
    322     }
     321        error("@@@@@@@@@ Error writing to file " + outFile, ioe);
     322    }
     323    }
     324
     325
     326    public void info(String msg) {
     327    System.err.println(msg);
     328    logger.info(msg);
     329    }
     330    public void debug(String msg) {
     331    System.err.println(msg);
     332    logger.debug(msg);
     333    }
     334    public void warn(String msg) {
     335    System.err.println(msg);
     336    logger.warn(msg);
     337    }
     338    public void error(String msg) {
     339    System.err.println(msg);
     340    logger.error(msg);
     341    }
     342    public void error(String msg, Exception e) {
     343    logger.error(msg, e);
     344    System.err.println("\n"+msg);
     345    e.printStackTrace();
    323346    }
    324347}
Note: See TracChangeset for help on using the changeset viewer.