Changeset 33573

Show
Ignore:
Timestamp:
16.10.2019 21:39:56 (4 weeks ago)
Author:
ak19
Message:

Forgot to document that spaces are also allowed as a separator in the list of crawl-site ids passed to the batchcrawl.sh script

Location:
gs3-extensions/maori-lang-detection
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33571 r33573  
    165165    echo "  $0 -all|<ids>" 
    166166    echo "    where an id is a folder name in to_crawl/sites" 
    167     echo "    and ids can be a comma separated list of" 
     167    echo "    and ids can be a comma or space separated list of" 
    168168    echo "    individual ids and/or ranges" 
    169169    echo "  Examples:" 
    170     echo "    $0 00008-00022,00025,00026,00028-00034" 
     170    echo "    $0 00008-00022,00025,00026,00028-00034 00050" 
    171171    echo "    $0 -all" 
    172172    echo "    $0 00312" 
     
    189189    fi 
    190190     
    191     # works - split args on comma 
     191    # works - split args on comma or space 
    192192    # https://stackoverflow.com/questions/918886/how-do-i-split-a-string-on-a-delimiter-in-bash 
    193193    IFS=', ' read -ra IDS <<< "$args" 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33569 r33573  
    761761        // Finally, we can process this WETFile's records into the keep and discard pile 
    762762        wetFileCount++; 
    763         logger.debug("Off to process " + WETFile); 
     763        debug("Off to process " + WETFile); 
    764764        String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files 
    765765        crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##      
     
    794794    public static void error(String msg, Exception e) { 
    795795    logger.error(msg, e); 
    796     System.err.println(msg); 
     796    System.err.println("\n"+msg); 
    797797    e.printStackTrace(); 
    798798    } 
     
    810810    public boolean accept(File dir, String name) { 
    811811        if(name.endsWith(".warc.wet")) { 
    812         logger.debug("Will include " + name + " for processing."); 
     812        debug("Will include " + name + " for processing."); 
    813813        return true; 
    814814        } 
     
    818818        File unzippedVersion = new File(dir, nameWithoutGZext); 
    819819        if(unzippedVersion.exists()) { 
    820             logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 
    821             logger.debug("Skipping " + name); 
     820            debug("--- Unzipped version " + unzippedVersion + " exists."); 
     821            debug("Skipping " + name); 
    822822            return false; // don't count gzipped version if unzipped version exists. 
    823823        } 
    824824        else { 
    825             logger.debug("Only zipped version " + name + " exists."); 
     825            debug("Only zipped version " + name + " exists."); 
    826826            return true; // No unzipped version, so have to work with gzipped version 
    827827        } 
     
    829829 
    830830        // we're not even interested in any other file extensions 
    831         logger.debug("Not a WET file. Skipping " + name); 
     831        debug("Not a WET file. Skipping " + name); 
    832832        return false; 
    833833    } 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33552 r33573  
    7777 
    7878    String fileID = inFile.getName(); 
    79     //System.err.println("*** Processing wetfile: " + fileID);           
     79    //debug("*** Processing wetfile: " + fileID);            
    8080    fileID = fileID.substring(fileID.lastIndexOf("0")+1); 
    8181    if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet 
     
    195195                  String recordURI, String record) 
    196196    { 
    197     System.err.println("CrawlID: CC-MAIN-" + this.crawlID 
     197    info("CrawlID: CC-MAIN-" + this.crawlID 
    198198               + " WET #" + this.WETFileID 
    199199               + " record #" + recordID 
    200200               + " - contentLength: " + contentLength 
    201201               + " - lineCount: " + lineCount); 
    202     System.err.println("URI: " + recordURI); 
    203     //System.err.println(record); 
    204     //System.err.println("--------------------------"); 
     202    info("URI: " + recordURI); 
     203    //debug(record); 
     204    //info("--------------------------"); 
    205205 
    206206    File parentFolder = null; 
     
    215215        else if(batchProcessor.isGreylisted(recordURI)) { 
    216216        parentFolder = batchProcessor.greyListedFolder; 
    217         System.err.println("@@@GREYLISTED"); 
     217        debug("@@@GREYLISTED"); 
    218218        } 
    219219        else { // url was only blacklisted 
    220220        parentFolder = batchProcessor.discardFolder; 
    221         System.err.println("@@@DISCARDING - blacklisted"); 
     221        debug("@@@DISCARDING - blacklisted"); 
    222222        } 
    223223    } 
     
    229229        else { 
    230230        parentFolder = batchProcessor.greyListedFolder; 
    231         System.err.println("@@@GREYLISTED"); 
     231        debug("@@@GREYLISTED"); 
    232232        } 
    233233    } 
     
    274274        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 
    275275        parentFolder = batchProcessor.discardFolder; 
    276         System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 
     276        debug("@@@DISCARDING - CAMELCASED CONTENTS"); 
    277277        } 
    278278        else*/ 
     
    282282        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
    283283        parentFolder = batchProcessor.keepFolder; 
    284         System.err.println("@@@KEEPING"); 
     284        debug("@@@KEEPING"); 
    285285        } 
    286286    } 
     
    289289    if(parentFolder == null) { 
    290290        parentFolder = batchProcessor.discardFolder; 
    291         System.err.println("@@@DISCARDING"); 
     291        debug("@@@DISCARDING"); 
    292292    } 
    293293 
     
    301301        } 
    302302    } catch(Exception e) { 
    303         System.err.println("Unable to write URL"); 
     303        debug("Unable to write URL"); 
    304304        e.printStackTrace(); 
    305305    } 
    306306    
    307     System.err.println("--------------------------"); 
     307    debug("--------------------------"); 
    308308 
    309309    // outFilename will look something like YYYY-##-#### 
     
    319319    } catch(IOException ioe) { 
    320320        ioe.printStackTrace(); 
    321         System.err.println("\n@@@@@@@@@ Error writing to file " + outFile); 
    322     } 
     321        error("@@@@@@@@@ Error writing to file " + outFile, ioe); 
     322    } 
     323    } 
     324 
     325 
     326    public void info(String msg) { 
     327    System.err.println(msg); 
     328    logger.info(msg); 
     329    } 
     330    public void debug(String msg) { 
     331    System.err.println(msg); 
     332    logger.debug(msg); 
     333    } 
     334    public void warn(String msg) { 
     335    System.err.println(msg); 
     336    logger.warn(msg); 
     337    } 
     338    public void error(String msg) { 
     339    System.err.println(msg); 
     340    logger.error(msg); 
     341    } 
     342    public void error(String msg, Exception e) { 
     343    logger.error(msg, e); 
     344    System.err.println("\n"+msg); 
     345    e.printStackTrace(); 
    323346    } 
    324347}