Ignore:
Timestamp:
2019-10-16T21:39:56+13:00 (5 years ago)
Author:
ak19
Message:

Forgot to document that spaces were also allowed as a separator in the input of crawl-site IDs to the batchcrawl.sh script

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33552 r33573  
    7777
    7878    String fileID = inFile.getName();
    79     //System.err.println("*** Processing wetfile: " + fileID);         
     79    //debug("*** Processing wetfile: " + fileID);           
    8080    fileID = fileID.substring(fileID.lastIndexOf("0")+1);
    8181    if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
     
    195195                  String recordURI, String record)
    196196    {
    197     System.err.println("CrawlID: CC-MAIN-" + this.crawlID
     197    info("CrawlID: CC-MAIN-" + this.crawlID
    198198               + " WET #" + this.WETFileID
    199199               + " record #" + recordID
    200200               + " - contentLength: " + contentLength
    201201               + " - lineCount: " + lineCount);
    202     System.err.println("URI: " + recordURI);
    203     //System.err.println(record);
    204     //System.err.println("--------------------------");
     202    info("URI: " + recordURI);
     203    //debug(record);
     204    //info("--------------------------");
    205205
    206206    File parentFolder = null;
     
    215215        else if(batchProcessor.isGreylisted(recordURI)) {
    216216        parentFolder = batchProcessor.greyListedFolder;
    217         System.err.println("@@@GREYLISTED");
     217        debug("@@@GREYLISTED");
    218218        }
    219219        else { // url was only blacklisted
    220220        parentFolder = batchProcessor.discardFolder;
    221         System.err.println("@@@DISCARDING - blacklisted");
     221        debug("@@@DISCARDING - blacklisted");
    222222        }
    223223    }
     
    229229        else {
    230230        parentFolder = batchProcessor.greyListedFolder;
    231         System.err.println("@@@GREYLISTED");
     231        debug("@@@GREYLISTED");
    232232        }
    233233    }
     
    274274        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
    275275        parentFolder = batchProcessor.discardFolder;
    276         System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
     276        debug("@@@DISCARDING - CAMELCASED CONTENTS");
    277277        }
    278278        else*/
     
    282282        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
    283283        parentFolder = batchProcessor.keepFolder;
    284         System.err.println("@@@KEEPING");
     284        debug("@@@KEEPING");
    285285        }
    286286    }
     
    289289    if(parentFolder == null) {
    290290        parentFolder = batchProcessor.discardFolder;
    291         System.err.println("@@@DISCARDING");
     291        debug("@@@DISCARDING");
    292292    }
    293293
     
    301301        }
    302302    } catch(Exception e) {
    303         System.err.println("Unable to write URL");
     303        debug("Unable to write URL");
    304304        e.printStackTrace();
    305305    }
    306306   
    307     System.err.println("--------------------------");
     307    debug("--------------------------");
    308308
    309309    // outFilename will look something like YYYY-##-####
     
    319319    } catch(IOException ioe) {
    320320        ioe.printStackTrace();
    321         System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
    322     }
     321        error("@@@@@@@@@ Error writing to file " + outFile, ioe);
     322    }
     323    }
     324
     325
     326    public void info(String msg) {
     327    System.err.println(msg);
     328    logger.info(msg);
     329    }
     330    public void debug(String msg) {
     331    System.err.println(msg);
     332    logger.debug(msg);
     333    }
     334    public void warn(String msg) {
     335    System.err.println(msg);
     336    logger.warn(msg);
     337    }
     338    public void error(String msg) {
     339    System.err.println(msg);
     340    logger.error(msg);
     341    }
     342    public void error(String msg, Exception e) {
     343    logger.error(msg, e);
     344    System.err.println("\n"+msg);
     345    e.printStackTrace();
    323346    }
    324347}
Note: See TracChangeset for help on using the changeset viewer.