Changeset 33615 for gs3-extensions


Ignore:
Timestamp:
2019-10-31T20:03:55+13:00 (4 years ago)
Author:
ak19
Message:
  1. Worked out how to configure log4j to log to both the console and a logfile, so modified the existing laboured code to use this better approach. 2. Added some MongoDB links under MoreReading.
Location:
gs3-extensions/maori-lang-detection
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33603 r33615  
    332332https://ipinfo.info/html/ip_checker.php
    333333
     334
     335
     336----------
     337MongoDB
     338Installation:
     339    https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
     340        https://docs.mongodb.com/manual/administration/install-on-linux/
     341    https://hevodata.com/blog/install-mongodb-on-ubuntu/
     342    https://www.digitalocean.com/community/tutorials/how-to-install-mongodb-on-ubuntu-16-04
     343    CENTOS (Analytics): https://tecadmin.net/install-mongodb-on-centos/
     344    FROM SOURCE: https://github.com/mongodb/mongo/wiki/Build-Mongodb-From-Source
     345GUI:
     346    https://robomongo.org/
     347    Robomongo is Robo 3T now
     348
     349https://www.tutorialspoint.com/mongodb/mongodb_java.htm
     350JAR FILE:
     351    http://central.maven.org/maven2/org/mongodb/mongo-java-driver/
     352    https://mongodb.github.io/mongo-java-driver/
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33480 r33615  
    2121WETprocessor.min.num.words=20
    2222WETprocessor.max.words.camelcase=10
     23
     24
     25mongodb.user=admin
     26mongodb.pwd=pinky
     27#CHANGEME
     28
  • gs3-extensions/maori-lang-detection/conf/log4j.properties

    r33400 r33615  
    1212
    1313# Let's log everything from the most basic DEBUG msgs on (so INFO, WARN, ERROR too)
    14 log4j.rootCategory=DEBUG, mldlog
     14# Also want to log to the console, see
     15# https://stackoverflow.com/questions/3382985/how-to-make-log4j-to-write-to-the-console-as-well
    1516
    16 # copied from GS3
    17 log4j.category.web.servlets.DirectInput=INFO, mldlog
    18 # Prevent logging duplicating its messages into rootLogger:
    19 # https://stackoverflow.com/questions/9208710/java-log4j-log-to-both-category-and-root-logger
    20 log4j.additivity.web.servlets.DirectInput=false
     17log4j.rootCategory=DEBUG, console, mldlog
     18
     19# Define Console Appender
     20log4j.appender.console=org.apache.log4j.ConsoleAppender
     21# Define the layout for console appender. If you do not define it, you will get an error
     22log4j.appender.console.layout=org.apache.log4j.PatternLayout
     23log4j.appender.console.Target=System.err
    2124
    2225# We're saying our log "mldlog" is some file that gets appended to,
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33604 r33615  
    150150       
    151151    } catch(Exception e) {
    152         error("Exception attempting to read properties from config.properties.", e);
     152        logger.error("Exception attempting to read properties from config.properties.", e);
    153153    }
    154154
    155155    if(configProperties.size() == 0) {
    156         warn("*** Warning: no values read into config properties. Using defaults.");
     156        logger.warn("*** Warning: no values read into config properties. Using defaults.");
    157157    }
    158158   
     
    190190
    191191    // prepare our blacklist, greylist (for inspection) and whitelist
    192     info("Loading blacklist.");
     192    logger.info("Loading blacklist.");
    193193    blackList = new HashMap<String, Integer>();
    194194    initURLFilterList(blackList, "url-blacklist-filter.txt");
    195195   
    196     info("Loading greylist.");
     196    logger.info("Loading greylist.");
    197197    greyList = new HashMap<String, Integer>();
    198198    initURLFilterList(greyList, "url-greylist-filter.txt");
    199199   
    200     info("Loading whitelist.");
     200    logger.info("Loading whitelist.");
    201201    whiteList = new HashMap<String, Integer>();
    202202    initURLFilterList(whiteList, "url-whitelist-filter.txt");
    203203
    204204    // Create the map of topSites
    205     info("Loading map of topsites with regex of allowable url patterns for each topsite.");
     205    logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
    206206    topSitesMap = new HashMap<String, String>();
    207207   
     
    226226        topSitesMap.put(topsite, allowed_url_pattern);
    227227
    228         //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
     228        //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
    229229       
    230230        }
    231231    } catch(Exception e) {
    232         error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
     232        logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
    233233    }   
    234234 
    235     //debug("Prematurely terminating for testing purposes.");
     235    //logger.debug("Prematurely terminating for testing purposes.");
    236236    //System.exit(-1);
    237237    }
     
    323323        // for later manual inspection
    324324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
     325            /*
    325326            if(!possibleProductDomains.contains(domainWithProtocol)) {
    326327
     
    333334                countryCode = ""; // forces domain to be included for inspection
    334335               
    335                 error("Could not check if domain " + domainWithProtocol
     336                logger.error("Could not check if domain " + domainWithProtocol
    336337                  + " was in country: " + countryCode,
    337338                  exceptObj);
     
    348349                possibleProductSitesWriter.write("\t" + url + "\n");
    349350            }
    350             } /*else {
     351            }*/ /*else {
    351352            // already wrote out domain to file at some point, write just the URL out to file
    352353            possibleProductSitesWriter.write("\t" + url + "\n");
     
    355356        }
    356357    } catch (IOException ioe) {
    357         error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
     358        logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
    358359    }
    359360
     
    393394    String value = topSitesMap.get("wikipedia.org");
    394395    if(value == null) {
    395         debug("### wikipedia.org had null value");
     396        logger.debug("### wikipedia.org had null value");
    396397    } else {
    397         debug("### wikipedia.org had value: " + value);
     398        logger.debug("### wikipedia.org had value: " + value);
    398399    } // DEBUG
    399400        */
     
    407408
    408409        /*if(domain.contains("docs.google.com")) {
    409             debug("domain with protocol: " + domainWithProtocol);
    410             debug("domain: " + domain);
     410            logger.debug("domain with protocol: " + domainWithProtocol);
     411            logger.debug("domain: " + domain);
    411412            }*/
    412413       
     
    531532           
    532533        } catch (IOException ioe) {
    533             error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
     534            logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
    534535        }
    535536       
     
    537538
    538539    } catch (IOException ioe) {
    539         error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
     540        logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
    540541                   + "\n\t" + urlFilterFile
    541542                   + "\n\t" + domainURLsFile
     
    545546    /*
    546547    // BEGIN DEBUG
    547     debug("@@@@ TopSitesMap contains: ");
     548    logger.debug("@@@@ TopSitesMap contains: ");
    548549    for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
    549550        String topSite = entry.getKey();
    550551        String urlPattern = entry.getValue();       
    551         debug(topSite + " - " + urlPattern);
     552        logger.debug(topSite + " - " + urlPattern);
    552553    } // END DEBUG
    553554    */
     
    620621        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
    621622        // there's an entry for the URL in the topSitesMap
    622         debug("##### A top site matches URL domain " + domain);
     623        logger.debug("##### A top site matches URL domain " + domain);
    623624
    624625        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
     
    687688    String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
    688689    if(!isBlackListed && url.contains("jasmin")) {
    689         warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
     690        logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
    690691        blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
    691692    }
     
    722723    // if filterListFilename does not exist in the conf folder, just return
    723724    if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
    724         warn("Filter list filename: " + filterListFilename + " does not exist");
     725        logger.warn("Filter list filename: " + filterListFilename + " does not exist");
    725726        return;     
    726727    }
     
    744745            filter = filter.substring(1);
    745746            list.put(filter, LIST_ENTRY_STARTSWITH);
    746             //debug("Match filter startswith: " + filter);
     747            //logger.debug("Match filter startswith: " + filter);
    747748        }
    748749        else if(filter.endsWith("$")) {
    749750            filter = filter.substring(0, filter.length()-1);
    750751            list.put(filter, LIST_ENTRY_ENDSWITH);
    751             //debug("@@@ Match filter endswith: " + filter);
     752            //logger.debug("@@@ Match filter endswith: " + filter);
    752753        }
    753754        else {
    754755            list.put(filter, LIST_ENTRY_CONTAINS);
    755756        }
    756         //debug("Got filter: " + filter);
     757        //logger.debug("Got filter: " + filter);
    757758        }
    758759       
    759760    } catch (IOException ioe) {
    760         error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
     761        logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
    761762    }
    762763   
     
    781782    for(int i = 0; i < WETFiles.length; i++) {
    782783        File WETFile = WETFiles[i];     
    783         debug("Processing WETfile: " + WETFile);
     784        logger.debug("Processing WETfile: " + WETFile);
    784785
    785786        // Any .gz files listed means they haven't been unzipped yet. So unzip.
     
    797798
    798799        if(!WETFile.exists() || !WETFile.isFile()) {       
    799         error("Error: " + WETFile + " does not exist (failure to unzip?)");
     800        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    800801        return;
    801802        }
     
    803804        // Finally, we can process this WETFile's records into the keep and discard pile
    804805        wetFileCount++;
    805         debug("Off to process " + WETFile);
     806        logger.debug("Off to process " + WETFile);
    806807        String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
    807808        crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##     
     
    817818
    818819
    819     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
    820     public static void info(String msg) {
    821     System.err.println(msg);
    822     logger.info(msg);
    823     }
    824     public static void debug(String msg) {
    825     System.err.println(msg);
    826     logger.debug(msg);
    827     }
    828     public static void warn(String msg) {
    829     System.err.println(msg);
    830     logger.warn(msg);
    831     }
    832     public static void error(String msg) {
    833     System.err.println(msg);
    834     logger.error(msg);
    835     }
    836     public static void error(String msg, Exception e) {
    837     logger.error(msg, e);
    838     System.err.println("\n"+msg);
    839     e.printStackTrace();
    840     }
    841    
     820    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //   
    842821    public static void printUsage() {
    843     info("Run this program as:");
    844     info("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");   
     822    System.err.println("Run this program as:");
     823    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
    845824    }
    846825
     
    852831    public boolean accept(File dir, String name) {
    853832        if(name.endsWith(".warc.wet")) {
    854         debug("Will include " + name + " for processing.");
     833        logger.debug("Will include " + name + " for processing.");
    855834        return true;
    856835        }
     
    860839        File unzippedVersion = new File(dir, nameWithoutGZext);
    861840        if(unzippedVersion.exists()) {
    862             debug("--- Unzipped version " + unzippedVersion + " exists.");
    863             debug("Skipping " + name);
     841            logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
     842            logger.debug("Skipping " + name);
    864843            return false; // don't count gzipped version if unzipped version exists.
    865844        }
    866845        else {
    867             debug("Only zipped version " + name + " exists.");
     846            logger.debug("Only zipped version " + name + " exists.");
    868847            return true; // No unzipped version, so have to work with gzipped version
    869848        }
     
    871850
    872851        // we're not even interested in any other file extensions
    873         debug("Not a WET file. Skipping " + name);
     852        logger.debug("Not a WET file. Skipping " + name);
    874853        return false;
    875854    }
     
    887866        }
    888867        else {
    889         info("File " + f + " is not a directory");
     868        logger.info("File " + f + " is not a directory");
    890869        }
    891870        return false;                 
     
    901880    File commoncrawlDir = new File(args[0]);
    902881    if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
    903         error("Error: " + args[0] + " does not exist or is not a directory");
     882        logger.error("Error: " + args[0] + " does not exist or is not a directory");
    904883        return;
    905884    }
     
    907886    File outFolder = new File(args[1]);
    908887    if(!outFolder.exists() || !outFolder.isDirectory()) {
    909         error("Error: " + args[1] + " does not exist or is not a directory.");
     888        logger.error("Error: " + args[1] + " does not exist or is not a directory.");
    910889        return;
    911890    }   
     
    918897    for(int i = 0; i < ccrawlFolders.length; i++) {
    919898        File ccrawlFolder = ccrawlFolders[i];
    920         info("About to process commoncrawl WET files folder: " + ccrawlFolder);
     899        logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
    921900        ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);       
    922901    }
     
    933912    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
    934913
    935     info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
    936 
    937     info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
     914    logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
     915
     916    logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
    938917   
    939918   
    940919    } catch(Exception e) {
    941920    // can get an exception when instantiating CCWETProcessor instance
    942     error(e.getMessage(), e);
     921    logger.error(e.getMessage(), e);
    943922    }
    944923   
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

    r33608 r33615  
    1212package org.greenstone.atea;
    1313
    14 import java.io.*;
     14
    1515import opennlp.tools.langdetect.*;
    1616import opennlp.tools.sentdetect.*;
    1717import opennlp.tools.util.*;
    1818
     19import java.io.*;
    1920import java.util.ArrayList;
     21
     22//import org.apache.log4j.Logger;
     23
    2024
    2125/**
     
    2731 *
    2832 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
    29  *    maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
     33 *    maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
    3034 *
    3135 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
    3236 *
    33  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
    34  *
    35  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file &lt;full/path/to/textfile&gt;
    36  *
    37  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
     37 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
     38 *
     39 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file &lt;full/path/to/textfile&gt;
     40 *
     41 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
    3842 *       Press enter. This variant of the program expects text to stream in from standard input.
    3943 *       If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
     
    4347 */
    4448public class MaoriTextDetector extends TextLanguageDetector {
     49    //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName());
     50   
    4551    /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
    4652    public static final String MAORI_3LETTER_CODE = "mri";
     
    135141    }
    136142    public static void doPrintErr(boolean runSilent, String msg) {
    137     if(!runSilent) System.err.println(msg); 
     143    if(!runSilent) System.err.println(msg);
    138144    }
    139145   
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33602 r33615  
    4949*/
    5050public class NutchTextDumpProcessor {
    51     private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
     51    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
    5252
    5353    static boolean DEBUG_MODE = true;
     
    9191    if(DEBUG_MODE) {
    9292        // START DEBUG
    93         debug("__________________________________________");
    94         debug("@@@ Found page entry: ");
    95         debug("__________________________________________");
    96         debug(pageDump.toString());
    97         debug("------------------------------------------");
     93        logger.debug("__________________________________________");
     94        logger.debug("@@@ Found page entry: ");
     95        logger.debug("__________________________________________");
     96        logger.debug(pageDump.toString());
     97        logger.debug("------------------------------------------");
    9898        // END DEBUG
    9999    }
     
    174174       
    175175    } catch (IOException ioe) {
    176         error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
     176        logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
    177177    }
    178178
     
    302302   
    303303   
    304     info("------------- " + this.siteID + " SITE STATS -----------");
    305 
    306     info("SITE DOMAIN: " + this.domainOfSite);
    307     info("Total number of web pages in site: " + pages.size());
    308     info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
     304    logger.info("------------- " + this.siteID + " SITE STATS -----------");
     305
     306    logger.info("SITE DOMAIN: " + this.domainOfSite);
     307    logger.info("Total number of web pages in site: " + pages.size());
     308    logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
    309309   
    310310    if(pagesInMRI.size() > 0) {
    311         info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
     311        logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
    312312        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
    313         info(mriWebPageInfo.toString());
    314         }
    315     }
    316 
    317     info("                      -----------                   ");
     313        logger.info(mriWebPageInfo.toString());
     314        }
     315    }
     316
     317    logger.info("                      -----------                   ");
    318318    if(pagesContainingMRI.size() > 0) {     
    319         info("The following pages weren't detected as primarily being in Māori");
    320         info("But still contained sentences detected as Māori");
     319        logger.info("The following pages weren't detected as primarily being in Māori");
     320        logger.info("But still contained sentences detected as Māori");
    321321        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
    322         info(mriWebPageInfo.toString());
     322        logger.info(mriWebPageInfo.toString());
    323323        }
    324324       
    325325    } else {
    326         info("No further pages detected as containing any sentences in MRI");     
    327     }
    328     info("                      -----------                   ");
     326        logger.info("No further pages detected as containing any sentences in MRI");       
     327    }
     328    logger.info("                      -----------                   ");
    329329    }
    330330
     
    358358   
    359359    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
    360     public static void info(String msg) {
    361     System.err.println(msg);
    362     logger.info(msg);
    363     }
    364     public static void debug(String msg) {
    365     System.err.println(msg);
    366     logger.debug(msg);
    367     }
    368     public static void warn(String msg) {
    369     System.err.println(msg);
    370     logger.warn(msg);
    371     }
    372     public static void error(String msg) {
    373     System.err.println(msg);
    374     logger.error(msg);
    375     }
    376     public static void error(String msg, Exception e) {
    377     logger.error(msg, e);
    378     System.err.println("\n"+msg);
    379     e.printStackTrace();
    380     }
    381    
     360   
    382361    public static void printUsage() {
    383     info("Run this program as:");
    384     info("\tNutchTextDumpProcessor <path to 'crawled' folder>");
     362    System.err.println("Run this program as:");
     363    System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>");
    385364    }
    386365   
     
    393372    File sitesDir = new File(args[0]);
    394373    if(!sitesDir.exists() || !sitesDir.isDirectory()) {
    395         error("Error: " + args[0] + " does not exist or is not a directory");
     374        logger.error("Error: " + args[0] + " does not exist or is not a directory");
    396375        return;
    397376    }
     
    430409            File txtDumpFile = new File(siteDir, "dump.txt");
    431410            if(!txtDumpFile.exists()) {
    432             error("Text dump file " + txtDumpFile + " did not exist");
     411            logger.error("Text dump file " + txtDumpFile + " did not exist");
    433412            continue;
    434413            }
     
    439418            String siteID = siteDir.getName();
    440419            long lastModified = siteDir.lastModified();
    441             debug("Found siteID: " + siteID);           
     420            logger.debug("Found siteID: " + siteID);           
    442421            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
    443422                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
     
    458437        // can get an exception when instantiating CCWETProcessor instance
    459438        // or with CSV file
    460         error(e.getMessage(), e);
     439        logger.error(e.getMessage(), e);
    461440    }
    462441    }
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33582 r33615  
    3838        tuples.put("key", key.trim());
    3939        //} else {
    40         //debug("@@@@ no key for pageURL: " + pageURL);
     40        //logger.debug("@@@@ no key for pageURL: " + pageURL);
    4141        //}
    4242        /*
    4343        if(pageURL.contains(TEXT_END_MARKER)) {
    44         debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
    45         debug("+++++++++");
    46         debug(unparsedPageDump);
    47         debug("+++++++++");
     44        logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");
     45        logger.debug("+++++++++");
     46        logger.debug(unparsedPageDump);
     47        logger.debug("+++++++++");
    4848        }
    4949        */
     
    8787            } else {
    8888                if(NutchTextDumpProcessor.DEBUG_MODE) {
    89                 error("No meta key for meta: " + line);
    90                 error(unparsedPageDump);
     89                logger.error("No meta key for meta: " + line);
     90                logger.error(unparsedPageDump);
    9191                }
    9292            }
     
    103103       
    104104    } catch (IOException ioe) {
    105         error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
     105        logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);
    106106    }
    107107
     
    115115    public void debugTuples() {
    116116    if(NutchTextDumpProcessor.DEBUG_MODE) {
    117         debug("__________________________________________");
     117        logger.debug("__________________________________________");
    118118        for(Map.Entry<String, String> entry : tuples.entrySet()) {
    119119        String key = entry.getKey();
    120120        String value = entry.getValue();       
    121         debug(key + " - " + value);
     121        logger.debug(key + " - " + value);
    122122        }
    123         debug("__________________________________________");
     123        logger.debug("__________________________________________");
    124124    }
    125125    }
     
    164164    }
    165165   
    166     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
    167     public static void info(String msg) {
    168     System.err.println(msg);
    169     logger.info(msg);
    170     }
    171     public static void debug(String msg) {
    172     System.err.println(msg);
    173     logger.debug(msg);
    174     }
    175     public static void warn(String msg) {
    176     System.err.println(msg);
    177     logger.warn(msg);
    178     }
    179     public static void error(String msg) {
    180     System.err.println(msg);
    181     logger.error(msg);
    182     }
    183     public static void error(String msg, Exception e) {
    184     logger.error(msg, e);
    185     System.err.println("\n"+msg);
    186     e.printStackTrace();
    187     }
    188    
    189166}
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33573 r33615  
    7777
    7878    String fileID = inFile.getName();
    79     //debug("*** Processing wetfile: " + fileID);           
     79    //logger.debug("*** Processing wetfile: " + fileID);           
    8080    fileID = fileID.substring(fileID.lastIndexOf("0")+1);
    8181    if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
     
    195195                  String recordURI, String record)
    196196    {
    197     info("CrawlID: CC-MAIN-" + this.crawlID
     197    logger.info("CrawlID: CC-MAIN-" + this.crawlID
    198198               + " WET #" + this.WETFileID
    199199               + " record #" + recordID
    200200               + " - contentLength: " + contentLength
    201201               + " - lineCount: " + lineCount);
    202     info("URI: " + recordURI);
    203     //debug(record);
    204     //info("--------------------------");
     202    logger.info("URI: " + recordURI);
     203    //logger.debug(record);
     204    //logger.info("--------------------------");
    205205
    206206    File parentFolder = null;
     
    215215        else if(batchProcessor.isGreylisted(recordURI)) {
    216216        parentFolder = batchProcessor.greyListedFolder;
    217         debug("@@@GREYLISTED");
     217        logger.debug("@@@GREYLISTED");
    218218        }
    219219        else { // url was only blacklisted
    220220        parentFolder = batchProcessor.discardFolder;
    221         debug("@@@DISCARDING - blacklisted");
     221        logger.debug("@@@DISCARDING - blacklisted");
    222222        }
    223223    }
     
    229229        else {
    230230        parentFolder = batchProcessor.greyListedFolder;
    231         debug("@@@GREYLISTED");
     231        logger.debug("@@@GREYLISTED");
    232232        }
    233233    }
     
    274274        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
    275275        parentFolder = batchProcessor.discardFolder;
    276         debug("@@@DISCARDING - CAMELCASED CONTENTS");
     276        logger.debug("@@@DISCARDING - CAMELCASED CONTENTS");
    277277        }
    278278        else*/
     
    282282        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
    283283        parentFolder = batchProcessor.keepFolder;
    284         debug("@@@KEEPING");
     284        logger.debug("@@@KEEPING");
    285285        }
    286286    }
     
    289289    if(parentFolder == null) {
    290290        parentFolder = batchProcessor.discardFolder;
    291         debug("@@@DISCARDING");
     291        logger.debug("@@@DISCARDING");
    292292    }
    293293
     
    301301        }
    302302    } catch(Exception e) {
    303         debug("Unable to write URL");
     303        logger.debug("Unable to write URL");
    304304        e.printStackTrace();
    305305    }
    306306   
    307     debug("--------------------------");
     307    logger.debug("--------------------------");
    308308
    309309    // outFilename will look something like YYYY-##-####
     
    319319    } catch(IOException ioe) {
    320320        ioe.printStackTrace();
    321         error("@@@@@@@@@ Error writing to file " + outFile, ioe);
     321        logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe);
    322322    }
    323323    }
    324324
    325325
    326     public void info(String msg) {
    327     System.err.println(msg);
    328     logger.info(msg);
    329     }
    330     public void debug(String msg) {
    331     System.err.println(msg);
    332     logger.debug(msg);
    333     }
    334     public void warn(String msg) {
    335     System.err.println(msg);
    336     logger.warn(msg);
    337     }
    338     public void error(String msg) {
    339     System.err.println(msg);
    340     logger.error(msg);
    341     }
    342     public void error(String msg, Exception e) {
    343     logger.error(msg, e);
    344     System.err.println("\n"+msg);
    345     e.printStackTrace();
    346     }
    347326}
Note: See TracChangeset for help on using the changeset viewer.