Changeset 33615

Show
Ignore:
Timestamp:
31.10.2019 20:03:55 (2 weeks ago)
Author:
ak19
Message:

1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some MongoDB links under MoreReading.

Location:
gs3-extensions/maori-lang-detection
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt

    r33603 r33615  
    332332https://ipinfo.info/html/ip_checker.php 
    333333 
     334 
     335 
     336---------- 
     337MongoDB 
     338Installation: 
     339    https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ 
     340        https://docs.mongodb.com/manual/administration/install-on-linux/ 
     341    https://hevodata.com/blog/install-mongodb-on-ubuntu/ 
     342    https://www.digitalocean.com/community/tutorials/how-to-install-mongodb-on-ubuntu-16-04 
     343    CENTOS (Analytics): https://tecadmin.net/install-mongodb-on-centos/ 
     344    FROM SOURCE: https://github.com/mongodb/mongo/wiki/Build-Mongodb-From-Source 
     345GUI: 
     346    https://robomongo.org/ 
     347    Robomongo is Robo 3T now 
     348 
     349https://www.tutorialspoint.com/mongodb/mongodb_java.htm 
     350JAR FILE: 
     351    http://central.maven.org/maven2/org/mongodb/mongo-java-driver/ 
     352    https://mongodb.github.io/mongo-java-driver/ 
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33480 r33615  
    2121WETprocessor.min.num.words=20 
    2222WETprocessor.max.words.camelcase=10 
     23 
     24 
     25mongodb.user=admin 
     26mongodb.pwd=pinky 
     27#CHANGEME 
     28 
  • gs3-extensions/maori-lang-detection/conf/log4j.properties

    r33400 r33615  
    1212 
    1313# Let's log everything from the most basic DEBUG msgs on (so INFO, WARN, ERROR too) 
    14 log4j.rootCategory=DEBUG, mldlog 
     14# Also want to log to the console, see 
     15# https://stackoverflow.com/questions/3382985/how-to-make-log4j-to-write-to-the-console-as-well 
    1516 
    16 # copied from GS3 
    17 log4j.category.web.servlets.DirectInput=INFO, mldlog 
    18 # Prevent logging duplicating its messages into rootLogger: 
    19 # https://stackoverflow.com/questions/9208710/java-log4j-log-to-both-category-and-root-logger 
    20 log4j.additivity.web.servlets.DirectInput=false 
     17log4j.rootCategory=DEBUG, console, mldlog 
     18 
     19# Define Console Appender 
     20log4j.appender.console=org.apache.log4j.ConsoleAppender 
     21# Define the layout for console appender. If you do not define it, you will get an error 
     22log4j.appender.console.layout=org.apache.log4j.PatternLayout 
     23log4j.appender.console.Target=System.err 
    2124 
    2225# We're saying our log "mldlog" is some file that gets appended to, 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33604 r33615  
    150150         
    151151    } catch(Exception e) { 
    152         error("Exception attempting to read properties from config.properties.", e); 
     152        logger.error("Exception attempting to read properties from config.properties.", e); 
    153153    } 
    154154 
    155155    if(configProperties.size() == 0) { 
    156         warn("*** Warning: no values read into config properties. Using defaults."); 
     156        logger.warn("*** Warning: no values read into config properties. Using defaults."); 
    157157    } 
    158158     
     
    190190 
    191191    // prepare our blacklist, greylist (for inspection) and whitelist 
    192     info("Loading blacklist."); 
     192    logger.info("Loading blacklist."); 
    193193    blackList = new HashMap<String, Integer>(); 
    194194    initURLFilterList(blackList, "url-blacklist-filter.txt"); 
    195195     
    196     info("Loading greylist."); 
     196    logger.info("Loading greylist."); 
    197197    greyList = new HashMap<String, Integer>(); 
    198198    initURLFilterList(greyList, "url-greylist-filter.txt"); 
    199199     
    200     info("Loading whitelist."); 
     200    logger.info("Loading whitelist."); 
    201201    whiteList = new HashMap<String, Integer>(); 
    202202    initURLFilterList(whiteList, "url-whitelist-filter.txt"); 
    203203 
    204204    // Create the map of topSites 
    205     info("Loading map of topsites with regex of allowable url patterns for each topsite."); 
     205    logger.info("Loading map of topsites with regex of allowable url patterns for each topsite."); 
    206206    topSitesMap = new HashMap<String, String>(); 
    207207     
     
    226226        topSitesMap.put(topsite, allowed_url_pattern); 
    227227 
    228         //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern); 
     228        //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern); 
    229229         
    230230        } 
    231231    } catch(Exception e) { 
    232         error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e); 
     232        logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e); 
    233233    }    
    234234  
    235     //debug("Prematurely terminating for testing purposes."); 
     235    //logger.debug("Prematurely terminating for testing purposes."); 
    236236    //System.exit(-1); 
    237237    } 
     
    323323        // for later manual inspection 
    324324        if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     325            /* 
    325326            if(!possibleProductDomains.contains(domainWithProtocol)) { 
    326327 
     
    333334                countryCode = ""; // forces domain to be included for inspection 
    334335                 
    335                 error("Could not check if domain " + domainWithProtocol 
     336                logger.error("Could not check if domain " + domainWithProtocol 
    336337                  + " was in country: " + countryCode, 
    337338                  exceptObj); 
     
    348349                possibleProductSitesWriter.write("\t" + url + "\n"); 
    349350            } 
    350             } /*else { 
     351            }*/ /*else { 
    351352            // already wrote out domain to file at some point, write just the URL out to file 
    352353            possibleProductSitesWriter.write("\t" + url + "\n"); 
     
    355356        } 
    356357    } catch (IOException ioe) { 
    357         error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe); 
     358        logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe); 
    358359    } 
    359360 
     
    393394    String value = topSitesMap.get("wikipedia.org"); 
    394395    if(value == null) { 
    395         debug("### wikipedia.org had null value"); 
     396        logger.debug("### wikipedia.org had null value"); 
    396397    } else { 
    397         debug("### wikipedia.org had value: " + value); 
     398        logger.debug("### wikipedia.org had value: " + value); 
    398399    } // DEBUG 
    399400        */ 
     
    407408 
    408409        /*if(domain.contains("docs.google.com")) { 
    409             debug("domain with protocol: " + domainWithProtocol); 
    410             debug("domain: " + domain); 
     410            logger.debug("domain with protocol: " + domainWithProtocol); 
     411            logger.debug("domain: " + domain); 
    411412            }*/ 
    412413         
     
    531532             
    532533        } catch (IOException ioe) { 
    533             error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe); 
     534            logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe); 
    534535        } 
    535536         
     
    537538 
    538539    } catch (IOException ioe) { 
    539         error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile 
     540        logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile 
    540541                   + "\n\t" + urlFilterFile 
    541542                   + "\n\t" + domainURLsFile 
     
    545546    /* 
    546547    // BEGIN DEBUG 
    547     debug("@@@@ TopSitesMap contains: "); 
     548    logger.debug("@@@@ TopSitesMap contains: "); 
    548549    for(Map.Entry<String, String> entry : topSitesMap.entrySet()) { 
    549550        String topSite = entry.getKey(); 
    550551        String urlPattern = entry.getValue();        
    551         debug(topSite + " - " + urlPattern); 
     552        logger.debug(topSite + " - " + urlPattern); 
    552553    } // END DEBUG 
    553554    */ 
     
    620621        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain); 
    621622        // there's an entry for the URL in the topSitesMap 
    622         debug("##### A top site matches URL domain " + domain); 
     623        logger.debug("##### A top site matches URL domain " + domain); 
    623624 
    624625        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without 
     
    687688    String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol 
    688689    if(!isBlackListed && url.contains("jasmin")) { 
    689         warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); 
     690        logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); 
    690691        blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS); 
    691692    } 
     
    722723    // if filterListFilename does not exist in the conf folder, just return 
    723724    if(MY_CLASSLOADER.getResource(filterListFilename) == null) { 
    724         warn("Filter list filename: " + filterListFilename + " does not exist"); 
     725        logger.warn("Filter list filename: " + filterListFilename + " does not exist"); 
    725726        return;      
    726727    } 
     
    744745            filter = filter.substring(1); 
    745746            list.put(filter, LIST_ENTRY_STARTSWITH); 
    746             //debug("Match filter startswith: " + filter); 
     747            //logger.debug("Match filter startswith: " + filter); 
    747748        } 
    748749        else if(filter.endsWith("$")) { 
    749750            filter = filter.substring(0, filter.length()-1); 
    750751            list.put(filter, LIST_ENTRY_ENDSWITH); 
    751             //debug("@@@ Match filter endswith: " + filter); 
     752            //logger.debug("@@@ Match filter endswith: " + filter); 
    752753        } 
    753754        else { 
    754755            list.put(filter, LIST_ENTRY_CONTAINS); 
    755756        } 
    756         //debug("Got filter: " + filter); 
     757        //logger.debug("Got filter: " + filter); 
    757758        } 
    758759         
    759760    } catch (IOException ioe) { 
    760         error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe); 
     761        logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe); 
    761762    } 
    762763     
     
    781782    for(int i = 0; i < WETFiles.length; i++) { 
    782783        File WETFile = WETFiles[i];      
    783         debug("Processing WETfile: " + WETFile); 
     784        logger.debug("Processing WETfile: " + WETFile); 
    784785 
    785786        // Any .gz files listed means they haven't been unzipped yet. So unzip. 
     
    797798 
    798799        if(!WETFile.exists() || !WETFile.isFile()) {         
    799         error("Error: " + WETFile + " does not exist (failure to unzip?)"); 
     800        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 
    800801        return; 
    801802        } 
     
    803804        // Finally, we can process this WETFile's records into the keep and discard pile 
    804805        wetFileCount++; 
    805         debug("Off to process " + WETFile); 
     806        logger.debug("Off to process " + WETFile); 
    806807        String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files 
    807808        crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##      
     
    817818 
    818819 
    819     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 
    820     public static void info(String msg) { 
    821     System.err.println(msg); 
    822     logger.info(msg); 
    823     } 
    824     public static void debug(String msg) { 
    825     System.err.println(msg); 
    826     logger.debug(msg); 
    827     } 
    828     public static void warn(String msg) { 
    829     System.err.println(msg); 
    830     logger.warn(msg); 
    831     } 
    832     public static void error(String msg) { 
    833     System.err.println(msg); 
    834     logger.error(msg); 
    835     } 
    836     public static void error(String msg, Exception e) { 
    837     logger.error(msg, e); 
    838     System.err.println("\n"+msg); 
    839     e.printStackTrace(); 
    840     } 
    841      
     820    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //     
    842821    public static void printUsage() { 
    843     info("Run this program as:"); 
    844     info("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");    
     822    System.err.println("Run this program as:"); 
     823    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");  
    845824    } 
    846825 
     
    852831    public boolean accept(File dir, String name) { 
    853832        if(name.endsWith(".warc.wet")) { 
    854         debug("Will include " + name + " for processing."); 
     833        logger.debug("Will include " + name + " for processing."); 
    855834        return true; 
    856835        } 
     
    860839        File unzippedVersion = new File(dir, nameWithoutGZext); 
    861840        if(unzippedVersion.exists()) { 
    862             debug("--- Unzipped version " + unzippedVersion + " exists."); 
    863             debug("Skipping " + name); 
     841            logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 
     842            logger.debug("Skipping " + name); 
    864843            return false; // don't count gzipped version if unzipped version exists. 
    865844        } 
    866845        else { 
    867             debug("Only zipped version " + name + " exists."); 
     846            logger.debug("Only zipped version " + name + " exists."); 
    868847            return true; // No unzipped version, so have to work with gzipped version 
    869848        } 
     
    871850 
    872851        // we're not even interested in any other file extensions 
    873         debug("Not a WET file. Skipping " + name); 
     852        logger.debug("Not a WET file. Skipping " + name); 
    874853        return false; 
    875854    } 
     
    887866        } 
    888867        else { 
    889         info("File " + f + " is not a directory"); 
     868        logger.info("File " + f + " is not a directory"); 
    890869        } 
    891870        return false;                   
     
    901880    File commoncrawlDir = new File(args[0]); 
    902881    if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) { 
    903         error("Error: " + args[0] + " does not exist or is not a directory"); 
     882        logger.error("Error: " + args[0] + " does not exist or is not a directory"); 
    904883        return; 
    905884    } 
     
    907886    File outFolder = new File(args[1]); 
    908887    if(!outFolder.exists() || !outFolder.isDirectory()) { 
    909         error("Error: " + args[1] + " does not exist or is not a directory."); 
     888        logger.error("Error: " + args[1] + " does not exist or is not a directory."); 
    910889        return; 
    911890    }    
     
    918897    for(int i = 0; i < ccrawlFolders.length; i++) { 
    919898        File ccrawlFolder = ccrawlFolders[i]; 
    920         info("About to process commoncrawl WET files folder: " + ccrawlFolder); 
     899        logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder); 
    921900        ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);         
    922901    } 
     
    933912    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 
    934913 
    935     info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
    936  
    937     info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 
     914    logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
     915 
     916    logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 
    938917     
    939918     
    940919    } catch(Exception e) { 
    941920    // can get an exception when instantiating CCWETProcessor instance 
    942     error(e.getMessage(), e); 
     921    logger.error(e.getMessage(), e); 
    943922    } 
    944923     
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

    r33608 r33615  
    1212package org.greenstone.atea; 
    1313 
    14 import java.io.*; 
     14 
    1515import opennlp.tools.langdetect.*; 
    1616import opennlp.tools.sentdetect.*; 
    1717import opennlp.tools.util.*; 
    1818 
     19import java.io.*; 
    1920import java.util.ArrayList; 
     21 
     22//import org.apache.log4j.Logger; 
     23 
    2024 
    2125/** 
     
    2731 * 
    2832 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file): 
    29  *    maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java 
     33 *    maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java 
    3034 * 
    3135 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file): 
    3236 * 
    33  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help 
    34  * 
    35  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file &lt;full/path/to/textfile&gt; 
    36  * 
    37  *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector - 
     37 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help 
     38 * 
     39 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file &lt;full/path/to/textfile&gt; 
     40 * 
     41 *    maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector - 
    3842 *       Press enter. This variant of the program expects text to stream in from standard input. 
    3943 *       If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. 
     
    4347 */ 
    4448public class MaoriTextDetector extends TextLanguageDetector { 
     49    //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName()); 
     50     
    4551    /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */  
    4652    public static final String MAORI_3LETTER_CODE = "mri"; 
     
    135141    } 
    136142    public static void doPrintErr(boolean runSilent, String msg) { 
    137     if(!runSilent) System.err.println(msg);  
     143    if(!runSilent) System.err.println(msg); 
    138144    } 
    139145     
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33602 r33615  
    4949*/ 
    5050public class NutchTextDumpProcessor { 
    51     private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 
     51    static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 
    5252 
    5353    static boolean DEBUG_MODE = true; 
     
    9191    if(DEBUG_MODE) { 
    9292        // START DEBUG 
    93         debug("__________________________________________"); 
    94         debug("@@@ Found page entry: "); 
    95         debug("__________________________________________"); 
    96         debug(pageDump.toString()); 
    97         debug("------------------------------------------"); 
     93        logger.debug("__________________________________________"); 
     94        logger.debug("@@@ Found page entry: "); 
     95        logger.debug("__________________________________________"); 
     96        logger.debug(pageDump.toString()); 
     97        logger.debug("------------------------------------------"); 
    9898        // END DEBUG 
    9999    } 
     
    174174         
    175175    } catch (IOException ioe) { 
    176         error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe); 
     176        logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe); 
    177177    } 
    178178 
     
    302302     
    303303     
    304     info("------------- " + this.siteID + " SITE STATS -----------"); 
    305  
    306     info("SITE DOMAIN: " + this.domainOfSite); 
    307     info("Total number of web pages in site: " + pages.size()); 
    308     info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 
     304    logger.info("------------- " + this.siteID + " SITE STATS -----------"); 
     305 
     306    logger.info("SITE DOMAIN: " + this.domainOfSite); 
     307    logger.info("Total number of web pages in site: " + pages.size()); 
     308    logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 
    309309     
    310310    if(pagesInMRI.size() > 0) { 
    311         info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 
     311        logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 
    312312        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 
    313         info(mriWebPageInfo.toString()); 
    314         } 
    315     } 
    316  
    317     info("                      -----------                   "); 
     313        logger.info(mriWebPageInfo.toString()); 
     314        } 
     315    } 
     316 
     317    logger.info("                      -----------                   "); 
    318318    if(pagesContainingMRI.size() > 0) {      
    319         info("The following pages weren't detected as primarily being in Māori"); 
    320         info("But still contained sentences detected as Māori"); 
     319        logger.info("The following pages weren't detected as primarily being in Māori"); 
     320        logger.info("But still contained sentences detected as Māori"); 
    321321        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) { 
    322         info(mriWebPageInfo.toString()); 
     322        logger.info(mriWebPageInfo.toString()); 
    323323        } 
    324324         
    325325    } else { 
    326         info("No further pages detected as containing any sentences in MRI");       
    327     } 
    328     info("                      -----------                   "); 
     326        logger.info("No further pages detected as containing any sentences in MRI");        
     327    } 
     328    logger.info("                      -----------                   "); 
    329329    } 
    330330 
     
    358358     
    359359    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 
    360     public static void info(String msg) { 
    361     System.err.println(msg); 
    362     logger.info(msg); 
    363     } 
    364     public static void debug(String msg) { 
    365     System.err.println(msg); 
    366     logger.debug(msg); 
    367     } 
    368     public static void warn(String msg) { 
    369     System.err.println(msg); 
    370     logger.warn(msg); 
    371     } 
    372     public static void error(String msg) { 
    373     System.err.println(msg); 
    374     logger.error(msg); 
    375     } 
    376     public static void error(String msg, Exception e) { 
    377     logger.error(msg, e); 
    378     System.err.println("\n"+msg); 
    379     e.printStackTrace(); 
    380     } 
    381      
     360    
    382361    public static void printUsage() { 
    383     info("Run this program as:"); 
    384     info("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 
     362    System.err.println("Run this program as:"); 
     363    System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 
    385364    } 
    386365     
     
    393372    File sitesDir = new File(args[0]); 
    394373    if(!sitesDir.exists() || !sitesDir.isDirectory()) { 
    395         error("Error: " + args[0] + " does not exist or is not a directory"); 
     374        logger.error("Error: " + args[0] + " does not exist or is not a directory"); 
    396375        return; 
    397376    } 
     
    430409            File txtDumpFile = new File(siteDir, "dump.txt"); 
    431410            if(!txtDumpFile.exists()) { 
    432             error("Text dump file " + txtDumpFile + " did not exist"); 
     411            logger.error("Text dump file " + txtDumpFile + " did not exist"); 
    433412            continue; 
    434413            } 
     
    439418            String siteID = siteDir.getName(); 
    440419            long lastModified = siteDir.lastModified(); 
    441             debug("Found siteID: " + siteID);            
     420            logger.debug("Found siteID: " + siteID);             
    442421            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 
    443422                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 
     
    458437        // can get an exception when instantiating CCWETProcessor instance 
    459438        // or with CSV file 
    460         error(e.getMessage(), e); 
     439        logger.error(e.getMessage(), e); 
    461440    } 
    462441    } 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33582 r33615  
    3838        tuples.put("key", key.trim()); 
    3939        //} else { 
    40         //debug("@@@@ no key for pageURL: " + pageURL); 
     40        //logger.debug("@@@@ no key for pageURL: " + pageURL); 
    4141        //} 
    4242        /* 
    4343        if(pageURL.contains(TEXT_END_MARKER)) { 
    44         debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: "); 
    45         debug("+++++++++"); 
    46         debug(unparsedPageDump); 
    47         debug("+++++++++"); 
     44        logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: "); 
     45        logger.debug("+++++++++"); 
     46        logger.debug(unparsedPageDump); 
     47        logger.debug("+++++++++"); 
    4848        } 
    4949        */ 
     
    8787            } else { 
    8888                if(NutchTextDumpProcessor.DEBUG_MODE) { 
    89                 error("No meta key for meta: " + line); 
    90                 error(unparsedPageDump); 
     89                logger.error("No meta key for meta: " + line); 
     90                logger.error(unparsedPageDump); 
    9191                } 
    9292            } 
     
    103103         
    104104    } catch (IOException ioe) { 
    105         error("@@@@@@@@@ Error reading in txtdump of a page.", ioe); 
     105        logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe); 
    106106    } 
    107107 
     
    115115    public void debugTuples() { 
    116116    if(NutchTextDumpProcessor.DEBUG_MODE) { 
    117         debug("__________________________________________"); 
     117        logger.debug("__________________________________________"); 
    118118        for(Map.Entry<String, String> entry : tuples.entrySet()) { 
    119119        String key = entry.getKey(); 
    120120        String value = entry.getValue();         
    121         debug(key + " - " + value); 
     121        logger.debug(key + " - " + value); 
    122122        } 
    123         debug("__________________________________________"); 
     123        logger.debug("__________________________________________"); 
    124124    } 
    125125    } 
     
    164164    } 
    165165     
    166     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 
    167     public static void info(String msg) { 
    168     System.err.println(msg); 
    169     logger.info(msg); 
    170     } 
    171     public static void debug(String msg) { 
    172     System.err.println(msg); 
    173     logger.debug(msg); 
    174     } 
    175     public static void warn(String msg) { 
    176     System.err.println(msg); 
    177     logger.warn(msg); 
    178     } 
    179     public static void error(String msg) { 
    180     System.err.println(msg); 
    181     logger.error(msg); 
    182     } 
    183     public static void error(String msg, Exception e) { 
    184     logger.error(msg, e); 
    185     System.err.println("\n"+msg); 
    186     e.printStackTrace(); 
    187     } 
    188      
    189166} 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33573 r33615  
    7777 
    7878    String fileID = inFile.getName(); 
    79     //debug("*** Processing wetfile: " + fileID);            
     79    //logger.debug("*** Processing wetfile: " + fileID);             
    8080    fileID = fileID.substring(fileID.lastIndexOf("0")+1); 
    8181    if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet 
     
    195195                  String recordURI, String record) 
    196196    { 
    197     info("CrawlID: CC-MAIN-" + this.crawlID 
     197    logger.info("CrawlID: CC-MAIN-" + this.crawlID 
    198198               + " WET #" + this.WETFileID 
    199199               + " record #" + recordID 
    200200               + " - contentLength: " + contentLength 
    201201               + " - lineCount: " + lineCount); 
    202     info("URI: " + recordURI); 
    203     //debug(record); 
    204     //info("--------------------------"); 
     202    logger.info("URI: " + recordURI); 
     203    //logger.debug(record); 
     204    //logger.info("--------------------------"); 
    205205 
    206206    File parentFolder = null; 
     
    215215        else if(batchProcessor.isGreylisted(recordURI)) { 
    216216        parentFolder = batchProcessor.greyListedFolder; 
    217         debug("@@@GREYLISTED"); 
     217        logger.debug("@@@GREYLISTED"); 
    218218        } 
    219219        else { // url was only blacklisted 
    220220        parentFolder = batchProcessor.discardFolder; 
    221         debug("@@@DISCARDING - blacklisted"); 
     221        logger.debug("@@@DISCARDING - blacklisted"); 
    222222        } 
    223223    } 
     
    229229        else { 
    230230        parentFolder = batchProcessor.greyListedFolder; 
    231         debug("@@@GREYLISTED"); 
     231        logger.debug("@@@GREYLISTED"); 
    232232        } 
    233233    } 
     
    274274        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 
    275275        parentFolder = batchProcessor.discardFolder; 
    276         debug("@@@DISCARDING - CAMELCASED CONTENTS"); 
     276        logger.debug("@@@DISCARDING - CAMELCASED CONTENTS"); 
    277277        } 
    278278        else*/ 
     
    282282        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
    283283        parentFolder = batchProcessor.keepFolder; 
    284         debug("@@@KEEPING"); 
     284        logger.debug("@@@KEEPING"); 
    285285        } 
    286286    } 
     
    289289    if(parentFolder == null) { 
    290290        parentFolder = batchProcessor.discardFolder; 
    291         debug("@@@DISCARDING"); 
     291        logger.debug("@@@DISCARDING"); 
    292292    } 
    293293 
     
    301301        } 
    302302    } catch(Exception e) { 
    303         debug("Unable to write URL"); 
     303        logger.debug("Unable to write URL"); 
    304304        e.printStackTrace(); 
    305305    } 
    306306    
    307     debug("--------------------------"); 
     307    logger.debug("--------------------------"); 
    308308 
    309309    // outFilename will look something like YYYY-##-#### 
     
    319319    } catch(IOException ioe) { 
    320320        ioe.printStackTrace(); 
    321         error("@@@@@@@@@ Error writing to file " + outFile, ioe); 
     321        logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe); 
    322322    } 
    323323    } 
    324324 
    325325 
    326     public void info(String msg) { 
    327     System.err.println(msg); 
    328     logger.info(msg); 
    329     } 
    330     public void debug(String msg) { 
    331     System.err.println(msg); 
    332     logger.debug(msg); 
    333     } 
    334     public void warn(String msg) { 
    335     System.err.println(msg); 
    336     logger.warn(msg); 
    337     } 
    338     public void error(String msg) { 
    339     System.err.println(msg); 
    340     logger.error(msg); 
    341     } 
    342     public void error(String msg, Exception e) { 
    343     logger.error(msg, e); 
    344     System.err.println("\n"+msg); 
    345     e.printStackTrace(); 
    346     } 
    347326}