Changeset 33615
- Timestamp:
- 2019-10-31T20:03:55+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/MoreReading/crawling-Nutch.txt
r33603 r33615 332 332 https://ipinfo.info/html/ip_checker.php 333 333 334 335 336 ---------- 337 MongoDB 338 Installation: 339 https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/ 340 https://docs.mongodb.com/manual/administration/install-on-linux/ 341 https://hevodata.com/blog/install-mongodb-on-ubuntu/ 342 https://www.digitalocean.com/community/tutorials/how-to-install-mongodb-on-ubuntu-16-04 343 CENTOS (Analytics): https://tecadmin.net/install-mongodb-on-centos/ 344 FROM SOURCE: https://github.com/mongodb/mongo/wiki/Build-Mongodb-From-Source 345 GUI: 346 https://robomongo.org/ 347 Robomongo is Robo 3T now 348 349 https://www.tutorialspoint.com/mongodb/mongodb_java.htm 350 JAR FILE: 351 http://central.maven.org/maven2/org/mongodb/mongo-java-driver/ 352 https://mongodb.github.io/mongo-java-driver/ -
gs3-extensions/maori-lang-detection/conf/config.properties
r33480 r33615 21 21 WETprocessor.min.num.words=20 22 22 WETprocessor.max.words.camelcase=10 23 24 25 mongodb.user=admin 26 mongodb.pwd=pinky 27 #CHANGEME 28 -
gs3-extensions/maori-lang-detection/conf/log4j.properties
r33400 r33615 12 12 13 13 # Let's log everything from the most basic DEBUG msgs on (so INFO, WARN, ERROR too) 14 log4j.rootCategory=DEBUG, mldlog 14 # Also want to log to the console, see 15 # https://stackoverflow.com/questions/3382985/how-to-make-log4j-to-write-to-the-console-as-well 15 16 16 # copied from GS3 17 log4j.category.web.servlets.DirectInput=INFO, mldlog 18 # Prevent logging duplicating its messages into rootLogger: 19 # https://stackoverflow.com/questions/9208710/java-log4j-log-to-both-category-and-root-logger 20 log4j.additivity.web.servlets.DirectInput=false 17 log4j.rootCategory=DEBUG, console, mldlog 18 19 # Define Console Appender 20 log4j.appender.console=org.apache.log4j.ConsoleAppender 21 # Define the layout for console appender. If you do not define it, you will get an error 22 log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 log4j.appender.console.Target=System.err 21 24 22 25 # We're saying our log "mldlog" is some file that gets appended to, -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33604 r33615 150 150 151 151 } catch(Exception e) { 152 error("Exception attempting to read properties from config.properties.", e);152 logger.error("Exception attempting to read properties from config.properties.", e); 153 153 } 154 154 155 155 if(configProperties.size() == 0) { 156 warn("*** Warning: no values read into config properties. Using defaults.");156 logger.warn("*** Warning: no values read into config properties. Using defaults."); 157 157 } 158 158 … … 190 190 191 191 // prepare our blacklist, greylist (for inspection) and whitelist 192 info("Loading blacklist.");192 logger.info("Loading blacklist."); 193 193 blackList = new HashMap<String, Integer>(); 194 194 initURLFilterList(blackList, "url-blacklist-filter.txt"); 195 195 196 info("Loading greylist.");196 logger.info("Loading greylist."); 197 197 greyList = new HashMap<String, Integer>(); 198 198 initURLFilterList(greyList, "url-greylist-filter.txt"); 199 199 200 info("Loading whitelist.");200 logger.info("Loading whitelist."); 201 201 whiteList = new HashMap<String, Integer>(); 202 202 initURLFilterList(whiteList, "url-whitelist-filter.txt"); 203 203 204 204 // Create the map of topSites 205 info("Loading map of topsites with regex of allowable url patterns for each topsite.");205 logger.info("Loading map of topsites with regex of allowable url patterns for each topsite."); 206 206 topSitesMap = new HashMap<String, String>(); 207 207 … … 226 226 topSitesMap.put(topsite, allowed_url_pattern); 227 227 228 // debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);228 //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern); 229 229 230 230 } 231 231 } catch(Exception e) { 232 error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);232 logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e); 233 233 } 234 234 235 // debug("Prematurely terminating for testing purposes.");235 
//logger.debug("Prematurely terminating for testing purposes."); 236 236 //System.exit(-1); 237 237 } … … 323 323 // for later manual inspection 324 324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) { 325 /* 325 326 if(!possibleProductDomains.contains(domainWithProtocol)) { 326 327 … … 333 334 countryCode = ""; // forces domain to be included for inspection 334 335 335 error("Could not check if domain " + domainWithProtocol336 logger.error("Could not check if domain " + domainWithProtocol 336 337 + " was in country: " + countryCode, 337 338 exceptObj); … … 348 349 possibleProductSitesWriter.write("\t" + url + "\n"); 349 350 } 350 }/*else {351 }*/ /*else { 351 352 // already wrote out domain to file at some point, write just the URL out to file 352 353 possibleProductSitesWriter.write("\t" + url + "\n"); … … 355 356 } 356 357 } catch (IOException ioe) { 357 error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);358 logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe); 358 359 } 359 360 … … 393 394 String value = topSitesMap.get("wikipedia.org"); 394 395 if(value == null) { 395 debug("### wikipedia.org had null value");396 logger.debug("### wikipedia.org had null value"); 396 397 } else { 397 debug("### wikipedia.org had value: " + value);398 logger.debug("### wikipedia.org had value: " + value); 398 399 } // DEBUG 399 400 */ … … 407 408 408 409 /*if(domain.contains("docs.google.com")) { 409 debug("domain with protocol: " + domainWithProtocol);410 debug("domain: " + domain);410 logger.debug("domain with protocol: " + domainWithProtocol); 411 logger.debug("domain: " + domain); 411 412 }*/ 412 413 … … 531 532 532 533 } catch (IOException ioe) { 533 error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);534 logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe); 534 535 } 535 536 … … 537 538 538 539 
} catch (IOException ioe) { 539 error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile540 logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile 540 541 + "\n\t" + urlFilterFile 541 542 + "\n\t" + domainURLsFile … … 545 546 /* 546 547 // BEGIN DEBUG 547 debug("@@@@ TopSitesMap contains: ");548 logger.debug("@@@@ TopSitesMap contains: "); 548 549 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) { 549 550 String topSite = entry.getKey(); 550 551 String urlPattern = entry.getValue(); 551 debug(topSite + " - " + urlPattern);552 logger.debug(topSite + " - " + urlPattern); 552 553 } // END DEBUG 553 554 */ … … 620 621 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain); 621 622 // there's an entry for the URL in the topSitesMap 622 debug("##### A top site matches URL domain " + domain);623 logger.debug("##### A top site matches URL domain " + domain); 623 624 624 625 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without … … 687 688 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol 688 689 if(!isBlackListed && url.contains("jasmin")) { 689 warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);690 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol); 690 691 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS); 691 692 } … … 722 723 // if filterListFilename does not exist in the conf folder, just return 723 724 if(MY_CLASSLOADER.getResource(filterListFilename) == null) { 724 warn("Filter list filename: " + filterListFilename + " does not exist");725 logger.warn("Filter list filename: " + filterListFilename + " does not exist"); 725 726 return; 726 727 } … … 744 745 filter = filter.substring(1); 745 746 list.put(filter, LIST_ENTRY_STARTSWITH); 746 // debug("Match filter startswith: " + filter);747 //logger.debug("Match filter startswith: " + filter); 747 748 } 748 749 else 
if(filter.endsWith("$")) { 749 750 filter = filter.substring(0, filter.length()-1); 750 751 list.put(filter, LIST_ENTRY_ENDSWITH); 751 // debug("@@@ Match filter endswith: " + filter);752 //logger.debug("@@@ Match filter endswith: " + filter); 752 753 } 753 754 else { 754 755 list.put(filter, LIST_ENTRY_CONTAINS); 755 756 } 756 // debug("Got filter: " + filter);757 //logger.debug("Got filter: " + filter); 757 758 } 758 759 759 760 } catch (IOException ioe) { 760 error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);761 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe); 761 762 } 762 763 … … 781 782 for(int i = 0; i < WETFiles.length; i++) { 782 783 File WETFile = WETFiles[i]; 783 debug("Processing WETfile: " + WETFile);784 logger.debug("Processing WETfile: " + WETFile); 784 785 785 786 // Any .gz files listed means they haven't been unzipped yet. So unzip. … … 797 798 798 799 if(!WETFile.exists() || !WETFile.isFile()) { 799 error("Error: " + WETFile + " does not exist (failure to unzip?)");800 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 800 801 return; 801 802 } … … 803 804 // Finally, we can process this WETFile's records into the keep and discard pile 804 805 wetFileCount++; 805 debug("Off to process " + WETFile);806 logger.debug("Off to process " + WETFile); 806 807 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files 807 808 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-## … … 817 818 818 819 819 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 820 public static void info(String msg) { 821 System.err.println(msg); 822 logger.info(msg); 823 } 824 public static void debug(String msg) { 825 System.err.println(msg); 826 logger.debug(msg); 827 } 828 public static void warn(String msg) { 829 System.err.println(msg); 830 logger.warn(msg); 831 } 832 public static void 
error(String msg) { 833 System.err.println(msg); 834 logger.error(msg); 835 } 836 public static void error(String msg, Exception e) { 837 logger.error(msg, e); 838 System.err.println("\n"+msg); 839 e.printStackTrace(); 840 } 841 820 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 842 821 public static void printUsage() { 843 info("Run this program as:");844 info("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");822 System.err.println("Run this program as:"); 823 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>"); 845 824 } 846 825 … … 852 831 public boolean accept(File dir, String name) { 853 832 if(name.endsWith(".warc.wet")) { 854 debug("Will include " + name + " for processing.");833 logger.debug("Will include " + name + " for processing."); 855 834 return true; 856 835 } … … 860 839 File unzippedVersion = new File(dir, nameWithoutGZext); 861 840 if(unzippedVersion.exists()) { 862 debug("--- Unzipped version " + unzippedVersion + " exists.");863 debug("Skipping " + name);841 logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 842 logger.debug("Skipping " + name); 864 843 return false; // don't count gzipped version if unzipped version exists. 865 844 } 866 845 else { 867 debug("Only zipped version " + name + " exists.");846 logger.debug("Only zipped version " + name + " exists."); 868 847 return true; // No unzipped version, so have to work with gzipped version 869 848 } … … 871 850 872 851 // we're not even interested in any other file extensions 873 debug("Not a WET file. Skipping " + name);852 logger.debug("Not a WET file. 
Skipping " + name); 874 853 return false; 875 854 } … … 887 866 } 888 867 else { 889 info("File " + f + " is not a directory");868 logger.info("File " + f + " is not a directory"); 890 869 } 891 870 return false; … … 901 880 File commoncrawlDir = new File(args[0]); 902 881 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) { 903 error("Error: " + args[0] + " does not exist or is not a directory");882 logger.error("Error: " + args[0] + " does not exist or is not a directory"); 904 883 return; 905 884 } … … 907 886 File outFolder = new File(args[1]); 908 887 if(!outFolder.exists() || !outFolder.isDirectory()) { 909 error("Error: " + args[1] + " does not exist or is not a directory.");888 logger.error("Error: " + args[1] + " does not exist or is not a directory."); 910 889 return; 911 890 } … … 918 897 for(int i = 0; i < ccrawlFolders.length; i++) { 919 898 File ccrawlFolder = ccrawlFolders[i]; 920 info("About to process commoncrawl WET files folder: " + ccrawlFolder);899 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder); 921 900 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder); 922 901 } … … 933 912 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 934 913 935 info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");936 937 info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");914 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 915 916 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 938 917 939 918 940 919 } catch(Exception 
e) { 941 920 // can get an exception when instantiating CCWETProcessor instance 942 error(e.getMessage(), e);921 logger.error(e.getMessage(), e); 943 922 } 944 923 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java
r33608 r33615 12 12 package org.greenstone.atea; 13 13 14 import java.io.*; 14 15 15 import opennlp.tools.langdetect.*; 16 16 import opennlp.tools.sentdetect.*; 17 17 import opennlp.tools.util.*; 18 18 19 import java.io.*; 19 20 import java.util.ArrayList; 21 22 //import org.apache.log4j.Logger; 23 20 24 21 25 /** … … 27 31 * 28 32 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file): 29 * maori-lang-detection/src$ javac -cp ".: $OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java33 * maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java 30 34 * 31 35 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file): 32 36 * 33 * maori-lang-detection/src$ java -cp ".: $OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help34 * 35 * maori-lang-detection/src$ java -cp ".: $OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>36 * 37 * maori-lang-detection/src$ java -cp ".: $OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -37 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help 38 * 39 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile> 40 * 41 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector - 38 42 * Press enter. This variant of the program expects text to stream in from standard input. 39 43 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. 
… … 43 47 */ 44 48 public class MaoriTextDetector extends TextLanguageDetector { 49 //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName()); 50 45 51 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */ 46 52 public static final String MAORI_3LETTER_CODE = "mri"; … … 135 141 } 136 142 public static void doPrintErr(boolean runSilent, String msg) { 137 if(!runSilent) System.err.println(msg); 143 if(!runSilent) System.err.println(msg); 138 144 } 139 145 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33602 r33615 49 49 */ 50 50 public class NutchTextDumpProcessor { 51 privatestatic Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());51 static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 52 52 53 53 static boolean DEBUG_MODE = true; … … 91 91 if(DEBUG_MODE) { 92 92 // START DEBUG 93 debug("__________________________________________");94 debug("@@@ Found page entry: ");95 debug("__________________________________________");96 debug(pageDump.toString());97 debug("------------------------------------------");93 logger.debug("__________________________________________"); 94 logger.debug("@@@ Found page entry: "); 95 logger.debug("__________________________________________"); 96 logger.debug(pageDump.toString()); 97 logger.debug("------------------------------------------"); 98 98 // END DEBUG 99 99 } … … 174 174 175 175 } catch (IOException ioe) { 176 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);176 logger.error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe); 177 177 } 178 178 … … 302 302 303 303 304 info("------------- " + this.siteID + " SITE STATS -----------");305 306 info("SITE DOMAIN: " + this.domainOfSite);307 info("Total number of web pages in site: " + pages.size());308 info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());304 logger.info("------------- " + this.siteID + " SITE STATS -----------"); 305 306 logger.info("SITE DOMAIN: " + this.domainOfSite); 307 logger.info("Total number of web pages in site: " + pages.size()); 308 logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 309 309 310 310 if(pagesInMRI.size() > 0) { 311 info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");311 logger.info("The following were the pages detected by OpenNLP as being in Māori with
" + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 312 312 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 313 info(mriWebPageInfo.toString());314 } 315 } 316 317 info(" ----------- ");313 logger.info(mriWebPageInfo.toString()); 314 } 315 } 316 317 logger.info(" ----------- "); 318 318 if(pagesContainingMRI.size() > 0) { 319 info("The following pages weren't detected as primarily being in Māori");320 info("But still contained sentences detected as Māori");319 logger.info("The following pages weren't detected as primarily being in Māori"); 320 logger.info("But still contained sentences detected as Māori"); 321 321 for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) { 322 info(mriWebPageInfo.toString());322 logger.info(mriWebPageInfo.toString()); 323 323 } 324 324 325 325 } else { 326 info("No further pages detected as containing any sentences in MRI");327 } 328 info(" ----------- ");326 logger.info("No further pages detected as containing any sentences in MRI"); 327 } 328 logger.info(" ----------- "); 329 329 } 330 330 … … 358 358 359 359 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 360 public static void info(String msg) { 361 System.err.println(msg); 362 logger.info(msg); 363 } 364 public static void debug(String msg) { 365 System.err.println(msg); 366 logger.debug(msg); 367 } 368 public static void warn(String msg) { 369 System.err.println(msg); 370 logger.warn(msg); 371 } 372 public static void error(String msg) { 373 System.err.println(msg); 374 logger.error(msg); 375 } 376 public static void error(String msg, Exception e) { 377 logger.error(msg, e); 378 System.err.println("\n"+msg); 379 e.printStackTrace(); 380 } 381 360 382 361 public static void printUsage() { 383 info("Run this program as:");384 info("\tNutchTextDumpProcessor <path to 'crawled' folder>");362 System.err.println("Run this program as:"); 363 System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 385 364 386 365 … … 393 
372 File sitesDir = new File(args[0]); 394 373 if(!sitesDir.exists() || !sitesDir.isDirectory()) { 395 error("Error: " + args[0] + " does not exist or is not a directory");374 logger.error("Error: " + args[0] + " does not exist or is not a directory"); 396 375 return; 397 376 } … … 430 409 File txtDumpFile = new File(siteDir, "dump.txt"); 431 410 if(!txtDumpFile.exists()) { 432 error("Text dump file " + txtDumpFile + " did not exist");411 logger.error("Text dump file " + txtDumpFile + " did not exist"); 433 412 continue; 434 413 } … … 439 418 String siteID = siteDir.getName(); 440 419 long lastModified = siteDir.lastModified(); 441 debug("Found siteID: " + siteID);420 logger.debug("Found siteID: " + siteID); 442 421 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 443 422 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, … … 458 437 // can get an exception when instantiating CCWETProcessor instance 459 438 // or with CSV file 460 error(e.getMessage(), e);439 logger.error(e.getMessage(), e); 461 440 } 462 441 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33582 r33615 38 38 tuples.put("key", key.trim()); 39 39 //} else { 40 // debug("@@@@ no key for pageURL: " + pageURL);40 //logger.debug("@@@@ no key for pageURL: " + pageURL); 41 41 //} 42 42 /* 43 43 if(pageURL.contains(TEXT_END_MARKER)) { 44 debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: ");45 debug("+++++++++");46 debug(unparsedPageDump);47 debug("+++++++++");44 logger.debug("@@@@ TEXT_END_MARKER assigned to pageURL for page: "); 45 logger.debug("+++++++++"); 46 logger.debug(unparsedPageDump); 47 logger.debug("+++++++++"); 48 48 } 49 49 */ … … 87 87 } else { 88 88 if(NutchTextDumpProcessor.DEBUG_MODE) { 89 error("No meta key for meta: " + line);90 error(unparsedPageDump);89 logger.error("No meta key for meta: " + line); 90 logger.error(unparsedPageDump); 91 91 } 92 92 } … … 103 103 104 104 } catch (IOException ioe) { 105 error("@@@@@@@@@ Error reading in txtdump of a page.", ioe);105 logger.error("@@@@@@@@@ Error reading in txtdump of a page.", ioe); 106 106 } 107 107 … … 115 115 public void debugTuples() { 116 116 if(NutchTextDumpProcessor.DEBUG_MODE) { 117 debug("__________________________________________");117 logger.debug("__________________________________________"); 118 118 for(Map.Entry<String, String> entry : tuples.entrySet()) { 119 119 String key = entry.getKey(); 120 120 String value = entry.getValue(); 121 debug(key + " - " + value);121 logger.debug(key + " - " + value); 122 122 } 123 debug("__________________________________________");123 logger.debug("__________________________________________"); 124 124 } 125 125 } … … 164 164 } 165 165 166 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //167 public static void info(String msg) {168 System.err.println(msg);169 logger.info(msg);170 }171 public static void debug(String msg) {172 System.err.println(msg);173 logger.debug(msg);174 }175 public static void warn(String msg) {176 System.err.println(msg);177 logger.warn(msg);178 }179 public static void 
error(String msg) {180 System.err.println(msg);181 logger.error(msg);182 }183 public static void error(String msg, Exception e) {184 logger.error(msg, e);185 System.err.println("\n"+msg);186 e.printStackTrace();187 }188 189 166 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33573 r33615 77 77 78 78 String fileID = inFile.getName(); 79 // debug("*** Processing wetfile: " + fileID);79 //logger.debug("*** Processing wetfile: " + fileID); 80 80 fileID = fileID.substring(fileID.lastIndexOf("0")+1); 81 81 if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet … … 195 195 String recordURI, String record) 196 196 { 197 info("CrawlID: CC-MAIN-" + this.crawlID197 logger.info("CrawlID: CC-MAIN-" + this.crawlID 198 198 + " WET #" + this.WETFileID 199 199 + " record #" + recordID 200 200 + " - contentLength: " + contentLength 201 201 + " - lineCount: " + lineCount); 202 info("URI: " + recordURI);203 // debug(record);204 // info("--------------------------");202 logger.info("URI: " + recordURI); 203 //logger.debug(record); 204 //logger.info("--------------------------"); 205 205 206 206 File parentFolder = null; … … 215 215 else if(batchProcessor.isGreylisted(recordURI)) { 216 216 parentFolder = batchProcessor.greyListedFolder; 217 debug("@@@GREYLISTED");217 logger.debug("@@@GREYLISTED"); 218 218 } 219 219 else { // url was only blacklisted 220 220 parentFolder = batchProcessor.discardFolder; 221 debug("@@@DISCARDING - blacklisted");221 logger.debug("@@@DISCARDING - blacklisted"); 222 222 } 223 223 } … … 229 229 else { 230 230 parentFolder = batchProcessor.greyListedFolder; 231 debug("@@@GREYLISTED");231 logger.debug("@@@GREYLISTED"); 232 232 } 233 233 } … … 274 274 if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 275 275 parentFolder = batchProcessor.discardFolder; 276 debug("@@@DISCARDING - CAMELCASED CONTENTS");276 logger.debug("@@@DISCARDING - CAMELCASED CONTENTS"); 277 277 } 278 278 else*/ … … 282 282 if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 283 283 parentFolder = batchProcessor.keepFolder; 284 debug("@@@KEEPING");284 logger.debug("@@@KEEPING"); 285 285 } 286 286 } … … 289 289 if(parentFolder == null) { 290 290 
parentFolder = batchProcessor.discardFolder; 291 debug("@@@DISCARDING");291 logger.debug("@@@DISCARDING"); 292 292 } 293 293 … … 301 301 } 302 302 } catch(Exception e) { 303 debug("Unable to write URL");303 logger.debug("Unable to write URL"); 304 304 e.printStackTrace(); 305 305 } 306 306 307 debug("--------------------------");307 logger.debug("--------------------------"); 308 308 309 309 // outFilename will look something like YYYY-##-#### … … 319 319 } catch(IOException ioe) { 320 320 ioe.printStackTrace(); 321 error("@@@@@@@@@ Error writing to file " + outFile, ioe);321 logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe); 322 322 } 323 323 } 324 324 325 325 326 public void info(String msg) {327 System.err.println(msg);328 logger.info(msg);329 }330 public void debug(String msg) {331 System.err.println(msg);332 logger.debug(msg);333 }334 public void warn(String msg) {335 System.err.println(msg);336 logger.warn(msg);337 }338 public void error(String msg) {339 System.err.println(msg);340 logger.error(msg);341 }342 public void error(String msg, Exception e) {343 logger.error(msg, e);344 System.err.println("\n"+msg);345 e.printStackTrace();346 }347 326 }
Note:
See TracChangeset
for help on using the changeset viewer.