Changeset 33633 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java
- Timestamp: 2019-11-08T19:43:39+13:00 (4 years ago)
- File: 1 moved
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToCSV.java
r33623 r33633 38 38 * TO COMPILE: 39 39 * maori-lang-detection/src$ 40 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java 40 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV.java 41 41 * 42 42 * TO RUN: 43 43 * maori-lang-detection/src$ 44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small 44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small 45 45 * 46 46 * or: 47 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1 47 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpToCSV ../crawled-small > ../crawled-small/bla.txt 2>&1 48 48 * 49 49 */ 50 public class NutchTextDumpProcessor { 51 static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 50 public class NutchTextDumpToCSV { 51 static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpToCSV.class.getName()); 52 52 53 53 static boolean DEBUG_MODE = true; 54 54 55 55 /** Counter for number of sites. 56 * Should be equal to number of times NutchTextDumpProcessor constructor 56 * Should be equal to number of times NutchTextDumpToCSV constructor 57 57 * is called: once per site. 
58 58 */ … … 66 66 public final boolean siteCrawlUnfinished; 67 67 public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 68 68 69 69 private String domainOfSite; 70 70 … … 100 100 } 101 101 102 /** A NutchTextDumpProcessor processes the dump.txt for one site */ 103 public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 102 /** A NutchTextDumpToCSV processes the dump.txt for one site */ 103 public NutchTextDumpToCSV(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 104 104 MaoriTextDetector maoriTxtDetector, String siteID, 105 105 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 106 106 throws IOException 107 107 { 108 // increment static counter of sites processed by a NutchTextDumpProcessor instance 108 // increment static counter of sites processed by a NutchTextDumpToCSV instance 109 109 SITE_COUNTER++; 110 110 … … 190 190 prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter); 191 191 } 192 192 193 /* UNUSED */ 193 194 /** pageID: id into pages array */ 195 /* 194 196 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException { 195 197 … … 228 230 return pagesContainingMRI.size(); 229 231 } 232 */ 230 233 231 234 private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException { … … 257 260 // remaining elements are the actual sentences that were detected as being Māori 258 261 int totalSentences = Integer.parseInt(mriSentences.get(0)); 259 int numSentencesInMRI = mriSentences.size() - 1; 262 int numSentencesInMRI = mriSentences.size() - 1; 260 263 261 264 // Add page to list of MRI pages if the page's body text overall was detected … … 367 370 public static void printUsage() { 368 371 System.err.println("Run this program as:"); 369 System.err.println("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 372 System.err.println("\tNutchTextDumpToCSV <path to 'crawled' folder>"); 370 373 
} 371 374 … … 382 385 } 383 386 384 NutchTextDumpProcessor.DEBUG_MODE = false; 387 NutchTextDumpToCSV.DEBUG_MODE = false; 385 388 386 389 File websitesCSVFile = new File(sitesDir, "websites.csv"); … … 426 429 long lastModified = siteDir.lastModified(); 427 430 logger.debug("Found siteID: " + siteID); 428 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 431 NutchTextDumpToCSV nutchTxtDump = new NutchTextDumpToCSV( 429 432 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 430 433 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); … … 442 445 443 446 } catch(Exception e) { 444 // can get an exception when instantiating NutchTextDumpProcessor instance 447 // can get an exception when instantiating NutchTextDumpToCSV instance 445 448 // or with CSV file 446 449 logger.error(e.getMessage(), e);
Note: See TracChangeset for help on using the changeset viewer.