Changeset 33600
- Timestamp:
- 2019-10-23T23:05:38+13:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java
r33587 r33600 40 40 return str.toString(); 41 41 } 42 43 /** for converting to csv */ 44 public String[] toCSV() { 45 String[] csvRecord = { Integer.toString(pageID), 46 siteID, // foreign key 47 URL, 48 Boolean.toString(isMRI), 49 Integer.toString(numSentences), 50 Integer.toString(numSentencesInMRI) 51 }; 52 53 return csvRecord; 54 } 42 55 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33587 r33600 3 3 import java.io.*; 4 4 import java.lang.ArrayIndexOutOfBoundsException; 5 import java.time.LocalDateTime; 5 6 import java.util.ArrayList; 6 7 import java.util.Arrays; 7 8 9 import org.apache.commons.csv.*; 8 10 import org.apache.log4j.Logger; 11 9 12 10 13 /** … … 39 42 * TO RUN: 40 43 * maori-lang-detection/src$ 41 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled 44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small 42 45 * 43 46 * or: 44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1 47 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled-small > ../crawled-small/bla.txt 2>&1 45 48 * 46 49 */ … … 49 52 50 53 static boolean DEBUG_MODE = true; 54 55 /** Counter for number of sites. 56 * Should be equal to number of times NutchTextDumpProcessor constructor 57 * is called: once per site. 58 */ 59 static private int SITE_COUNTER = 0; 51 60 52 61 private final MaoriTextDetector maoriTxtDetector; 53 62 54 public final String siteID; // is this necessary? 
63 public final String siteID; 64 public final boolean siteCrawlUnfinished; 65 public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 66 55 67 private String domainOfSite; 56 68 … … 60 72 /** list of pages in this site which were detected as being in MRI */ 61 73 private ArrayList<MRIWebPageStats> pagesInMRI; 62 /** list of pages in this site which were NOT detected as being in MRI but nevertheless 74 /** 75 * list of pages in this site which were NOT detected as being in MRI but nevertheless 63 76 * contain one or more sentences in MRI 64 77 */ … … 84 97 } 85 98 } 86 87 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { 99 100 /** A NutchTextDumpProcessor processes the dump.txt for one site */ 101 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, 102 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 103 { 104 // increment static counter of sites processed by a NutchTextDumpProcessor instance 105 SITE_COUNTER++; 106 88 107 // siteID is of the form %5d (e.g. 
00020) and is just the name of a site folder 89 108 this.siteID = siteID; 109 this.siteCrawlUnfinished = siteCrawlUnfinished; 110 this.siteCrawledTimestamp = lastModified; 111 90 112 this.maoriTxtDetector = maoriTxtDetector; 91 113 92 114 pages = new ArrayList<TextDumpPage>(); 93 115 … … 195 217 public int totalNumPages() { 196 218 return pages.size(); 197 } 198 219 } 220 public int getNumPagesInMRI() { 221 return pagesInMRI.size(); 222 } 223 public int getNumPagesContainingMRI() { 224 return pagesContainingMRI.size(); 225 } 226 199 227 private void prepareSiteStats() { 200 228 pagesInMRI = new ArrayList<MRIWebPageStats>(); … … 217 245 218 246 // Even if the entire page is not found to be overall in Māori, 219 // let's s itll inspect the sentences of the page and count how many (if any) 247 // let's still inspect the sentences of the page and count how many (if any) 220 248 // are in te reo. 221 249 ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); … … 242 270 } 243 271 } 272 273 274 275 244 276 } 245 277 } … … 276 308 277 309 310 public void writeSiteRecordToCSV(CSVPrinter websitesCSVPrinter) throws IOException { 311 312 // https://stackoverflow.com/questions/35183146/how-can-i-create-a-java-8-localdate-from-a-long-epoch-time-in-milliseconds 313 // LocalDateTime date = 314 // LocalDateTime.ofInstant(Instant.ofEpochMilli(this.siteCrawledTimestamp), ZoneId.systemDefault()); 315 // String crawlTimestamp = 316 // date.format(DateTimeFormatter.ofPattern("yyyy-MM-dd")) + " " + date.format(DateTimeFormatter.ofPattern("HH:mm:ss")); 317 318 boolean redoCrawl = false; 319 int numPagesInMRI = pagesInMRI.size(); 320 int numPagesContainingMRI = pagesContainingMRI.size(); 321 322 if(this.siteCrawlUnfinished) { 323 // arbitrary decision, but need some indication that the MRI content was not close to one-off in the website 324 if(numPagesInMRI > 2 || numPagesContainingMRI > 2) { 325 redoCrawl = true; 326 } 327 } 328 329 // site.csv CSV file row: 330 
// ID, siteID, domainURL, totalPagesInSite, numPagesInMRI, numPagesContainingMRI, crawlUnfinished, redoCrawl 331 websitesCSVPrinter.printRecord(SITE_COUNTER, this.siteID, this.domainOfSite, 332 pages.size(), numPagesInMRI, numPagesContainingMRI, 333 this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl); 334 } 335 336 /* 337 public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences) 338 throws IOException 339 { 340 int totalSentences 341 342 for(int i = 0; i < ) 343 printer.printRecord(); 344 345 } catch (IOException ex) { 346 ex.printStackTrace(); 347 } 348 } 349 */ 350 351 /* 352 public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) { 353 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 354 355 //CSVFormat csvFormat = CSVFormat.DEFAULT. 356 // withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 357 358 try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) { 359 // header 360 //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 361 // skip first one 362 363 for() 364 printer.printRecord(); 365 366 } catch (IOException ex) { 367 ex.printStackTrace(); 368 } 369 } 370 */ 371 278 372 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 279 373 public static void info(String msg) { … … 301 395 public static void printUsage() { 302 396 info("Run this program as:"); 303 info("\tNutchTextDumpProcessor <path to ' sites' folder>"); 397 info("\tNutchTextDumpProcessor <path to 'crawled' folder>"); 304 398 } 305 399 … … 318 412 NutchTextDumpProcessor.DEBUG_MODE = false; 319 413 320 try { 414 File websitesCSVFile = new File(sitesDir, "websites.csv"); 415 416 try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) { 417 418 // print out the column headers for the websites csv 
file 419 websitesCSVPrinter.printRecord("ID", "siteID", "domainURL", 420 "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 421 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 422 423 321 424 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 322 425 File[] sites = sitesDir.listFiles(); … … 336 439 337 440 else { 441 File UNFINISHED_FILE = new File(siteDir, "UNFINISHED"); 442 338 443 String siteID = siteDir.getName(); 444 long lastModified = siteDir.lastModified(); 339 445 debug("Found siteID: " + siteID); 340 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 446 NutchTextDumpProcessor nutchTxtDump 447 = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 341 448 // now it's parsed all the web pages in the site's text dump 342 449 … … 345 452 346 453 nutchTxtDump.printSiteStats(); 454 nutchTxtDump.writeSiteRecordToCSV(websitesCSVPrinter); 347 455 } 348 456 } … … 352 460 } catch(Exception e) { 353 461 // can get an exception when instantiating CCWETProcessor instance 462 // or with CSV file 354 463 error(e.getMessage(), e); 355 464 }
Note:
See TracChangeset
for help on using the changeset viewer.