Changeset 33601 for gs3-extensions
- Timestamp:
- 2019-10-23T23:22:14+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33600 r33601 99 99 100 100 /** A NutchTextDumpProcessor processes the dump.txt for one site */ 101 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, 101 public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, 102 MaoriTextDetector maoriTxtDetector, String siteID, 102 103 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 104 throws IOException 103 105 { 104 106 // increment static counter of sites processed by a NutchTextDumpProcessor instance … … 184 186 } 185 187 186 prepareSiteStats( );188 prepareSiteStats(webpagesCSVPrinter); 187 189 } 188 190 … … 225 227 } 226 228 227 private void prepareSiteStats( ){229 private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException { 228 230 pagesInMRI = new ArrayList<MRIWebPageStats>(); 229 231 pagesContainingMRI = new ArrayList<MRIWebPageStats>(); … … 235 237 236 238 String text = page.getPageText(); 239 String url = page.getPageURL(); 240 237 241 if(text.equals("")) { 238 242 page.addMRILanguageStatus(false); 243 244 // write to webpage CSV for all pages 245 // not just those webpages that are overall MRI, or those that contain MRI sentences 246 // CSV column headings: 247 // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 248 webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0); 239 249 continue; 240 250 } … … 248 258 // are in te reo. 
249 259 ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); 250 // first element is always total num sentences260 // first element of ArrayList returned is always total num sentences on page 251 261 // remaining elements are the actual sentences that were detected as being Māori 252 262 int totalSentences = Integer.parseInt(mriSentences.get(0)); 253 int numSentencesInMRI = mriSentences.size() - 1; 254 263 int numSentencesInMRI = mriSentences.size() - 1; 255 264 256 265 // Add page to list of MRI pages if the page's body text overall was detected … … 259 268 // were detected as being in MRI 260 269 if(isMRI || numSentencesInMRI >= 1) { 261 String url = page.getPageURL();270 262 271 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 263 272 totalSentences, numSentencesInMRI); … … 268 277 } 269 278 270 } 271 } 272 273 274 275 279 } 280 281 // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 282 webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI); 283 } 276 284 } 277 285 } … … 413 421 414 422 File websitesCSVFile = new File(sitesDir, "websites.csv"); 415 416 try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) { 423 File webpagesCSVFile = new File(sitesDir, "webpages.csv"); 424 425 try ( 426 CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT); 427 CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT); 428 ) { 417 429 418 430 // print out the column headers for the websites csv file … … 420 432 "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 421 433 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 422 434 webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI", 435 "numSentences", "numSentencesInMRI"); 423 436 424 437 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // 
true: run silent … … 445 458 debug("Found siteID: " + siteID); 446 459 NutchTextDumpProcessor nutchTxtDump 447 = new NutchTextDumpProcessor( mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());460 = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 448 461 // now it's parsed all the web pages in the site's text dump 449 462
Note:
See TracChangeset
for help on using the changeset viewer.