Changeset 33623 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
- Timestamp:
- 2019-11-05T21:04:09+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33615 r33623 182 182 TextDumpPage firstPage = pages.get(0); 183 183 String url = firstPage.getPageURL(); 184 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 184 this.domainOfSite = Utility.getDomainForURL(url, true); 185 185 } 186 186 else { … … 248 248 249 249 page.addMRILanguageStatus(isMRI); 250 250 251 251 252 // Even if the entire page is not found to be overall in Māori, 252 253 // let's still inspect the sentences of the page and count how many (if any) … … 281 282 webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++, 282 283 SITE_COUNTER, /* alternative: this.siteID */ 283 url, isMRI, totalSentences, numSentencesInMRI); 284 url, 285 //"origCharEncoding", "modifiedTime", "fetchTime", 286 page.getOriginalCharEncoding(), 287 page.getModifiedTime(), 288 page.getFetchTime(), 289 isMRI, totalSentences, numSentencesInMRI); 284 290 285 291 // Write the sentences that are in te reo into the mri-sentences CSV file … … 393 399 "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 394 400 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 395 webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI", 396 "numSentences", "numSentencesInMRI"); 401 webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", 402 "origCharEncoding", "modifiedTime", "fetchTime", 403 "isMRI", "numSentences", "numSentencesInMRI"); 397 404 mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence"); 398 405 … … 435 442 436 443 } catch(Exception e) { 437 // can get an exception when instantiating CCWETProcessor instance 444 // can get an exception when instantiating NutchTextDumpProcessor instance 438 445 // or with CSV file 439 446 logger.error(e.getMessage(), e);
Note:
See TracChangeset
for help on using the changeset viewer.