Changeset 33602 for gs3-extensions
- Timestamp: 2019-10-23T23:49:34+13:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java
r33600 r33602 42 42 43 43 /** for converting to csv */ 44 /* 45 Unused. 44 46 public String[] toCSV() { 45 47 String[] csvRecord = { Integer.toString(pageID), … … 53 55 return csvRecord; 54 56 } 57 */ 55 58 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33601 r33602 58 58 */ 59 59 static private int SITE_COUNTER = 0; 60 static private long WEBPAGE_COUNTER = 0; 61 static private long MRI_SENTENCE_COUNTER = 0; 60 62 61 63 private final MaoriTextDetector maoriTxtDetector; … … 99 101 100 102 /** A NutchTextDumpProcessor processes the dump.txt for one site */ 101 public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, 103 public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 102 104 MaoriTextDetector maoriTxtDetector, String siteID, 103 105 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) … … 186 188 } 187 189 188 prepareSiteStats(webpagesCSVPrinter );190 prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter); 189 191 } 190 192 … … 227 229 } 228 230 229 private void prepareSiteStats(CSVPrinter webpageCSVPrinter ) throws IOException {231 private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException { 230 232 pagesInMRI = new ArrayList<MRIWebPageStats>(); 231 233 pagesContainingMRI = new ArrayList<MRIWebPageStats>(); … … 237 239 238 240 String text = page.getPageText(); 239 String url = page.getPageURL();240 241 241 242 if(text.equals("")) { 242 243 page.addMRILanguageStatus(false); 243 244 // write to webpage CSV for all pages245 // not just those webpages that are overall MRI, or those that contain MRI sentences246 // CSV column headings:247 // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI248 webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);249 244 continue; 250 245 } … … 268 263 // were detected as being in MRI 269 264 if(isMRI || numSentencesInMRI >= 1) { 270 265 String url = page.getPageURL(); 271 266 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 272 267 totalSentences, numSentencesInMRI); … … 277 272 } 278 273 279 } 280 281 // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 282 webpageCSVPrinter.printRecord(i, 
this.siteID, url, isMRI, totalSentences, numSentencesInMRI); 274 // Only write to webpages csv file for those pages that had any MRI 275 // language content. 276 // column headers: 277 // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 278 //int pageID = i; // not primary key by itself, 279 // must be combined with siteID to form primary key 280 281 webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++, 282 SITE_COUNTER, /* alternative: this.siteID */ 283 url, isMRI, totalSentences, numSentencesInMRI); 284 285 // Write the sentences that are in te reo into the mri-sentences CSV file 286 // whether from webpages that are MRI overall or only those that containing 287 // any sentences in MRI 288 for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence 289 //int sentenceID = j; // combine with siteID and pageID to form primary key 290 String mriSentence = mriSentences.get(j); 291 // sentenceID, pageID, sentence 292 //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence); 293 mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, WEBPAGE_COUNTER, mriSentence); 294 } 295 } 296 283 297 } 284 298 } … … 342 356 } 343 357 344 /*345 public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences)346 throws IOException347 {348 int totalSentences349 350 for(int i = 0; i < )351 printer.printRecord();352 353 } catch (IOException ex) {354 ex.printStackTrace();355 }356 }357 */358 359 /*360 public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) {361 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html362 363 //CSVFormat csvFormat = CSVFormat.DEFAULT.364 // withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");365 366 try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) {367 // header368 //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", 
"numSentencesInMRI");369 // skip first one370 371 for()372 printer.printRecord();373 374 } catch (IOException ex) {375 ex.printStackTrace();376 }377 }378 */379 358 380 359 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // … … 422 401 File websitesCSVFile = new File(sitesDir, "websites.csv"); 423 402 File webpagesCSVFile = new File(sitesDir, "webpages.csv"); 403 File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv"); 424 404 425 405 try ( 426 406 CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT); 427 407 CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT); 408 CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT); 428 409 ) { 429 410 430 411 // print out the column headers for the websites csv file 431 websitesCSVPrinter.printRecord("ID", "siteID", "domainURL", 432 "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 433 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 434 webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI", 412 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 413 websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/, 414 "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 415 "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 416 webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI", 435 417 "numSentences", "numSentencesInMRI"); 418 mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence"); 436 419 437 420 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent … … 457 440 long lastModified = siteDir.lastModified(); 458 441 debug("Found siteID: " + siteID); 459 NutchTextDumpProcessor nutchTxtDump 460 = new NutchTextDumpProcessor(webpagesCSVPrinter, 
mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 442 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 443 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 444 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 461 445 // now it's parsed all the web pages in the site's text dump 462 446
Note: See TracChangeset for help on using the changeset viewer.