Changeset 34005 for other-projects
- Timestamp: 2020-03-10T17:33:20+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33988 r34005 62 62 63 63 static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class 64 64 65 65 /** Counter for number of sites. 66 66 * Should be equal to number of times NutchTextDumpToMongoDB constructor … … 76 76 public final boolean siteCrawlUnfinished; 77 77 public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 78 79 // private handle to a csv writer 80 private CSVPrinter emptyWebPageInfoCSVPrinter; 78 81 79 82 private int countOfWebPagesWithBodyText = 0; … … 119 122 120 123 /** A NutchTextDumpToMongoDB processes the dump.txt for one site */ 121 public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess, 124 public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess, CSVPrinter emptyWebPageInfoCSVPrinter, 122 125 MaoriTextDetector maoriTxtDetector, String siteID, 123 126 File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) … … 126 129 // increment static counter of sites processed by a NutchTextDumpToMongoDB instance 127 130 SITE_COUNTER++; 131 132 // keep a handle to the csv file writer 133 this.emptyWebPageInfoCSVPrinter = emptyWebPageInfoCSVPrinter; 128 134 129 135 // siteID is of the form %5d (e.g. 
00020) and is just the name of a site folder … … 251 257 252 258 if(text.equals("")) { 253 System.err.println("siteID: " + siteID + "- Empty page " + i + " - URL: " 254 + page.getPageURL()); 259 System.err.println(siteID + ",Empty page " + i + "," + page.getPageURL() 260 + "," + page.get("status") 261 + "," + page.get("protocolStatus") 262 + "," + page.get("parseStatus")); 263 // write information about any empty web page into the emptyPage csv file 264 emptyWebPageInfoCSVPrinter.printRecord(siteID, i, page.getPageURL(), 265 page.get("status"), page.get("protocolStatus"),page.get("parseStatus")); 255 266 256 267 // don't care about empty pages … … 385 396 386 397 NutchTextDumpToMongoDB.DEBUG_MODE = false; 387 388 398 389 399 try ( 390 400 MongoDBAccess mongodb = new MongoDBAccess(); 401 CSVPrinter emptyWebPageInfoCSVPrinter = new CSVPrinter(new FileWriter("InfoOnEmptyPagesNotInMongoDB.csv"), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); 391 402 ) { 392 403 393 404 mongodb.connectToDB(); 394 405 //mongodb.showCollections(); 395 406 407 // write out csv column headings into the csv file on empty web pages 408 emptyWebPageInfoCSVPrinter.printRecord("siteID","pagenum","URL","(fetch)status","protocolStatus","parseStatus"); 409 396 410 // print out the column headers for the websites csv file 397 411 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html … … 426 440 logger.debug("@@@ Processing siteID: " + siteID); 427 441 NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB( 428 mongodb, mriTxtDetector,442 mongodb, emptyWebPageInfoCSVPrinter, mriTxtDetector, 429 443 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 430 444 // now it's parsed all the web pages in the site's text dump
Note: See TracChangeset for help on using the changeset viewer.