Ignore:
Timestamp:
2020-03-10T17:33:20+13:00 (4 years ago)
Author:
ak19
Message:

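The information on empty pages not in MongoDB (formerly InfoOnEmptyPagesNotInMongoDB.txt) is now written out directly to its own file, instead of being produced by redirecting all of System.err into a file. The file is also now a CSV file (InfoOnEmptyPagesNotInMongoDB.csv) containing additional information besides the URL: the (fetch) status, protocolStatus and parseStatus.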

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33988 r34005  
    6262
    6363    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class
    64 
     64       
    6565    /** Counter for number of sites.
    6666     * Should be equal to number of times NutchTextDumpToMongoDB constructor
     
    7676    public final boolean siteCrawlUnfinished;
    7777    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */
     78
     79    // private handle to a csv writer
     80    private CSVPrinter emptyWebPageInfoCSVPrinter;
    7881   
    7982    private int countOfWebPagesWithBodyText = 0;
     
    119122
    120123    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */
    121     public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess,
     124    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess, CSVPrinter emptyWebPageInfoCSVPrinter,
    122125                  MaoriTextDetector maoriTxtDetector, String siteID,
    123126                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
     
    126129    // increment static counter of sites processed by a NutchTextDumpToMongoDB instance
    127130    SITE_COUNTER++;
     131
     132    // keep a handle to the csv file writer
     133    this.emptyWebPageInfoCSVPrinter = emptyWebPageInfoCSVPrinter;
    128134   
    129135    // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
     
    251257       
    252258        if(text.equals("")) {
    253         System.err.println("siteID: " + siteID + "- Empty page " + i + " - URL: "
    254                    + page.getPageURL());
     259        System.err.println(siteID + ",Empty page " + i + "," + page.getPageURL()
     260                   + "," + page.get("status")
     261                   + "," + page.get("protocolStatus")
     262                   + "," + page.get("parseStatus"));
     263        // write information about any empty web page into the emptyPage csv file
     264        emptyWebPageInfoCSVPrinter.printRecord(siteID, i, page.getPageURL(),
     265               page.get("status"), page.get("protocolStatus"),page.get("parseStatus"));
    255266       
    256267        // don't care about empty pages
     
    385396
    386397    NutchTextDumpToMongoDB.DEBUG_MODE = false;
    387 
    388398   
    389399    try (
    390400         MongoDBAccess mongodb = new MongoDBAccess();
     401         CSVPrinter emptyWebPageInfoCSVPrinter = new CSVPrinter(new FileWriter("InfoOnEmptyPagesNotInMongoDB.csv"), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
    391402         ) {
    392403       
    393404        mongodb.connectToDB();
    394405        //mongodb.showCollections();
    395        
     406
     407        // write out csv column headings into the csv file on empty web pages
     408        emptyWebPageInfoCSVPrinter.printRecord("siteID","pagenum","URL","(fetch)status","protocolStatus","parseStatus");
     409       
    396410        // print out the column headers for the websites csv file
    397411        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
     
    426440            logger.debug("@@@ Processing siteID: " + siteID);           
    427441            NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB(
    428                  mongodb, mriTxtDetector,
     442                 mongodb, emptyWebPageInfoCSVPrinter, mriTxtDetector,
    429443                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
    430444            // now it's parsed all the web pages in the site's text dump
Note: See TracChangeset for help on using the changeset viewer.