Changeset 34005

Show
Ignore:
Timestamp:
10.03.2020 17:33:20 (3 weeks ago)
Author:
ak19
Message:

The information on empty pages (previously InfoOnEmptyPagesNotInMongoDB.txt) is now written out to a file directly, instead of being produced by redirecting all of System.err into a file. It is also now a CSV file (InfoOnEmptyPagesNotInMongoDB.csv) with additional information besides the URL: it now includes the (fetch) status, protocolStatus and parseStatus.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33988 r34005  
    6262 
    6363    static boolean DEBUG_MODE = true; // this is set to false in main() at the end of this class 
    64  
     64        
    6565    /** Counter for number of sites. 
    6666     * Should be equal to number of times NutchTextDumpToMongoDB constructor 
     
    7676    public final boolean siteCrawlUnfinished; 
    7777    public final long siteCrawledTimestamp; /** When the crawl of the site terminated */ 
     78 
     79    // private handle to a csv writer 
     80    private CSVPrinter emptyWebPageInfoCSVPrinter; 
    7881     
    7982    private int countOfWebPagesWithBodyText = 0; 
     
    119122 
    120123    /** A NutchTextDumpToMongoDB processes the dump.txt for one site */ 
    121     public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess, 
     124    public NutchTextDumpToMongoDB(MongoDBAccess mongodbAccess, CSVPrinter emptyWebPageInfoCSVPrinter, 
    122125                  MaoriTextDetector maoriTxtDetector, String siteID, 
    123126                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 
     
    126129    // increment static counter of sites processed by a NutchTextDumpToMongoDB instance 
    127130    SITE_COUNTER++; 
     131 
     132    // keep a handle to the csv file writer 
     133    this.emptyWebPageInfoCSVPrinter = emptyWebPageInfoCSVPrinter; 
    128134     
    129135    // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder 
     
    251257         
    252258        if(text.equals("")) { 
    253         System.err.println("siteID: " + siteID + "- Empty page " + i + " - URL: " 
    254                    + page.getPageURL()); 
     259        System.err.println(siteID + ",Empty page " + i + "," + page.getPageURL() 
     260                   + "," + page.get("status") 
     261                   + "," + page.get("protocolStatus") 
     262                   + "," + page.get("parseStatus")); 
     263        // write information about any empty web page into the emptyPage csv file 
     264        emptyWebPageInfoCSVPrinter.printRecord(siteID, i, page.getPageURL(), 
     265               page.get("status"), page.get("protocolStatus"),page.get("parseStatus")); 
    255266         
    256267        // don't care about empty pages 
     
    385396 
    386397    NutchTextDumpToMongoDB.DEBUG_MODE = false; 
    387  
    388398     
    389399    try ( 
    390400         MongoDBAccess mongodb = new MongoDBAccess(); 
     401         CSVPrinter emptyWebPageInfoCSVPrinter = new CSVPrinter(new FileWriter("InfoOnEmptyPagesNotInMongoDB.csv"), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); 
    391402         ) { 
    392403         
    393404        mongodb.connectToDB(); 
    394405        //mongodb.showCollections(); 
    395          
     406 
     407        // write out csv column headings into the csv file on empty web pages 
     408        emptyWebPageInfoCSVPrinter.printRecord("siteID","pagenum","URL","(fetch)status","protocolStatus","parseStatus"); 
     409         
    396410        // print out the column headers for the websites csv file 
    397411        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 
     
    426440            logger.debug("@@@ Processing siteID: " + siteID);            
    427441            NutchTextDumpToMongoDB nutchTxtDump = new NutchTextDumpToMongoDB( 
    428                  mongodb, mriTxtDetector, 
     442                 mongodb, emptyWebPageInfoCSVPrinter, mriTxtDetector, 
    429443                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
    430444            // now it's parsed all the web pages in the site's text dump