Changeset 33601

Show
Ignore:
Timestamp:
23.10.2019 23:22:14 (3 weeks ago)
Author:
ak19
Message:

Creates the second CSV file (webpages.csv), with info about web pages. At present it also stores irrelevant records, such as web pages with no content (no text body) or web pages that were neither detected as MRI overall nor contained any MRI sentences. In the subsequent commit, these web pages will be discarded.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33600 r33601  
    9999 
    100100    /** A NutchTextDumpProcessor processes the dump.txt for one site */ 
    101     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, 
     101    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, 
     102                  MaoriTextDetector maoriTxtDetector, String siteID, 
    102103                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 
     104    throws IOException 
    103105    {    
    104106    // increment static counter of sites processed by a NutchTextDumpProcessor instance 
     
    184186    } 
    185187     
    186     prepareSiteStats(); 
     188    prepareSiteStats(webpagesCSVPrinter); 
    187189    } 
    188190     
     
    225227    } 
    226228     
    227     private void prepareSiteStats() { 
     229    private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException { 
    228230    pagesInMRI = new ArrayList<MRIWebPageStats>(); 
    229231    pagesContainingMRI = new ArrayList<MRIWebPageStats>(); 
     
    235237         
    236238        String text = page.getPageText(); 
     239        String url = page.getPageURL(); 
     240         
    237241        if(text.equals("")) { 
    238242        page.addMRILanguageStatus(false); 
     243 
     244        // write to webpage CSV for all pages 
     245        // not just those webpages that are overall MRI, or those that contain MRI sentences 
     246        // CSV column headings: 
     247        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 
     248        webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);       
    239249        continue; 
    240250        } 
     
    248258        // are in te reo. 
    249259        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); 
    250         // first element is always total num sentences 
     260        // first element of ArrayList returned is always total num sentences on page 
    251261        // remaining elements are the actual sentences that were detected as being Māori 
    252262        int totalSentences = Integer.parseInt(mriSentences.get(0)); 
    253         int numSentencesInMRI = mriSentences.size() - 1;             
    254          
     263        int numSentencesInMRI = mriSentences.size() - 1;         
    255264 
    256265        // Add page to list of MRI pages if the page's body text overall was detected 
     
    259268        // were detected as being in MRI 
    260269        if(isMRI || numSentencesInMRI >= 1) { 
    261             String url = page.getPageURL(); 
     270             
    262271            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 
    263272                           totalSentences, numSentencesInMRI); 
     
    268277            } 
    269278 
    270         }        
    271         } 
    272  
    273          
    274          
    275          
     279        } 
     280         
     281        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 
     282        webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI); 
     283        }        
    276284    } 
    277285    } 
     
    413421     
    414422    File websitesCSVFile = new File(sitesDir, "websites.csv"); 
    415      
    416     try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) { 
     423    File webpagesCSVFile = new File(sitesDir, "webpages.csv"); 
     424     
     425    try ( 
     426         CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT); 
     427         CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT); 
     428         ) { 
    417429 
    418430        // print out the column headers for the websites csv file 
     
    420432                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 
    421433                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 
    422  
     434        webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI", 
     435                       "numSentences", "numSentencesInMRI"); 
    423436         
    424437        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 
     
    445458            debug("Found siteID: " + siteID);            
    446459            NutchTextDumpProcessor nutchTxtDump 
    447                 = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
     460                = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
    448461            // now it's parsed all the web pages in the site's text dump 
    449462