Changeset 33601


Ignore:
Timestamp:
2019-10-23T23:22:14+13:00 (4 years ago)
Author:
ak19
Message:

Creates the second CSV file, with info about web pages. At present it also stores irrelevant records, such as web pages with no content (no text body) or web pages that were neither detected as MRI overall nor contained any MRI sentences. In the subsequent commit, these web pages will be discarded.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33600 r33601  
    9999
    100100    /** A NutchTextDumpProcessor processes the dump.txt for one site */
    101     public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID,
     101    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter,
     102                  MaoriTextDetector maoriTxtDetector, String siteID,
    102103                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
     104    throws IOException
    103105    {   
    104106    // increment static counter of sites processed by a NutchTextDumpProcessor instance
     
    184186    }
    185187   
    186     prepareSiteStats();
     188    prepareSiteStats(webpagesCSVPrinter);
    187189    }
    188190   
     
    225227    }
    226228   
    227     private void prepareSiteStats() {
     229    private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException {
    228230    pagesInMRI = new ArrayList<MRIWebPageStats>();
    229231    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
     
    235237       
    236238        String text = page.getPageText();
     239        String url = page.getPageURL();
     240       
    237241        if(text.equals("")) {
    238242        page.addMRILanguageStatus(false);
     243
     244        // write to webpage CSV for all pages
     245        // not just those webpages that are overall MRI, or those that contain MRI sentences
     246        // CSV column headings:
     247        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
     248        webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);     
    239249        continue;
    240250        }
     
    248258        // are in te reo.
    249259        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
    250         // first element is always total num sentences
     260        // first element of ArrayList returned is always total num sentences on page
    251261        // remaining elements are the actual sentences that were detected as being Māori
    252262        int totalSentences = Integer.parseInt(mriSentences.get(0));
    253         int numSentencesInMRI = mriSentences.size() - 1;           
    254        
     263        int numSentencesInMRI = mriSentences.size() - 1;       
    255264
    256265        // Add page to list of MRI pages if the page's body text overall was detected
     
    259268        // were detected as being in MRI
    260269        if(isMRI || numSentencesInMRI >= 1) {
    261             String url = page.getPageURL();
     270           
    262271            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
    263272                           totalSentences, numSentencesInMRI);
     
    268277            }
    269278
    270         }       
    271         }
    272 
    273        
    274        
    275        
     279        }
     280       
     281        // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
     282        webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI);
     283        }       
    276284    }
    277285    }
     
    413421   
    414422    File websitesCSVFile = new File(sitesDir, "websites.csv");
    415    
    416     try (CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT)) {
     423    File webpagesCSVFile = new File(sitesDir, "webpages.csv");
     424   
     425    try (
     426         CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
     427         CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
     428         ) {
    417429
    418430        // print out the column headers for the websites csv file
     
    420432                   "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
    421433                   "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
    422 
     434        webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI",
     435                       "numSentences", "numSentencesInMRI");
    423436       
    424437        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
     
    445458            debug("Found siteID: " + siteID);           
    446459            NutchTextDumpProcessor nutchTxtDump
    447                 = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
     460                = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
    448461            // now it's parsed all the web pages in the site's text dump
    449462
Note: See TracChangeset for help on using the changeset viewer.