Changeset 33602

Show
Ignore:
Timestamp:
23.10.2019 23:49:34 (3 weeks ago)
Author:
ak19
Message:

1. The final csv file, mri-sentences.csv, is now written out. 2. Only pages that are overall in MRI or contain any MRI sentences get entries in the webpages csv file now. 3. Corrections to ID columns written to the webpages and websites csv files. 4. Some cleanup of unused code.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33600 r33602  
    4242 
    4343    /** for converting to csv */ 
     44    /* 
     45      Unused. 
    4446    public String[] toCSV() { 
    4547    String[] csvRecord = { Integer.toString(pageID), 
     
    5355    return csvRecord; 
    5456    } 
     57    */ 
    5558} 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33601 r33602  
    5858     */ 
    5959    static private int SITE_COUNTER = 0; 
     60    static private long WEBPAGE_COUNTER = 0; 
     61    static private long MRI_SENTENCE_COUNTER = 0; 
    6062     
    6163    private final MaoriTextDetector maoriTxtDetector; 
     
    99101 
    100102    /** A NutchTextDumpProcessor processes the dump.txt for one site */ 
    101     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, 
     103    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter, 
    102104                  MaoriTextDetector maoriTxtDetector, String siteID, 
    103105                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished) 
     
    186188    } 
    187189     
    188     prepareSiteStats(webpagesCSVPrinter); 
     190    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter); 
    189191    } 
    190192     
     
    227229    } 
    228230     
    229     private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException { 
     231    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException { 
    230232    pagesInMRI = new ArrayList<MRIWebPageStats>(); 
    231233    pagesContainingMRI = new ArrayList<MRIWebPageStats>(); 
     
    237239         
    238240        String text = page.getPageText(); 
    239         String url = page.getPageURL(); 
    240241         
    241242        if(text.equals("")) { 
    242243        page.addMRILanguageStatus(false); 
    243  
    244         // write to webpage CSV for all pages 
    245         // not just those webpages that are overall MRI, or those that contain MRI sentences 
    246         // CSV column headings: 
    247         // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 
    248         webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);       
    249244        continue; 
    250245        } 
     
    268263        // were detected as being in MRI 
    269264        if(isMRI || numSentencesInMRI >= 1) { 
    270              
     265            String url = page.getPageURL(); 
    271266            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 
    272267                           totalSentences, numSentencesInMRI); 
     
    277272            } 
    278273 
    279         } 
    280          
    281         // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 
    282         webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI); 
     274            // Only write to webpages csv file for those pages that had any MRI 
     275            // language content. 
     276            // column headers: 
     277            //    pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI 
     278            //int pageID = i; // not primary key by itself, 
     279                    // must be combined with siteID to form primary key 
     280             
     281            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++, 
     282                          SITE_COUNTER, /* alternative: this.siteID */ 
     283                          url, isMRI, totalSentences, numSentencesInMRI); 
     284 
     285            // Write the sentences that are in te reo into the mri-sentences CSV file 
     286            // whether from webpages that are MRI overall or only those that contain 
     287            // any sentences in MRI 
     288            for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence 
     289            //int sentenceID = j; // combine with siteID and pageID to form primary key 
     290            String mriSentence = mriSentences.get(j); 
     291            // sentenceID, pageID, sentence 
     292            //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence); 
     293            mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, WEBPAGE_COUNTER, mriSentence); 
     294            } 
     295        }        
     296 
    283297        }        
    284298    } 
     
    342356    } 
    343357 
    344     /* 
    345     public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences) 
    346     throws IOException 
    347     { 
    348     int totalSentences 
    349      
    350     for(int i = 0; i < ) 
    351         printer.printRecord(); 
    352          
    353     } catch (IOException ex) { 
    354         ex.printStackTrace(); 
    355     } 
    356     } 
    357     */ 
    358      
    359     /* 
    360     public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) { 
    361     // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 
    362  
    363     //CSVFormat csvFormat = CSVFormat.DEFAULT. 
    364     //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 
    365      
    366     try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) { 
    367         // header 
    368         //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI"); 
    369         // skip first one 
    370          
    371         for() 
    372         printer.printRecord(); 
    373          
    374     } catch (IOException ex) { 
    375         ex.printStackTrace(); 
    376     } 
    377     } 
    378     */ 
    379358     
    380359    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- // 
     
    422401    File websitesCSVFile = new File(sitesDir, "websites.csv"); 
    423402    File webpagesCSVFile = new File(sitesDir, "webpages.csv"); 
     403    File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv"); 
    424404     
    425405    try ( 
    426406         CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT); 
    427407         CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT); 
     408         CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT); 
    428409         ) { 
    429410 
    430411        // print out the column headers for the websites csv file 
    431         websitesCSVPrinter.printRecord("ID", "siteID", "domainURL", 
    432                    "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 
    433                    "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 
    434         webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI", 
     412        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 
     413        websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/, 
     414           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI", 
     415           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl"); 
     416        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI", 
    435417                       "numSentences", "numSentencesInMRI"); 
     418        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence"); 
    436419         
    437420        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 
     
    457440            long lastModified = siteDir.lastModified(); 
    458441            debug("Found siteID: " + siteID);            
    459             NutchTextDumpProcessor nutchTxtDump 
    460                 = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
     442            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor( 
     443                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector, 
     444                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists()); 
    461445            // now it's parsed all the web pages in the site's text dump 
    462446