Changeset 33602 for gs3-extensions


Ignore:
Timestamp:
2019-10-23T23:49:34+13:00 (5 years ago)
Author:
ak19
Message:
  1. The final csv file, mri-sentences.csv, is now written out. 2. Only pages that are overall in MRI or contain any MRI sentences get entries in the webpages csv file now. 3. Corrections to ID columns written to the webpages and websites csv files. 4. Some cleanup of unused code.
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33600 r33602  
    4242
    4343    /** for converting to csv */
     44    /*
     45      Unused.
    4446    public String[] toCSV() {
    4547    String[] csvRecord = { Integer.toString(pageID),
     
    5355    return csvRecord;
    5456    }
     57    */
    5558}
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33601 r33602  
    5858     */
    5959    static private int SITE_COUNTER = 0;
     60    static private long WEBPAGE_COUNTER = 0;
     61    static private long MRI_SENTENCE_COUNTER = 0;
    6062   
    6163    private final MaoriTextDetector maoriTxtDetector;
     
    99101
    100102    /** A NutchTextDumpProcessor processes the dump.txt for one site */
    101     public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter,
     103    public NutchTextDumpProcessor(CSVPrinter webpagesCSVPrinter, CSVPrinter mriSentencesCSVPrinter,
    102104                  MaoriTextDetector maoriTxtDetector, String siteID,
    103105                  File txtDumpFile, long lastModified, boolean siteCrawlUnfinished)
     
    186188    }
    187189   
    188     prepareSiteStats(webpagesCSVPrinter);
     190    prepareSiteStats(webpagesCSVPrinter, mriSentencesCSVPrinter);
    189191    }
    190192   
     
    227229    }
    228230   
    229     private void prepareSiteStats(CSVPrinter webpageCSVPrinter) throws IOException {
     231    private void prepareSiteStats(CSVPrinter webpageCSVPrinter, CSVPrinter mriSentencesCSVPrinter) throws IOException {
    230232    pagesInMRI = new ArrayList<MRIWebPageStats>();
    231233    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
     
    237239       
    238240        String text = page.getPageText();
    239         String url = page.getPageURL();
    240241       
    241242        if(text.equals("")) {
    242243        page.addMRILanguageStatus(false);
    243 
    244         // write to webpage CSV for all pages
    245         // not just those webpages that are overall MRI, or those that contain MRI sentences
    246         // CSV column headings:
    247         // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
    248         webpageCSVPrinter.printRecord(i, this.siteID, url, "false", 0, 0);     
    249244        continue;
    250245        }
     
    268263        // were detected as being in MRI
    269264        if(isMRI || numSentencesInMRI >= 1) {
    270            
     265            String url = page.getPageURL();
    271266            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
    272267                           totalSentences, numSentencesInMRI);
     
    277272            }
    278273
    279         }
    280        
    281         // pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
    282         webpageCSVPrinter.printRecord(i, this.siteID, url, isMRI, totalSentences, numSentencesInMRI);
     274            // Only write to webpages csv file for those pages that had any MRI
     275            // language content.
     276            // column headers:
     277            //    pageID, siteID, URL, isMRI, numSentences, numSentencesInMRI
     278            //int pageID = i; // not primary key by itself,
     279                    // must be combined with siteID to form primary key
     280           
     281            webpageCSVPrinter.printRecord(WEBPAGE_COUNTER++,
     282                          SITE_COUNTER, /* alternative: this.siteID */
     283                          url, isMRI, totalSentences, numSentencesInMRI);
     284
     285            // Write the sentences that are in te reo into the mri-sentences CSV file
     286            // whether from webpages that are MRI overall or only those containing
     287            // any sentences in MRI
     288            for (int j = 1; j < mriSentences.size(); j++) { // 1st element not a sentence
     289            //int sentenceID = j; // combine with siteID and pageID to form primary key
     290            String mriSentence = mriSentences.get(j);
     291            // sentenceID, pageID, sentence
     292            //mriSentencesCSVPrinter.printRecord(sentenceID, pageID, mriSentence);
     293            mriSentencesCSVPrinter.printRecord(MRI_SENTENCE_COUNTER++, WEBPAGE_COUNTER, mriSentence);
     294            }
     295        }       
     296
    283297        }       
    284298    }
     
    342356    }
    343357
    344     /*
    345     public void sitePagesToCSV(CSVPrinter webpageCSVPrinter, ArrayList<String> mriSentences)
    346     throws IOException
    347     {
    348     int totalSentences
    349    
    350     for(int i = 0; i < )
    351         printer.printRecord();
    352        
    353     } catch (IOException ex) {
    354         ex.printStackTrace();
    355     }
    356     }
    357     */
    358    
    359     /*
    360     public void xsitePagesToCSV(File webpageCSVFile, ArrayList<String> mriSentences) {
    361     // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
    362 
    363     //CSVFormat csvFormat = CSVFormat.DEFAULT.
    364     //    withHeader("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
    365    
    366     try (CSVPrinter printer = new CSVPrinter(new FileWriter(webpageCSVFile), CSVFormat.DEFAULT)) {
    367         // header
    368         //printer.printRecord("pageID", "siteID", "URL", "isMRI", "numSentences", "numSentencesInMRI");
    369         // skip first one
    370        
    371         for()
    372         printer.printRecord();
    373        
    374     } catch (IOException ex) {
    375         ex.printStackTrace();
    376     }
    377     }
    378     */
    379358   
    380359    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
     
    422401    File websitesCSVFile = new File(sitesDir, "websites.csv");
    423402    File webpagesCSVFile = new File(sitesDir, "webpages.csv");
     403    File mriSentencesCSVFile = new File(sitesDir, "mri-sentences.csv");
    424404   
    425405    try (
    426406         CSVPrinter websitesCSVPrinter = new CSVPrinter(new FileWriter(websitesCSVFile), CSVFormat.DEFAULT);
    427407         CSVPrinter webpagesCSVPrinter = new CSVPrinter(new FileWriter(webpagesCSVFile), CSVFormat.DEFAULT);
     408         CSVPrinter mriSentencesCSVPrinter = new CSVPrinter(new FileWriter(mriSentencesCSVFile), CSVFormat.DEFAULT);
    428409         ) {
    429410
    430411        // print out the column headers for the websites csv file
    431         websitesCSVPrinter.printRecord("ID", "siteID", "domainURL",
    432                    "totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
    433                    "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
    434         webpagesCSVPrinter.printRecord("pageID", "siteID", "URL", "isMRI",
     412        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
     413        websitesCSVPrinter.printRecord("ID" /*websiteID*/, "siteID"/* site folder name*/,
     414           "domainURL","totalPagesInSite", "numPagesInMRI", "numOtherPagesContainingMRI",
     415           "nutchCrawlTimestamp", "crawlUnfinished", "redoCrawl");
     416        webpagesCSVPrinter.printRecord("webpageID", "websiteID", "URL", "isMRI",
    435417                       "numSentences", "numSentencesInMRI");
     418        mriSentencesCSVPrinter.printRecord("sentenceID", "webpageID", "sentence");
    436419       
    437420        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
     
    457440            long lastModified = siteDir.lastModified();
    458441            debug("Found siteID: " + siteID);           
    459             NutchTextDumpProcessor nutchTxtDump
    460                 = new NutchTextDumpProcessor(webpagesCSVPrinter, mriTxtDetector, siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
     442            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(
     443                 webpagesCSVPrinter, mriSentencesCSVPrinter, mriTxtDetector,
     444                 siteID, txtDumpFile, lastModified, UNFINISHED_FILE.exists());
    461445            // now it's parsed all the web pages in the site's text dump
    462446
Note: See TracChangeset for help on using the changeset viewer.