Ignore:
Timestamp:
2019-10-18T23:16:25+13:00 (5 years ago)
Author:
ak19
Message:
  1. Better stats reporting on crawled sites: not just whether a page was in MRI or not, but for those pages that contained any text, there's now also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later if and when we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences found in the text parameter as the first element of the returned ArrayList.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33582 r33587  
    22
    33import java.io.*;
     4import java.lang.ArrayIndexOutOfBoundsException;
    45import java.util.ArrayList;
    5 //import java.util.HashMap;
    6 //import java.util.Map;
    7 import java.lang.ArrayIndexOutOfBoundsException;
     6import java.util.Arrays;
    87
    98import org.apache.log4j.Logger;
    109
    1110/**
    12  * Class to process the dump text files produced for each site (e.g. site "00001") that
     11 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
    1312 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
    1413 * This reads in the dump.txt file contained in each site folder within the input folder.
     
    5958    private ArrayList<TextDumpPage> pages;
    6059
     60    /** list of pages in this site which were detected as being in MRI */
    6161    private ArrayList<MRIWebPageStats> pagesInMRI;
    62 
     62    /** list of pages in this site which were NOT detected as being in MRI but nevertheless
     63     * contain one or more sentences in MRI
     64     */
     65    private ArrayList<MRIWebPageStats> pagesContainingMRI;
     66   
    6367    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
    6468    // The start of a new web page's record in nutch's text dump of an entire site
     
    147151    }
    148152
    149     // Just do this once: get domain of site.
     153    // Just do this once: get and store domain of site.
    150154    // Passing true to get domain with protocol prefix
    151155    if(pages.size() > 0) {
    152         TextDumpPage firstPage = pages.get(0);     
    153         String url = firstPage.getPageURL();       
    154         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);     
     156        TextDumpPage firstPage = pages.get(0);
     157        String url = firstPage.getPageURL();
     158        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
    155159    }
    156160    else {
     
    195199    private void prepareSiteStats() {
    196200    pagesInMRI = new ArrayList<MRIWebPageStats>();
    197 
     201    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
    198202
    199203    TextDumpPage page = null;
    200204    for(int i = 0; i < pages.size(); i++) {
    201 
    202205       
    203206        page = pages.get(i);
    204 
    205         /*
    206         // just do this once: get domain. Passing true to get domain with protocol prefix
    207         if(this.domainOfSite == null) {     
    208         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
    209         }
    210         */
    211207       
    212208        String text = page.getPageText();
     
    217213        else {
    218214        boolean isMRI = maoriTxtDetector.isTextInMaori(text);
     215       
    219216        page.addMRILanguageStatus(isMRI);
    220217
    221         if(isMRI) { // add page to list of meaningful pages.           
     218        // Even if the entire page is not found to be overall in Māori,
     219        // let's still inspect the sentences of the page and count how many (if any)
     220        // are in te reo.
     221        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
     222        // first element is always total num sentences
     223        // remaining elements are the actual sentences that were detected as being Māori
     224        int totalSentences = Integer.parseInt(mriSentences.get(0));
     225        int numSentencesInMRI = mriSentences.size() - 1;           
     226       
     227
     228        // Add page to list of MRI pages if the page's body text overall was detected
     229        // as Māori
     230        // Add page to list of pages containing MRI if >= 1 sentences in the page
     231        // were detected as being in MRI
     232        if(isMRI || numSentencesInMRI >= 1) {
    222233            String url = page.getPageURL();
    223             MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
    224             pagesInMRI.add(MRIpageStats);
    225         }
    226        
     234            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
     235                           totalSentences, numSentencesInMRI);
     236            if(isMRI) {   
     237            pagesInMRI.add(MRIpageStats);
     238            } else if(numSentencesInMRI >= 1) {
     239            pagesContainingMRI.add(MRIpageStats);
     240            }
     241
     242        }       
    227243        }
    228244    }
     
    237253    info("Total number of web pages in site: " + pages.size());
    238254    info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
    239 
    240     info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
    241     for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
    242         info("URL: " + mriWebPageInfo.URL);
    243         info("siteID: " + mriWebPageInfo.siteID);
     255   
     256    if(pagesInMRI.size() > 0) {
     257        info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
     258        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
     259        info(mriWebPageInfo.toString());
     260        }
     261    }
     262
     263    info("                      -----------                   ");
     264    if(pagesContainingMRI.size() > 0) {     
     265        info("The following pages weren't detected as primarily being in Māori");
     266        info("But still contained sentences detected as Māori");
     267        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
     268        info(mriWebPageInfo.toString());
     269        }
     270       
     271    } else {
     272        info("No further pages detected as containing any sentences in MRI");     
    244273    }
    245274    info("                      -----------                   ");
     
    292321        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
    293322        File[] sites = sitesDir.listFiles();
     323
     324        // sort site folders in alphabetical order
     325        // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
     326        Arrays.sort(sites);
     327       
    294328        for(File siteDir : sites) { // e.g. 00001
    295329        if(siteDir.isDirectory()) {
Note: See TracChangeset for help on using the changeset viewer.