Changeset 33587 for gs3-extensions


Ignore:
Timestamp:
2019-10-18T23:16:25+13:00 (5 years ago)
Author:
ak19
Message:
  1. Better stats reporting on crawled sites: not just if a page was in MRI or not, but for those that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33582 r33587  
    1111    public final String URL; // URL of webpage
    1212    public final int pageID; // index into NutchTextDumpProcessor::pages ArrayList
     13
     14    public final boolean isMRI;
     15    public final int numSentences; // count of all sentences in the webpage's body
     16    public final int numSentencesInMRI; // count of sentences in the webpage's body in Māori (mri)
     17
    1318   
    14     public MRIWebPageStats(String siteID, String url, int pageID) {
     19    public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI,
     20               int numSentences, int numSentencesInMRI)
     21    {
    1522    this.siteID = siteID;
    1623    this.URL = url;
    1724    this.pageID = pageID;
     25
     26    this.isMRI = isMRI;
     27    this.numSentences = numSentences;
     28    this.numSentencesInMRI = numSentencesInMRI;
     29    }
     30
     31    public String toString() {
     32    StringBuilder str = new StringBuilder();
     33    str.append("URL: " + this.URL);
     34    str.append("\nsiteID: " + this.siteID);
     35    str.append("\nnum sentences in MRI: " + this.numSentencesInMRI+"/"+this.numSentences); 
     36    if(this.isMRI && this.numSentencesInMRI <= 0) {
     37        // this page was detected as MRI overall, but none of its individual sentences were detected as MRI
     38        str.append(" (no PROPER sentences in MRI)");
     39    }
     40    return str.toString();
    1841    }
    1942}
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

    r33586 r33587  
    6969     * @param text: the string of text from which sentences in the requested
    7070     * language are to be identified and returned.
    71      * @return an ArrayList of sentences in the text parameter that are
    72      * in the requested language.
    73      */
    74     public ArrayList<String> getAllSentencesInMaori(String text) throws Exception {
     71     * @return an ArrayList where:
     72     *   - the first element is the total number of sentences in the text parameter
     73     *   - remaining elements are the sentences in the text parameter that were in the
     74     *     requested language.
     75     */
     76    public ArrayList<String> getAllSentencesInMaori(String text) {
    7577    // big assumption here: that we can split incoming text into sentences
    7678    // for any language (using the Māori language trained sentence model),
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33582 r33587  
    22
    33import java.io.*;
     4import java.lang.ArrayIndexOutOfBoundsException;
    45import java.util.ArrayList;
    5 //import java.util.HashMap;
    6 //import java.util.Map;
    7 import java.lang.ArrayIndexOutOfBoundsException;
     6import java.util.Arrays;
    87
    98import org.apache.log4j.Logger;
    109
    1110/**
    12  * Class to process the dump text files produced for each site (e.g. site "00001") that
     11 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
    1312 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
    1413 * This reads in the dump.txt file contained in each site folder within the input folder.
     
    5958    private ArrayList<TextDumpPage> pages;
    6059
     60    /** list of pages in this site which were detected as being in MRI */
    6161    private ArrayList<MRIWebPageStats> pagesInMRI;
    62 
     62    /** list of pages in this site which were NOT detected as being in MRI but nevertheless
     63     * contain one or more sentences in MRI
     64     */
     65    private ArrayList<MRIWebPageStats> pagesContainingMRI;
     66   
    6367    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
    6468    // The start of a new web page's record in nutch's text dump of an entire site
     
    147151    }
    148152
    149     // Just do this once: get domain of site.
     153    // Just do this once: get and store domain of site.
    150154    // Passing true to get domain with protocol prefix
    151155    if(pages.size() > 0) {
    152         TextDumpPage firstPage = pages.get(0);     
    153         String url = firstPage.getPageURL();       
    154         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);     
     156        TextDumpPage firstPage = pages.get(0);
     157        String url = firstPage.getPageURL();
     158        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
    155159    }
    156160    else {
     
    195199    private void prepareSiteStats() {
    196200    pagesInMRI = new ArrayList<MRIWebPageStats>();
    197 
     201    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
    198202
    199203    TextDumpPage page = null;
    200204    for(int i = 0; i < pages.size(); i++) {
    201 
    202205       
    203206        page = pages.get(i);
    204 
    205         /*
    206         // just do this once: get domain. Passing true to get domain with protocol prefix
    207         if(this.domainOfSite == null) {     
    208         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
    209         }
    210         */
    211207       
    212208        String text = page.getPageText();
     
    217213        else {
    218214        boolean isMRI = maoriTxtDetector.isTextInMaori(text);
     215       
    219216        page.addMRILanguageStatus(isMRI);
    220217
    221         if(isMRI) { // add page to list of meaningful pages.           
     218        // Even if the entire page is not found to be overall in Māori,
     219        // let's still inspect the sentences of the page and count how many (if any)
     220        // are in te reo.
     221        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
     222        // first element is always total num sentences
     223        // remaining elements are the actual sentences that were detected as being Māori
     224        int totalSentences = Integer.parseInt(mriSentences.get(0));
     225        int numSentencesInMRI = mriSentences.size() - 1;           
     226       
     227
     228        // Add page to list of MRI pages if the page's body text overall was detected
     229        // as Māori
     230        // Add page to list of pages containing MRI if >= 1 sentences in the page
     231        // were detected as being in MRI
     232        if(isMRI || numSentencesInMRI >= 1) {
    222233            String url = page.getPageURL();
    223             MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
    224             pagesInMRI.add(MRIpageStats);
    225         }
    226        
     234            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
     235                           totalSentences, numSentencesInMRI);
     236            if(isMRI) {   
     237            pagesInMRI.add(MRIpageStats);
     238            } else if(numSentencesInMRI >= 1) {
     239            pagesContainingMRI.add(MRIpageStats);
     240            }
     241
     242        }       
    227243        }
    228244    }
     
    237253    info("Total number of web pages in site: " + pages.size());
    238254    info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
    239 
    240     info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
    241     for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
    242         info("URL: " + mriWebPageInfo.URL);
    243         info("siteID: " + mriWebPageInfo.siteID);
     255   
     256    if(pagesInMRI.size() > 0) {
     257        info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
     258        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
     259        info(mriWebPageInfo.toString());
     260        }
     261    }
     262
     263    info("                      -----------                   ");
     264    if(pagesContainingMRI.size() > 0) {     
     265        info("The following pages weren't detected as primarily being in Māori");
     266        info("But still contained sentences detected as Māori");
     267        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
     268        info(mriWebPageInfo.toString());
     269        }
     270       
     271    } else {
     272        info("No further pages detected as containing any sentences in MRI");     
    244273    }
    245274    info("                      -----------                   ");
     
    292321        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
    293322        File[] sites = sitesDir.listFiles();
     323
     324        // sort site folders in alphabetical order
     325        // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
     326        Arrays.sort(sites);
     327       
    294328        for(File siteDir : sites) { // e.g. 00001
    295329        if(siteDir.isDirectory()) {
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33586 r33587  
    154154     * for that sentence. The confidence cutoff provides an additional check.
    155155     * @return null if no Sentence Detection Model set up in constructor
    156      * else returns an ArrayList of sentences in the text parameter that are
    157      * in the requested language.     
     156     * else returns an ArrayList where:
     157     *   - the first element is the total number of sentences in the text parameter
     158     *   - remaining elements are the sentences in the text parameter that were in the
     159     *     requested language.
    158160     */
    159161    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
     
    180182   
    181183    String[] sentences = sentenceDetector.sentDetect(text);
     184    if(sentences == null) {
     185        sentencesInLang.add("0"); // first element is the total number of sentences in text: 0 here
     186        return sentencesInLang;
     187    }
     188
     189    // add in first element: how many sentences there were in text.
     190    sentencesInLang.add(Integer.toString(sentences.length));
    182191   
    183192    for(int i = 0; i < sentences.length; i++) {
Note: See TracChangeset for help on using the changeset viewer.