Changeset 33587

Show
Ignore:
Timestamp:
18.10.2019 23:16:25 (4 weeks ago)
Author:
ak19
Message:

1. Better stats reporting on crawled sites: not just whether a page was in MRI or not, but for those pages that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

    r33582 r33587  
    1111    public final String URL; // URL of webpage 
    1212    public final int pageID; // index into NutchTextDumpProcessor::pages ArrayList 
     13 
     14    public final boolean isMRI; 
     15    public final int numSentences; // count of all sentences in the webpage's body 
     16    public final int numSentencesInMRI; // count of sentences in the webpage's body in Māori (mri) 
     17 
    1318     
    14     public MRIWebPageStats(String siteID, String url, int pageID) { 
     19    public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI, 
     20               int numSentences, int numSentencesInMRI) 
     21    { 
    1522    this.siteID = siteID; 
    1623    this.URL = url; 
    1724    this.pageID = pageID; 
     25 
     26    this.isMRI = isMRI; 
     27    this.numSentences = numSentences; 
     28    this.numSentencesInMRI = numSentencesInMRI; 
     29    } 
     30 
     31    public String toString() { 
     32    StringBuilder str = new StringBuilder(); 
     33    str.append("URL: " + this.URL); 
     34    str.append("\nsiteID: " + this.siteID); 
     35    str.append("\nnum sentences in MRI: " + this.numSentencesInMRI+"/"+this.numSentences);   
     36    if(this.isMRI && this.numSentencesInMRI <= 0) { 
     37        // the page's text was detected as MRI overall, but none of its individual sentences were detected as MRI 
     38        str.append(" (no PROPER sentences in MRI)"); 
     39    } 
     40    return str.toString(); 
    1841    } 
    1942} 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

    r33586 r33587  
    6969     * @param text: the string of text from which sentences in the requested 
    7070     * language are to be identified and returned. 
    71      * @return an ArrayList of sentences in the text parameter that are 
    72      * in the requested language. 
    73      */ 
    74     public ArrayList<String> getAllSentencesInMaori(String text) throws Exception { 
     71     * @return an ArrayList where: 
     72     *   - the first element is the total number of sentences in the text parameter 
     73     *   - remaining elements are the sentences in the text parameter that were in the 
     74     *     requested language. 
     75     */ 
     76    public ArrayList<String> getAllSentencesInMaori(String text) { 
    7577    // big assumption here: that we can split incoming text into sentences 
    7678    // for any language (using the Māori language trained sentence model), 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33582 r33587  
    22 
    33import java.io.*; 
     4import java.lang.ArrayIndexOutOfBoundsException; 
    45import java.util.ArrayList; 
    5 //import java.util.HashMap; 
    6 //import java.util.Map; 
    7 import java.lang.ArrayIndexOutOfBoundsException; 
     6import java.util.Arrays; 
    87 
    98import org.apache.log4j.Logger; 
    109 
    1110/** 
    12  * Class to process the dump text files produced for each site (e.g. site "00001") that 
     11 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that 
    1312 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt. 
    1413 * This reads in the dump.txt file contained in each site folder within the input folder. 
     
    5958    private ArrayList<TextDumpPage> pages; 
    6059 
     60    /** list of pages in this site which were detected as being in MRI */ 
    6161    private ArrayList<MRIWebPageStats> pagesInMRI; 
    62  
     62    /** list of pages in this site which were NOT detected as being in MRI but nevertheless 
     63     * contain one or more sentences in MRI  
     64     */ 
     65    private ArrayList<MRIWebPageStats> pagesContainingMRI; 
     66     
    6367    private boolean isStartOfNewWebPageRecord(String prevLine, String line) { 
    6468    // The start of a new web page's record in nutch's text dump of an entire site 
     
    147151    } 
    148152 
    149     // Just do this once: get domain of site. 
     153    // Just do this once: get and store domain of site. 
    150154    // Passing true to get domain with protocol prefix 
    151155    if(pages.size() > 0) { 
    152         TextDumpPage firstPage = pages.get(0);       
    153         String url = firstPage.getPageURL();         
    154         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);       
     156        TextDumpPage firstPage = pages.get(0); 
     157        String url = firstPage.getPageURL(); 
     158        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 
    155159    } 
    156160    else { 
     
    195199    private void prepareSiteStats() { 
    196200    pagesInMRI = new ArrayList<MRIWebPageStats>(); 
    197  
     201    pagesContainingMRI = new ArrayList<MRIWebPageStats>(); 
    198202 
    199203    TextDumpPage page = null; 
    200204    for(int i = 0; i < pages.size(); i++) { 
    201  
    202205         
    203206        page = pages.get(i); 
    204  
    205         /* 
    206         // just do this once: get domain. Passing true to get domain with protocol prefix 
    207         if(this.domainOfSite == null) {      
    208         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true); 
    209         } 
    210         */ 
    211207         
    212208        String text = page.getPageText(); 
     
    217213        else { 
    218214        boolean isMRI = maoriTxtDetector.isTextInMaori(text); 
     215         
    219216        page.addMRILanguageStatus(isMRI); 
    220217 
    221         if(isMRI) { // add page to list of meaningful pages.             
     218        // Even if the entire page is not found to be overall in Māori, 
     219        // let's still inspect the sentences of the page and count how many (if any) 
     220        // are in te reo. 
     221        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text); 
     222        // first element is always total num sentences 
     223        // remaining elements are the actual sentences that were detected as being Māori 
     224        int totalSentences = Integer.parseInt(mriSentences.get(0)); 
     225        int numSentencesInMRI = mriSentences.size() - 1;             
     226         
     227 
     228        // Add page to list of MRI pages if the page's body text overall was detected 
     229        // as Māori 
     230        // Add page to list of pages containing MRI if >= 1 sentences in the page 
     231        // were detected as being in MRI 
     232        if(isMRI || numSentencesInMRI >= 1) { 
    222233            String url = page.getPageURL(); 
    223             MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i); 
    224             pagesInMRI.add(MRIpageStats); 
    225         } 
    226          
     234            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI, 
     235                           totalSentences, numSentencesInMRI); 
     236            if(isMRI) {    
     237            pagesInMRI.add(MRIpageStats); 
     238            } else if(numSentencesInMRI >= 1) { 
     239            pagesContainingMRI.add(MRIpageStats); 
     240            } 
     241 
     242        }        
    227243        } 
    228244    } 
     
    237253    info("Total number of web pages in site: " + pages.size()); 
    238254    info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 
    239  
    240     info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 
    241     for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 
    242         info("URL: " + mriWebPageInfo.URL); 
    243         info("siteID: " + mriWebPageInfo.siteID); 
     255     
     256    if(pagesInMRI.size() > 0) { 
     257        info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 
     258        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 
     259        info(mriWebPageInfo.toString()); 
     260        } 
     261    } 
     262 
     263    info("                      -----------                   "); 
     264    if(pagesContainingMRI.size() > 0) {      
     265        info("The following pages weren't detected as primarily being in Māori"); 
     266        info("But still contained sentences detected as Māori"); 
     267        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) { 
     268        info(mriWebPageInfo.toString()); 
     269        } 
     270         
     271    } else { 
     272        info("No further pages detected as containing any sentences in MRI");       
    244273    } 
    245274    info("                      -----------                   "); 
     
    292321        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent 
    293322        File[] sites = sitesDir.listFiles(); 
     323 
     324        // sort site folders in alphabetical order 
     325        // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order 
     326        Arrays.sort(sites); 
     327         
    294328        for(File siteDir : sites) { // e.g. 00001 
    295329        if(siteDir.isDirectory()) { 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33586 r33587  
    154154     * for that sentence. The confidence cutoff provides an additional check. 
    155155     * @return null if no Sentence Detection Model set up in constructor 
    156      * else returns an ArrayList of sentences in the text parameter that are 
    157      * in the requested language.      
     156     * else returns an ArrayList where: 
     157     *   - the first element is the total number of sentences in the text parameter 
     158     *   - remaining elements are the sentences in the text parameter that were in the 
     159     *     requested language. 
    158160     */ 
    159161    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff) 
     
    180182     
    181183    String[] sentences = sentenceDetector.sentDetect(text); 
     184    if(sentences == null) { 
     185        sentencesInLang.add("0"); // first element is the total sentence count: 0, as no sentences were detected in text 
     186        return sentencesInLang; 
     187    } 
     188 
     189    // add in first element: how many sentences there were in text. 
     190    sentencesInLang.add(Integer.toString(sentences.length)); 
    182191     
    183192    for(int i = 0; i < sentences.length; i++) {