Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33587

Timestamp:

2019-10-18T23:16:25+13:00 (5 years ago)

Author:

ak19

Message:

1. Better stats reporting on crawled sites: not just whether a page was in MRI, but for those pages that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

: 4 edited

MRIWebPageStats.java (modified) (1 diff)
MaoriTextDetector.java (modified) (1 diff)
NutchTextDumpProcessor.java (modified) (7 diffs)
TextLanguageDetector.java (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

-              r33582
+              r33587
     public final String URL; // URL of webpage
     public final int pageID; // index into NutchTextDumpProcessor::pages ArrayList
+    public final boolean isMRI;
+    public final int numSentences; // count of all sentences in the webpage's body
+    public final int numSentencesInMRI; // count of sentences in the webpage's body in Māori (mri)
+    public MRIWebPageStats(String siteID, String url, int pageID) {
+    public MRIWebPageStats(String siteID, String url, int pageID, boolean isMRI,
+               int numSentences, int numSentencesInMRI)
+    {
     this.siteID = siteID;
     this.URL = url;
     this.pageID = pageID;
+    this.isMRI = isMRI;
+    this.numSentences = numSentences;
+    this.numSentencesInMRI = numSentencesInMRI;
+    }
+    public String toString() { // builds a multi-line, human-readable summary of this page's stats
+    StringBuilder str = new StringBuilder();
+    str.append("URL: " + this.URL);
+    str.append("\nsiteID: " + this.siteID);
+    str.append("\nnum sentences in MRI: " + this.numSentencesInMRI+"/"+this.numSentences); // MRI sentence count over total sentence count
+    if(this.isMRI && this.numSentencesInMRI <= 0) {
+        // this page was flagged as MRI overall (isMRI is true), yet no individual sentence was counted as MRI
+        str.append(" (no PROPER sentences in MRI)");
+    }
+    return str.toString();
+    }
+}

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

-              r33586
+              r33587
      * @param text: the string of text from which sentences in the requested
      * language are to be identified and returned.
+     * @return an ArrayList of sentences in the text parameter that are
+     * in the requested language.
+     */
+    public ArrayList<String> getAllSentencesInMaori(String text) throws Exception {
+     * @return an ArrayList where:
+     *   - the first element is the total number of sentences in the text parameter
+     *   - remaining elements are the sentences in the text parameter that were in the
+     *     requested language.
+     */
+    public ArrayList<String> getAllSentencesInMaori(String text) {
     // big assumption here: that we can split incoming text into sentences
     // for any language (using the Māori language trained sentence model),

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33582
+              r33587
 import java.io.*;
+import java.lang.ArrayIndexOutOfBoundsException;
 import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.Map;
+import java.lang.ArrayIndexOutOfBoundsException;
+import java.util.Arrays;
 import org.apache.log4j.Logger;
 /**
  * Class to process the dump text files produced for each site (e.g. site "00001") that
+ * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
  * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
  * This reads in the dump.txt file contained in each site folder within the input folder.
 …
     private ArrayList<TextDumpPage> pages;
+    /** list of pages in this site which were detected as being in MRI */
     private ArrayList<MRIWebPageStats> pagesInMRI;
+    /** list of pages in this site which were NOT detected as being in MRI but nevertheless
+     * contain one or more sentences in MRI
+     */
+    private ArrayList<MRIWebPageStats> pagesContainingMRI;
     private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
     // The start of a new web page's record in nutch's text dump of an entire site
 …
+    }
     // Just do this once: get domain of site.
+    // Just do this once: get and store domain of site.
     // Passing true to get domain with protocol prefix
     if(pages.size() > 0) {
         TextDumpPage firstPage = pages.get(0);
         String url = firstPage.getPageURL();
         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+        TextDumpPage firstPage = pages.get(0);
+        String url = firstPage.getPageURL();
+        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+    }
     else {
 …
     private void prepareSiteStats() {
     pagesInMRI = new ArrayList<MRIWebPageStats>();
+    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
     TextDumpPage page = null;
     for(int i = 0; i < pages.size(); i++) {
         page = pages.get(i);
-        /*
-        // just do this once: get domain. Passing true to get domain with protocol prefix
-        if(this.domainOfSite == null) {
-        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+        }
-        */
         String text = page.getPageText();
 …
         else {
         boolean isMRI = maoriTxtDetector.isTextInMaori(text);
         page.addMRILanguageStatus(isMRI);
+        if(isMRI) { // add page to list of meaningful pages.
+        // Even if the entire page is not found to be overall in Māori,
+        // let's still inspect the sentences of the page and count how many (if any)
+        // are in te reo.
+        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
+        // first element is always total num sentences
+        // remaining elements are the actual sentences that were detected as being Māori
+        int totalSentences = Integer.parseInt(mriSentences.get(0));
+        int numSentencesInMRI = mriSentences.size() - 1;
+        // Add page to list of MRI pages if the page's body text overall was detected
+        // as Māori
+        // Add page to list of pages containing MRI if >= 1 sentences in the page
+        // were detected as being in MRI
+        if(isMRI || numSentencesInMRI >= 1) {
             String url = page.getPageURL();
+            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
+            pagesInMRI.add(MRIpageStats);
+        }
+            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
+                           totalSentences, numSentencesInMRI);
+            if(isMRI) {
+            pagesInMRI.add(MRIpageStats);
+            } else if(numSentencesInMRI >= 1) {
+            pagesContainingMRI.add(MRIpageStats);
+            }
+        }
+        }
+    }
 …
     info("Total number of web pages in site: " + pages.size());
     info("Of these, the number of pages in MÄori (mri) were: " + this.pagesInMRI.size());
+    info("The following were the pages detected by OpenNLP as being in MÄori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
+    for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
+        info("URL: " + mriWebPageInfo.URL);
+        info("siteID: " + mriWebPageInfo.siteID);
+    if(pagesInMRI.size() > 0) {
+        info("The following were the pages detected by OpenNLP as being in MÄori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
+        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
+        info(mriWebPageInfo.toString());
+        }
+    }
+    info("                      -----------                   ");
+    if(pagesContainingMRI.size() > 0) {
+        info("The following pages weren't detected as primarily being in MÄori");
+        info("But still contained sentences detected as MÄori");
+        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
+        info(mriWebPageInfo.toString());
+        }
+    } else {
+        info("No further pages detected as containing any sentences in MRI");
+    }
     info("                      -----------                   ");
 …
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
         File[] sites = sitesDir.listFiles();
+        // sort site folders in alphabetical order
+        // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
+        Arrays.sort(sites);
         for(File siteDir : sites) { // e.g. 00001
         if(siteDir.isDirectory()) {

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

-              r33586
+              r33587
      * for that sentence. The confidence cutoff provides an additional check.
      * @return null if no Sentence Detection Model set up in constructor
+     * else returns an ArrayList of sentences in the text parameter that are
+     * in the requested language.
+     * else returns an ArrayList where:
+     *   - the first element is the total number of sentences in the text parameter
+     *   - remaining elements are the sentences in the text parameter that were in the
+     *     requested language.
      */
     public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
 …
     String[] sentences = sentenceDetector.sentDetect(text);
+    if(sentences == null) {
+        sentencesInLang.add("0"); // to indicate 0 sentences in requested language
+        return sentencesInLang;
+    }
+    // add in first element: how many sentences there were in text.
+    sentencesInLang.add(Integer.toString(sentences.length));
     for(int i = 0; i < sentences.length; i++) {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33587

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MRIWebPageStats.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

Download in other formats: