Context Navigation

← Previous Change
Next Change →

NutchTextDumpProcessor.java

Timestamp:

2019-10-18T23:16:25+13:00 (5 years ago)

Author:

ak19

Message:

1. Better stats reporting on crawled sites: not just whether a page was in MRI or not, but for those that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.

File:

: 1 edited

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33582
+              r33587
 import java.io.*;
+import java.lang.ArrayIndexOutOfBoundsException;
 import java.util.ArrayList;
+//import java.util.HashMap;
+//import java.util.Map;
+import java.lang.ArrayIndexOutOfBoundsException;
+import java.util.Arrays;
 import org.apache.log4j.Logger;
 /**
  * Class to process the dump text files produced for each site (e.g. site "00001") that
+ * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
  * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
  * This reads in the dump.txt file contained in each site folder within the input folder.
 …
     private ArrayList<TextDumpPage> pages;
+    /** list of pages in this site which were detected as being in MRI */
     private ArrayList<MRIWebPageStats> pagesInMRI;
+    /** list of pages in this site which were NOT detected as being in MRI but nevertheless
+     * contain one or more sentences in MRI
+     */
+    private ArrayList<MRIWebPageStats> pagesContainingMRI;
     private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
     // The start of a new web page's record in nutch's text dump of an entire site
 …
+    }
     // Just do this once: get domain of site.
+    // Just do this once: get and store domain of site.
     // Passing true to get domain with protocol prefix
     if(pages.size() > 0) {
         TextDumpPage firstPage = pages.get(0);
         String url = firstPage.getPageURL();
         this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+        TextDumpPage firstPage = pages.get(0);
+        String url = firstPage.getPageURL();
+        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+    }
     else {
 …
     private void prepareSiteStats() {
     pagesInMRI = new ArrayList<MRIWebPageStats>();
+    pagesContainingMRI = new ArrayList<MRIWebPageStats>();
     TextDumpPage page = null;
     for(int i = 0; i < pages.size(); i++) {
         page = pages.get(i);
-        /*
-        // just do this once: get domain. Passing true to get domain with protocol prefix
-        if(this.domainOfSite == null) {
-        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+        }
-        */
         String text = page.getPageText();
 …
         else {
         boolean isMRI = maoriTxtDetector.isTextInMaori(text);
         page.addMRILanguageStatus(isMRI);
+        if(isMRI) { // add page to list of meaningful pages.
+        // Even if the entire page is not found to be overall in Māori,
+        // let's still inspect the sentences of the page and count how many (if any)
+        // are in te reo.
+        ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
+        // first element is always total num sentences
+        // remaining elements are the actual sentences that were detected as being Māori
+        int totalSentences = Integer.parseInt(mriSentences.get(0));
+        int numSentencesInMRI = mriSentences.size() - 1;
+        // Add page to list of MRI pages if the page's body text overall was detected
+        // as Māori
+        // Add page to list of pages containing MRI if >= 1 sentences in the page
+        // were detected as being in MRI
+        if(isMRI || numSentencesInMRI >= 1) {
             String url = page.getPageURL();
+            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
+            pagesInMRI.add(MRIpageStats);
+        }
+            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
+                           totalSentences, numSentencesInMRI);
+            if(isMRI) {
+            pagesInMRI.add(MRIpageStats);
+            } else if(numSentencesInMRI >= 1) {
+            pagesContainingMRI.add(MRIpageStats);
+            }
+        }
+        }
+    }
 …
     info("Total number of web pages in site: " + pages.size());
     info("Of these, the number of pages in MÄori (mri) were: " + this.pagesInMRI.size());
+    info("The following were the pages detected by OpenNLP as being in MÄori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
+    for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
+        info("URL: " + mriWebPageInfo.URL);
+        info("siteID: " + mriWebPageInfo.siteID);
+    if(pagesInMRI.size() > 0) {
+        info("The following were the pages detected by OpenNLP as being in MÄori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
+        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
+        info(mriWebPageInfo.toString());
+        }
+    }
+    info("                      -----------                   ");
+    if(pagesContainingMRI.size() > 0) {
+        info("The following pages weren't detected as primarily being in MÄori");
+        info("But still contained sentences detected as MÄori");
+        for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
+        info(mriWebPageInfo.toString());
+        }
+    } else {
+        info("No further pages detected as containing any sentences in MRI");
+    }
     info("                      -----------                   ");
 …
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
         File[] sites = sitesDir.listFiles();
+        // sort site folders in alphabetical order
+        // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
+        Arrays.sort(sites);
         for(File siteDir : sites) { // e.g. 00001
         if(siteDir.isDirectory()) {

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33587 for gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

Download in other formats: