Context Navigation

← Previous Change
Next Change →

Changeset 33582 for gs3-extensions

Timestamp:

2019-10-17T23:12:38+13:00 (5 years ago)

Author:

ak19

Message:

NutchTextDumpProcessor prints each crawled site's stats: number of webpages per crawled site and how many of those were detected by OpenNLP as being in Maori (mri). This required making a reusable method in CCWETProcessor (getDomainForURL) public and static.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

: 1 added
: 3 edited

CCWETProcessor.java (modified) (1 diff)
MRIWebPageStats.java (added)
NutchTextDumpProcessor.java (modified) (7 diffs)
TextDumpPage.java (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

r33575	r33582
240	240	* This retains any www. or subdomain prefix.
241	241	*/
242		private String getDomainForURL(String url, boolean withProtocol) {
	242	public static String getDomainForURL(String url, boolean withProtocol) {
243	243	int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
244	244	startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

-              r33580
+              r33582
  *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
+ *
+ * or:
+ *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
+ *
 */
 public class NutchTextDumpProcessor {
     private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
+    static boolean DEBUG_MODE = true;
     private final MaoriTextDetector maoriTxtDetector;
     public final String siteID; // is this necessary?
+    private String domainOfSite;
     /** keep a list to store the text of each page */
     private ArrayList<TextDumpPage> pages;
+    private ArrayList<MRIWebPageStats> pagesInMRI;
     private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
 …
     public void debugPageDump(StringBuilder pageDump) {
+    // START DEBUG
+    debug("__________________________________________");
+    debug("@@@ Found page entry: ");
+    debug("__________________________________________");
+    debug(pageDump.toString());
+    debug("------------------------------------------");
+    // END DEBUG
+    if(DEBUG_MODE) {
+        // START DEBUG
+        debug("__________________________________________");
+        debug("@@@ Found page entry: ");
+        debug("__________________________________________");
+        debug(pageDump.toString());
+        debug("------------------------------------------");
+        // END DEBUG
+    }
+    }
 …
         error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
+    }
+    // Just do this once: get domain of site.
+    // Passing true to get domain with protocol prefix
+    if(pages.size() > 0) {
+        TextDumpPage firstPage = pages.get(0);
+        String url = firstPage.getPageURL();
+        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+    }
+    else {
+        this.domainOfSite = "UNKNOWN";
+    }
+    prepareSiteStats();
+    }
 …
     String text = getTextForPage(pageID);
     // QTODO: what to do when page body text is empty?
     if(text.equals("")) return false;
 …
     TextDumpPage page = getPage(pageID);
     return page.getPageURL();
+    }
+    }
+    public int totalNumPages() {
+    return pages.size();
+    }
+    private void prepareSiteStats() {
+    pagesInMRI = new ArrayList<MRIWebPageStats>();
+    TextDumpPage page = null;
+    for(int i = 0; i < pages.size(); i++) {
+        page = pages.get(i);
+        /*
+        // just do this once: get domain. Passing true to get domain with protocol prefix
+        if(this.domainOfSite == null) {
+        this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
+        }
+        */
+        String text = page.getPageText();
+        if(text.equals("")) {
+        page.addMRILanguageStatus(false);
+        continue;
+        }
+        else {
+        boolean isMRI = maoriTxtDetector.isTextInMaori(text);
+        page.addMRILanguageStatus(isMRI);
+        if(isMRI) { // add page to list of meaningful pages.
+            String url = page.getPageURL();
+            MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
+            pagesInMRI.add(MRIpageStats);
+        }
+        }
+    }
+    }
+    public void printSiteStats() {
+    info("------------- " + this.siteID + " SITE STATS -----------");
+    info("SITE DOMAIN: " + this.domainOfSite);
+    info("Total number of web pages in site: " + pages.size());
+    info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
+    info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
+    for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
+        info("URL: " + mriWebPageInfo.URL);
+        info("siteID: " + mriWebPageInfo.siteID);
+    }
+    info("                      -----------                   ");
+    }
     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
 …
+    }
+    NutchTextDumpProcessor.DEBUG_MODE = false;
     try {
         MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
+        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
         File[] sites = sitesDir.listFiles();
         for(File siteDir : sites) { // e.g. 00001
 …
             String siteID = siteDir.getName();
             debug("Found siteID: " + siteID);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
+            // now it's parsed all the web pages in the site's text dump
+            // Let's print stats on each web page's detected language being MRI or not
+            // and how many pages there were in the site in total.
+            nutchTxtDump.printSiteStats();
+            }
+        }

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

-              r33580
+              r33582
+            }
             else { // dealing with the rest of the page dump's metadata
+            // QTODO: nutch's text dump output is problematic
+            // strange characters are in the stream and end up here
+            // and can't detect end of metadata or even end of line.
             endIndex = line.indexOf(":");
             if(endIndex != -1) {
 …
                 tuples.put(k.trim(), v.trim());
             } else {
+                error("No meta key for meta: " + line);
+                if(NutchTextDumpProcessor.DEBUG_MODE) {
+                error("No meta key for meta: " + line);
+                error(unparsedPageDump);
+                }
+            }
+            }
 …
     // START DEBUG
+    debug("__________________________________________");
+    for(Map.Entry<String, String> entry : tuples.entrySet()) {
+        String key = entry.getKey();
+        String value = entry.getValue();
+        debug(key + " - " + value);
+    }
+    debug("__________________________________________");
+    debugTuples();
     // END DEBUG
+    }
+    public void debugTuples() {
+    if(NutchTextDumpProcessor.DEBUG_MODE) {
+        debug("__________________________________________");
+        for(Map.Entry<String, String> entry : tuples.entrySet()) {
+        String key = entry.getKey();
+        String value = entry.getValue();
+        debug(key + " - " + value);
+        }
+        debug("__________________________________________");
+    }
+    }
     public String getPageURL() {
     return tuples.get("url");
+    return tuples.get("pageURL");
+    }
 …
+    }
+    public void add(String key, String value) {
+    tuples.put(key, value);
+    }
+    public void addMRILanguageStatus(boolean status) {
+    if(status) {
+        tuples.put("isMRI", "true");
+    } else {
+        tuples.put("isMRI", "false");
+    }
+    }
+    public boolean getMRILanguageStatus() {
+    String value = tuples.get("isMRI");
+    if(value == null) {
+        return false;
+    }
+    if(value.equals("true")) {
+        return true;
+    }
+    else {
+        return false;
+    }
+    }
     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
     public static void info(String msg) {

Note: See TracChangeset for help on using the changeset viewer.