Ignore:
Timestamp:
2019-10-17T19:31:53+13:00 (5 years ago)
Author:
ak19
Message:

Corrections for compiling the 2 new classes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33576 r33578  
    22
    33import java.io.*;
    4 import java.util.HashMap;
    5 import java.util.Map;
     4import java.util.ArrayList;
     5//import java.util.HashMap;
     6//import java.util.Map;
    67import java.lang.ArrayIndexOutOfBoundsException;
    7    
     8
     9import org.apache.log4j.Logger;
     10
     11/**
     12 * Class to process the dump text files produced for each site (e.g. site "00001") that
     13 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
     14 * This reads in the dump.txt file contained in each site folder within the input folder.
     15 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
     16 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
     17 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
     18 * which parses out the actual text body content of each webpage's section within a dump.txt.
     19 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
     20 * likely to be in Maori or not.
     21 *
     22 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
     23 * nor even newline separator, it's hard to be sure that the entire page is in a single language.
     24 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
     25 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
     26 *
     27 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
     28 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
     29 * instead of running it over the entire html body's text.
     30 *
     31 * TO COMPILE OR RUN, FIRST DO:
     32 *    cd maori-lang-detection/apache-opennlp-1.9.1
     33 *    export OPENNLP_HOME=`pwd`
     34 *    cd maori-lang-detection/src
     35 *
     36 * TO COMPILE:
     37 *    maori-lang-detection/src$
     38 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/*" org/greenstone/atea/NutchTextDumpProcessor.java
     39 *
     40 * TO RUN:
     41 *    maori-lang-detection/src$
     42 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor
     43 *
     44*/
    845public class NutchTextDumpProcessor {
    946    private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
    1047
    11     private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
     48    private final MaoriTextDetector maoriTxtDetector;
    1249   
    1350    public final String siteID; // is this necessary?
     
    1754   
    1855   
    19     public NutchTextDumpProcessor(String siteID, File txtDumpFile) {
     56    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
    2057    // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
    2158    this.siteID = siteID;
    22 
    23 
     59    this.maoriTxtDetector = maoriTxtDetector;
     60   
    2461    pages = new ArrayList<TextDumpPage>();
    2562
     
    3774            pageDump.append("\n");
    3875        } else {
    39             TextDumpPage page = new TextDumpPage(pageDump.toString());
     76            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
    4077            // parses the fields and body text of a webpage in nutch's txt dump of entire site
    4178            //page.parseFields();
     
    5794
    5895    String text = getTextForPage(pageID);
     96    // QTODO: what to do when page body text is empty?
     97    if(text.equals("")) return false;
    5998    return maoriTxtDetector.isTextInMaori(text);
    6099    }
     
    120159
    121160    try {
     161        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
    122162        File[] sites = sitesDir.listFiles();
    123163        for(File siteDir : sites) { // e.g. 00001
    124164        // look for dump.txt
    125         File txtDumpFile = new File(siteDir, dump.txt);
     165        File txtDumpFile = new File(siteDir, "dump.txt");
    126166        if(!txtDumpFile.exists()) {
    127167            error("Text dump file " + txtDumpFile + " did not exist");
     
    131171        else {
    132172            String siteID = siteDir.getName();
    133             NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile);
     173            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
    134174           
    135175        }
Note: See TracChangeset for help on using the changeset viewer.