Changeset 33578

Show
Ignore:
Timestamp:
17.10.2019 19:31:53 (4 weeks ago)
Author:
ak19
Message:

Corrections for compiling the 2 new classes.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java

    r33576 r33578  
    22 
    33import java.io.*; 
    4 import java.util.HashMap; 
    5 import java.util.Map; 
     4import java.util.ArrayList; 
     5//import java.util.HashMap; 
     6//import java.util.Map; 
    67import java.lang.ArrayIndexOutOfBoundsException; 
    7      
     8 
     9import org.apache.log4j.Logger; 
     10 
     11/** 
     12 * Class to process the dump text files produced for each site (e.g. site "00001") that 
     13 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt. 
     14 * This reads in the dump.txt file contained in each site folder within the input folder. 
     15 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt) 
     16 * Each dump.txt could contain the text contents for an entire site, or for individual pages. 
     17 * This class then uses class TextDumpPage to parse each webpage within a dump.txt, 
     18 * which parses out the actual text body content of each webpage's section within a dump.txt. 
     19 * Finally, MaoriTextDetector is run over that to determine whether the full body text is 
     20 * likely to be in Maori or not. 
     21 * 
     22 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph 
     23 * nor even newline separator, it's hard to be sure that the entire page is in a single language. 
     24 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language 
     25 * paragraphs contained in a page, if the majority/the remainder happen to be in English. 
     26 *  
     27 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run 
     28 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time, 
     29 * instead of running it over the entire html body's text. 
     30 * 
     31 * TO COMPILE OR RUN, FIRST DO: 
     32 *    cd maori-lang-detection/apache-opennlp-1.9.1 
     33 *    export OPENNLP_HOME=`pwd` 
     34 *    cd maori-lang-detection/src 
     35 * 
     36 * TO COMPILE: 
     37 *    maori-lang-detection/src$ 
     38 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/*" org/greenstone/atea/NutchTextDumpProcessor.java 
     39 * 
     40 * TO RUN: 
     41 *    maori-lang-detection/src$ 
     42 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor 
     43 * 
     44*/ 
    845public class NutchTextDumpProcessor { 
    946    private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 
    1047 
    11     private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent 
     48    private final MaoriTextDetector maoriTxtDetector; 
    1249     
    1350    public final String siteID; // is this necessary? 
     
    1754     
    1855     
    19     public NutchTextDumpProcessor(String siteID, File txtDumpFile) { 
     56    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { 
    2057    // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder 
    2158    this.siteID = siteID; 
    22  
    23  
     59    this.maoriTxtDetector = maoriTxtDetector; 
     60     
    2461    pages = new ArrayList<TextDumpPage>(); 
    2562 
     
    3774            pageDump.append("\n"); 
    3875        } else { 
    39             TextDumpPage page = new TextDumpPage(pageDump.toString()); 
     76            TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 
    4077            // parses the fields and body text of a webpage in nutch's txt dump of entire site 
    4178            //page.parseFields(); 
     
    5794 
    5895    String text = getTextForPage(pageID); 
     96    // QTODO: what to do when page body text is empty? 
     97    if(text.equals("")) return false; 
    5998    return maoriTxtDetector.isTextInMaori(text); 
    6099    } 
     
    120159 
    121160    try { 
     161        MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent 
    122162        File[] sites = sitesDir.listFiles(); 
    123163        for(File siteDir : sites) { // e.g. 00001 
    124164        // look for dump.txt 
    125         File txtDumpFile = new File(siteDir, dump.txt); 
     165        File txtDumpFile = new File(siteDir, "dump.txt"); 
    126166        if(!txtDumpFile.exists()) { 
    127167            error("Text dump file " + txtDumpFile + " did not exist"); 
     
    131171        else { 
    132172            String siteID = siteDir.getName(); 
    133             NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile); 
     173            NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 
    134174             
    135175        } 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33576 r33578  
    55import java.util.Map; 
    66 
     7import org.apache.log4j.Logger; 
     8 
     9 
    710public class TextDumpPage { 
    8  
     11    private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName()); 
     12     
    913    public static final String TEXT_START_MARKER="text:start:"; 
    1014    public static final String TEXT_END_MARKER="text:end:"; 
    1115     
    12     private Map tuples; 
     16    private Map<String, String> tuples; 
    1317     
    1418    public TextDumpPage(String siteID, String unparsedPageDump) { 
    15     tuples = new HashMap(); 
     19    tuples = new HashMap<String, String>(); 
    1620     
    1721    try ( 
     
    2529        String pageURL = line.substring(endIndex); 
    2630         
    27         tuples.add("pageURL", pageURL.trim()); 
     31        tuples.put("pageURL", pageURL.trim()); 
    2832 
    2933        String key = line.substring(endIndex); 
    30         tuples.add("key", key.trim()); 
     34        tuples.put("key", key.trim()); 
    3135 
    3236        boolean readingPageText = false; 
     
    4246            String k = line.substring(0, endIndex); 
    4347            String v = line.substring(endIndex+1); 
    44             tuples.add(k.trim(), v.trim()); 
     48            tuples.put(k.trim(), v.trim()); 
    4549            } 
    4650             
    4751            else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text 
    48             StringBuilder pageText = new StringBuilder(); 
     52            pageText = new StringBuilder(); 
    4953            readingPageText = true; 
    5054            }                    
     
    5660            // finished with a page body             
    5761            // remove any FINAL artificial newline we introduced 
    58             tuples.add("pageText", pageText.toString().trim()); 
     62            tuples.put("pageText", pageText.toString().trim()); 
    5963            readingPageText = false; 
    6064            pageText = null; 
     
    7175        // If the page had no pageText, add a "pageText" -> "" mapping 
    7276        if(!tuples.containsKey("pageText")) { 
    73         tuples.add("pageText", ""); 
     77        tuples.put("pageText", ""); 
    7478        } 
    7579         
     
    9195    return tuples.get(key); 
    9296    } 
     97 
     98    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- // 
     99    public static void info(String msg) { 
     100    System.err.println(msg); 
     101    logger.info(msg); 
     102    } 
     103    public static void debug(String msg) { 
     104    System.err.println(msg); 
     105    logger.debug(msg); 
     106    } 
     107    public static void warn(String msg) { 
     108    System.err.println(msg); 
     109    logger.warn(msg); 
     110    } 
     111    public static void error(String msg) { 
     112    System.err.println(msg); 
     113    logger.error(msg); 
     114    } 
     115    public static void error(String msg, Exception e) { 
     116    logger.error(msg, e); 
     117    System.err.println("\n"+msg); 
     118    e.printStackTrace(); 
     119    } 
     120     
    93121}