Changeset 33578
- Timestamp: 2019-10-17T19:31:53+13:00 (4 years ago)
- Location: gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java
r33576 r33578 2 2 3 3 import java.io.*; 4 import java.util.HashMap; 5 import java.util.Map; 4 import java.util.ArrayList; 5 //import java.util.HashMap; 6 //import java.util.Map; 6 7 import java.lang.ArrayIndexOutOfBoundsException; 7 8 9 import org.apache.log4j.Logger; 10 11 /** 12 * Class to process the dump text files produced for each site (e.g. site "00001") that 13 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt. 14 * This reads in the dump.txt file contained in each site folder within the input folder. 15 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt) 16 * Each dump.txt could contain the text contents for an entire site, or for individual pages. 17 * This class then uses class TextDumpPage to parse each webpage within a dump.txt, 18 * which parses out the actual text body content of each webpage's section within a dump.txt. 19 * Finally, MaoriTextDetector is run over that to determine whether the full body text is 20 * likely to be in Maori or not. 21 * 22 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph 23 * nor even newline separator, it's hard to be sure that the entire page is in language. 24 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language 25 * paragraphs contained in a page, if the majority/the remainder happen to be in English. 26 * 27 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run 28 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time, 29 * instead of running it over the entire html body's text. 
30 * 31 * TO COMPILE OR RUN, FIRST DO: 32 * cd maori-lang-detection/apache-opennlp-1.9.1 33 * export OPENNLP_HOME=`pwd` 34 * cd maori-lang-detection/src 35 * 36 * TO COMPILE: 37 * maori-lang-detection/src$ 38 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/*" org/greenstone/atea/NutchTextDumpProcessor.java 39 * 40 * TO RUN: 41 * maori-lang-detection/src$ 42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor 43 * 44 */ 8 45 public class NutchTextDumpProcessor { 9 46 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName()); 10 47 11 private static MaoriTextDetector maoriTxtDetector = new MaoriTextDetector(false); // false: run non-silent48 private final MaoriTextDetector maoriTxtDetector; 12 49 13 50 public final String siteID; // is this necessary? … … 17 54 18 55 19 public NutchTextDumpProcessor( String siteID, File txtDumpFile) {56 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) { 20 57 // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder 21 58 this.siteID = siteID; 22 23 59 this.maoriTxtDetector = maoriTxtDetector; 60 24 61 pages = new ArrayList<TextDumpPage>(); 25 62 … … 37 74 pageDump.append("\n"); 38 75 } else { 39 TextDumpPage page = new TextDumpPage( pageDump.toString());76 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString()); 40 77 // parses the fields and body text of a webpage in nutch's txt dump of entire site 41 78 //page.parseFields(); … … 57 94 58 95 String text = getTextForPage(pageID); 96 // QTODO: what to do when page body text is empty? 97 if(text.equals("")) return false; 59 98 return maoriTxtDetector.isTextInMaori(text); 60 99 } … … 120 159 121 160 try { 161 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent 122 162 File[] sites = sitesDir.listFiles(); 123 163 for(File siteDir : sites) { // e.g. 
00001 124 164 // look for dump.txt 125 File txtDumpFile = new File(siteDir, dump.txt);165 File txtDumpFile = new File(siteDir, "dump.txt"); 126 166 if(!txtDumpFile.exists()) { 127 167 error("Text dump file " + txtDumpFile + " did not exist"); … … 131 171 else { 132 172 String siteID = siteDir.getName(); 133 NutchTextDumpProcessor nutchTxtDump = NutchTextDumpProcessor(siteID, txtDumpFile);173 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile); 134 174 135 175 } -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java
r33576 r33578 5 5 import java.util.Map; 6 6 7 import org.apache.log4j.Logger; 8 9 7 10 public class TextDumpPage { 8 11 private static Logger logger = Logger.getLogger(org.greenstone.atea.TextDumpPage.class.getName()); 12 9 13 public static final String TEXT_START_MARKER="text:start:"; 10 14 public static final String TEXT_END_MARKER="text:end:"; 11 15 12 private Map tuples;16 private Map<String, String> tuples; 13 17 14 18 public TextDumpPage(String siteID, String unparsedPageDump) { 15 tuples = new HashMap ();19 tuples = new HashMap<String, String>(); 16 20 17 21 try ( … … 25 29 String pageURL = line.substring(endIndex); 26 30 27 tuples. add("pageURL", pageURL.trim());31 tuples.put("pageURL", pageURL.trim()); 28 32 29 33 String key = line.substring(endIndex); 30 tuples. add("key", key.trim());34 tuples.put("key", key.trim()); 31 35 32 36 boolean readingPageText = false; … … 42 46 String k = line.substring(0, endIndex); 43 47 String v = line.substring(endIndex+1); 44 tuples. add(k.trim(), v.trim());48 tuples.put(k.trim(), v.trim()); 45 49 } 46 50 47 51 else if(line.equals(TEXT_START_MARKER)) { // dealing with the page body text 48 StringBuilderpageText = new StringBuilder();52 pageText = new StringBuilder(); 49 53 readingPageText = true; 50 54 } … … 56 60 // finished with a page body 57 61 // remove any FINAL artificial newline we introduced 58 tuples. add("pageText", pageText.toString().trim());62 tuples.put("pageText", pageText.toString().trim()); 59 63 readingPageText = false; 60 64 pageText = null; … … 71 75 // If the page had no pageText, add a "pageText" -> "" mapping 72 76 if(!tuples.containsKey("pageText")) { 73 tuples. 
add("pageText", "");77 tuples.put("pageText", ""); 74 78 } 75 79 … … 91 95 return tuples.get(key); 92 96 } 97 98 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- // 99 public static void info(String msg) { 100 System.err.println(msg); 101 logger.info(msg); 102 } 103 public static void debug(String msg) { 104 System.err.println(msg); 105 logger.debug(msg); 106 } 107 public static void warn(String msg) { 108 System.err.println(msg); 109 logger.warn(msg); 110 } 111 public static void error(String msg) { 112 System.err.println(msg); 113 logger.error(msg); 114 } 115 public static void error(String msg, Exception e) { 116 logger.error(msg, e); 117 System.err.println("\n"+msg); 118 e.printStackTrace(); 119 } 120 93 121 }
Note: See TracChangeset for help on using the changeset viewer.