source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33587

Last change on this file since 33587 was 33587, checked in by ak19, 5 years ago
  1. Better stats reporting on crawled sites: not just if a page was in MRI or not, but for those that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.
File size: 11.8 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.lang.ArrayIndexOutOfBoundsException;
5import java.util.ArrayList;
6import java.util.Arrays;
7
8import org.apache.log4j.Logger;
9
10/**
11 * Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
12 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
13 * This reads in the dump.txt file contained in each site folder within the input folder.
14 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
15 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
16 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
17 * which parses out the actual text body content of each webpage's section within a dump.txt.
18 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
19 * likely to be in Maori or not.
20 *
21 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
22 * nor even newline separator, it's hard to be sure that the entire page is in a single language.
23 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
24 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
25 *
26 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
27 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
28 * instead of running it over the entire html body's text.
29 *
30 * TO COMPILE OR RUN, FIRST DO:
31 * cd maori-lang-detection/apache-opennlp-1.9.1
32 * export OPENNLP_HOME=`pwd`
33 * cd maori-lang-detection/src
34 *
35 * TO COMPILE:
36 * maori-lang-detection/src$
37 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
38 *
39 * TO RUN:
40 * maori-lang-detection/src$
41 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
42 *
43 * or:
44 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
45 *
46*/
47public class NutchTextDumpProcessor {
48 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
49
50 static boolean DEBUG_MODE = true;
51
52 private final MaoriTextDetector maoriTxtDetector;
53
54 public final String siteID; // is this necessary?
55 private String domainOfSite;
56
57 /** keep a list to store the text of each page */
58 private ArrayList<TextDumpPage> pages;
59
60 /** list of pages in this site which were detected as being in MRI */
61 private ArrayList<MRIWebPageStats> pagesInMRI;
62 /** list of pages in this site which were NOT detected as being in MRI but nevertheless
63 * contain one or more sentences in MRI
64 */
65 private ArrayList<MRIWebPageStats> pagesContainingMRI;
66
67 private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
68 // The start of a new web page's record in nutch's text dump of an entire site
69 // is denoted by a newline followed by a URL (protocol)
70 // or the very start of the file with a URL (protocol)
71 return ((prevLine == null || prevLine.equals(""))
72 && (line.startsWith("http://") || line.startsWith("https://")));
73 }
74
75 public void debugPageDump(StringBuilder pageDump) {
76 if(DEBUG_MODE) {
77 // START DEBUG
78 debug("__________________________________________");
79 debug("@@@ Found page entry: ");
80 debug("__________________________________________");
81 debug(pageDump.toString());
82 debug("------------------------------------------");
83 // END DEBUG
84 }
85 }
86
87 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
88 // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
89 this.siteID = siteID;
90 this.maoriTxtDetector = maoriTxtDetector;
91
92 pages = new ArrayList<TextDumpPage>();
93
94 String line = null;
95 StringBuilder pageDump = null;
96 try (
97 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
98 ) {
99
100 boolean readingText = false;
101 String prevLine = null;
102
103 while((line = reader.readLine()) != null) { // readLine removes newline separator
104 line = line.trim();
105 // iff outside of a page's body text, then an empty line marks the end of a page
106 // in nutch's text dump of a site.
107 // But note, there can be an empty line (or more?) between the start and end
108 // markers of a page's text, though.
109
110 if(isStartOfNewWebPageRecord(prevLine, line)) {
111
112 if(pageDump != null) { // should also be the case then: if(prevLine != null)
113 // finish old pageDump and begin new one
114
115 //debugPageDump(pageDump);
116
117 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
118 // parses the fields and body text of a webpage in nutch's txt dump of entire site
119 //page.parseFields();
120 //page.getText();
121 pages.add(page);
122 pageDump = null;
123
124 }
125
126 // begin new webpage dump
127 pageDump = new StringBuilder();
128 pageDump.append(line);
129 pageDump.append("\n");
130
131 }
132 else if(!line.equals("")) {
133 pageDump.append(line);
134 pageDump.append("\n");
135
136 }
137 // can throw away any newlines between text start and end markers.
138
139 prevLine = line;
140 }
141
142 // process final webpage record:
143 //debugPageDump(pageDump);
144
145 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
146 pages.add(page);
147 pageDump = null;
148
149 } catch (IOException ioe) {
150 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
151 }
152
153 // Just do this once: get and store domain of site.
154 // Passing true to get domain with protocol prefix
155 if(pages.size() > 0) {
156 TextDumpPage firstPage = pages.get(0);
157 String url = firstPage.getPageURL();
158 this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
159 }
160 else {
161 this.domainOfSite = "UNKNOWN";
162 }
163
164 prepareSiteStats();
165 }
166
167 /** pageID: id into pages array */
168 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
169
170 String text = getTextForPage(pageID);
171
172 // QTODO: what to do when page body text is empty?
173 if(text.equals("")) return false;
174 return maoriTxtDetector.isTextInMaori(text);
175 }
176
177 private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
178 if(pageID < 0 || pageID >= pages.size()) {
179 throw new ArrayIndexOutOfBoundsException();
180 }
181
182 TextDumpPage page = pages.get(pageID);
183 return page;
184 }
185
186 public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
187 TextDumpPage page = getPage(pageID);
188 return page.getPageText();
189 }
190 public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
191 TextDumpPage page = getPage(pageID);
192 return page.getPageURL();
193 }
194
195 public int totalNumPages() {
196 return pages.size();
197 }
198
199 private void prepareSiteStats() {
200 pagesInMRI = new ArrayList<MRIWebPageStats>();
201 pagesContainingMRI = new ArrayList<MRIWebPageStats>();
202
203 TextDumpPage page = null;
204 for(int i = 0; i < pages.size(); i++) {
205
206 page = pages.get(i);
207
208 String text = page.getPageText();
209 if(text.equals("")) {
210 page.addMRILanguageStatus(false);
211 continue;
212 }
213 else {
214 boolean isMRI = maoriTxtDetector.isTextInMaori(text);
215
216 page.addMRILanguageStatus(isMRI);
217
218 // Even if the entire page is not found to be overall in Māori,
219 // let's sitll inspect the sentences of the page and count how many (if any)
220 // are in te reo.
221 ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
222 // first element is always total num sentences
223 // remaining elements are the actual sentences that were detected as being Māori
224 int totalSentences = Integer.parseInt(mriSentences.get(0));
225 int numSentencesInMRI = mriSentences.size() - 1;
226
227
228 // Add page to list of MRI pages if the page's body text overall was detected
229 // as Māori
230 // Add page to list of pages containing MRI if >= 1 sentences in the page
231 // were detected as being in MRI
232 if(isMRI || numSentencesInMRI >= 1) {
233 String url = page.getPageURL();
234 MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
235 totalSentences, numSentencesInMRI);
236 if(isMRI) {
237 pagesInMRI.add(MRIpageStats);
238 } else if(numSentencesInMRI >= 1) {
239 pagesContainingMRI.add(MRIpageStats);
240 }
241
242 }
243 }
244 }
245 }
246
247 public void printSiteStats() {
248
249
250 info("------------- " + this.siteID + " SITE STATS -----------");
251
252 info("SITE DOMAIN: " + this.domainOfSite);
253 info("Total number of web pages in site: " + pages.size());
254 info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
255
256 if(pagesInMRI.size() > 0) {
257 info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
258 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
259 info(mriWebPageInfo.toString());
260 }
261 }
262
263 info(" ----------- ");
264 if(pagesContainingMRI.size() > 0) {
265 info("The following pages weren't detected as primarily being in Māori");
266 info("But still contained sentences detected as Māori");
267 for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
268 info(mriWebPageInfo.toString());
269 }
270
271 } else {
272 info("No further pages detected as containing any sentences in MRI");
273 }
274 info(" ----------- ");
275 }
276
277
278 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
279 public static void info(String msg) {
280 System.err.println(msg);
281 logger.info(msg);
282 }
283 public static void debug(String msg) {
284 System.err.println(msg);
285 logger.debug(msg);
286 }
287 public static void warn(String msg) {
288 System.err.println(msg);
289 logger.warn(msg);
290 }
291 public static void error(String msg) {
292 System.err.println(msg);
293 logger.error(msg);
294 }
295 public static void error(String msg, Exception e) {
296 logger.error(msg, e);
297 System.err.println("\n"+msg);
298 e.printStackTrace();
299 }
300
301 public static void printUsage() {
302 info("Run this program as:");
303 info("\tNutchTextDumpProcessor <path to 'sites' folder>");
304 }
305
306 public static void main(String[] args) {
307 if(args.length != 1) {
308 printUsage();
309 return;
310 }
311
312 File sitesDir = new File(args[0]);
313 if(!sitesDir.exists() || !sitesDir.isDirectory()) {
314 error("Error: " + args[0] + " does not exist or is not a directory");
315 return;
316 }
317
318 NutchTextDumpProcessor.DEBUG_MODE = false;
319
320 try {
321 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
322 File[] sites = sitesDir.listFiles();
323
324 // sort site folders in alphabetical order
325 // https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
326 Arrays.sort(sites);
327
328 for(File siteDir : sites) { // e.g. 00001
329 if(siteDir.isDirectory()) {
330 // look for dump.txt
331 File txtDumpFile = new File(siteDir, "dump.txt");
332 if(!txtDumpFile.exists()) {
333 error("Text dump file " + txtDumpFile + " did not exist");
334 continue;
335 }
336
337 else {
338 String siteID = siteDir.getName();
339 debug("Found siteID: " + siteID);
340 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
341 // now it's parsed all the web pages in the site's text dump
342
343 // Let's print stats on each web page's detected language being MRI or not
344 // and how many pages there were in the site in total.
345
346 nutchTxtDump.printSiteStats();
347 }
348 }
349
350 }
351
352 } catch(Exception e) {
353 // can get an exception when instantiating CCWETProcessor instance
354 error(e.getMessage(), e);
355 }
356 }
357}
Note: See TracBrowser for help on using the repository browser.