source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@33582

Last change on this file since 33582 was 33582, checked in by ak19, 5 years ago

NutchTextDumpProcessor prints each crawled site's stats: number of webpages per crawled site and how many of those were detected by OpenNLP as being in Maori (mri). Needed to make a reusable method in CCWETProcessor as public and static.

File size: 10.2 KB
package org.greenstone.atea;

import java.io.*;
import java.util.ArrayList;
//import java.util.HashMap;
//import java.util.Map;

import org.apache.log4j.Logger;

/**
 * Class to process the dump text files produced for each site (e.g. site "00001") that
 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
 * This reads in the dump.txt file contained in each site folder within the input folder.
 * (e.g. input folder "crawled" could contain folders 00001 to 01465, each with a dump.txt.)
 * Each dump.txt could contain the text contents of an entire site, or of individual pages.
 * This class then uses TextDumpPage to parse each webpage's record within a dump.txt,
 * extracting the actual body text content of each webpage's section.
 * Finally, MaoriTextDetector is run over that body text to determine whether it is
 * likely to be in Maori or not.
 *
 * Potential issues: since a web page's text is dumped out by Nutch with neither paragraph
 * nor even newline separators, it's hard to be sure that the entire page is in a single language.
 * If a page is in multiple languages, there's no way to be sure it doesn't contain promising
 * Maori language paragraphs, even if the majority or the remainder happen to be in English.
 *
 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
 * MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
 * instead of running it over the entire html body's text, as sketched below.
 *
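 * A minimal sketch of that two-lines-at-a-time idea (hypothetical code, not implemented
 * in this class; it assumes a TextDumpPage variable named page):
 *
 *    String[] lines = page.getPageText().split("\n");
 *    for (int i = 0; i < lines.length; i += 2) {
 *        String chunk = (i + 1 < lines.length) ? lines[i] + "\n" + lines[i+1] : lines[i];
 *        BufferedReader chunkReader = new BufferedReader(new StringReader(chunk));
 *        if (maoriTxtDetector.isTextInMaori(chunkReader)) {
 *            // store this chunk in the DB as a candidate Maori paragraph
 *        }
 *    }
 *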
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
 *
 * or:
 *    java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
 *
 */
public class NutchTextDumpProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());

    static boolean DEBUG_MODE = true;

    private final MaoriTextDetector maoriTxtDetector;

    public final String siteID; // is this necessary?
    private String domainOfSite;

    /** keep a list to store the text of each page */
    private ArrayList<TextDumpPage> pages;

    private ArrayList<MRIWebPageStats> pagesInMRI;

    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
        // The start of a new web page's record in nutch's text dump of an entire site
        // is denoted by a newline followed by a URL (protocol),
        // or by the very start of the file with a URL (protocol).
        return ((prevLine == null || prevLine.equals(""))
                && (line.startsWith("http://") || line.startsWith("https://")));
    }
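
    // Illustrative sketch of the dump.txt layout implied by the rule above
    // (the URLs shown are hypothetical placeholders, not from a real dump):
    //
    //    https://example.org/page1        <- start of first record
    //    ...metadata fields and body text of page1...
    //                                     <- empty line ends the record
    //    https://example.org/page2        <- start of next record
    //    ...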

    public void debugPageDump(StringBuilder pageDump) {
        if(DEBUG_MODE) {
            // START DEBUG
            debug("__________________________________________");
            debug("@@@ Found page entry: ");
            debug("__________________________________________");
            debug(pageDump.toString());
            debug("------------------------------------------");
            // END DEBUG
        }
    }

    public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
        // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
        this.siteID = siteID;
        this.maoriTxtDetector = maoriTxtDetector;

        pages = new ArrayList<TextDumpPage>();

        String line = null;
        StringBuilder pageDump = null;
        try (
             BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
        ) {
            String prevLine = null;

            while((line = reader.readLine()) != null) { // readLine removes newline separator
                line = line.trim();
                // An empty line marks the end of a page in nutch's text dump of a site,
                // but only when we're outside of a page's body text.
                // Note that there can be an empty line (or more?) between the start and end
                // markers of a page's text, though.

                if(isStartOfNewWebPageRecord(prevLine, line)) {

                    if(pageDump != null) { // should also be the case then: if(prevLine != null)
                        // finish old pageDump and begin new one

                        //debugPageDump(pageDump);

                        TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                        // parses the fields and body text of a webpage in nutch's txt dump of the entire site
                        //page.parseFields();
                        //page.getText();
                        pages.add(page);
                        pageDump = null;
                    }

                    // begin new webpage dump
                    pageDump = new StringBuilder();
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                else if(pageDump != null && !line.equals("")) {
                    // The null check guards against a dump.txt that doesn't open with a
                    // URL line, which would otherwise cause a NullPointerException here.
                    pageDump.append(line);
                    pageDump.append("\n");
                }
                // can throw away any newlines between text start and end markers.

                prevLine = line;
            }

            // process the final webpage record, if the file contained any:
            //debugPageDump(pageDump);

            if(pageDump != null) { // null if dump.txt was empty
                TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
                pages.add(page);
                pageDump = null;
            }

        } catch (IOException ioe) {
            error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
        }

        // Just do this once: get the domain of the site.
        // Passing true to get the domain with its protocol prefix.
        if(pages.size() > 0) {
            TextDumpPage firstPage = pages.get(0);
            String url = firstPage.getPageURL();
            this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
        }
        else {
            this.domainOfSite = "UNKNOWN";
        }

        prepareSiteStats();
    }

    /** pageID: index into the pages array */
    public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {

        String text = getTextForPage(pageID);

        // QTODO: what to do when page body text is empty?
        if(text.equals("")) return false;
        return maoriTxtDetector.isTextInMaori(text);
    }

    private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
        if(pageID < 0 || pageID >= pages.size()) {
            throw new ArrayIndexOutOfBoundsException();
        }

        TextDumpPage page = pages.get(pageID);
        return page;
    }

    public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageText();
    }

    public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
        TextDumpPage page = getPage(pageID);
        return page.getPageURL();
    }

    public int totalNumPages() {
        return pages.size();
    }

    private void prepareSiteStats() {
        pagesInMRI = new ArrayList<MRIWebPageStats>();

        TextDumpPage page = null;
        for(int i = 0; i < pages.size(); i++) {

            page = pages.get(i);

            /*
            // just do this once: get domain. Passing true to get domain with protocol prefix
            if(this.domainOfSite == null) {
                this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
            }
            */

            String text = page.getPageText();
            if(text.equals("")) {
                page.addMRILanguageStatus(false);
                continue;
            }
            else {
                boolean isMRI = maoriTxtDetector.isTextInMaori(text);
                page.addMRILanguageStatus(isMRI);

                if(isMRI) { // add page to list of meaningful pages
                    String url = page.getPageURL();
                    MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i);
                    pagesInMRI.add(MRIpageStats);
                }
            }
        }
    }

    public void printSiteStats() {

        info("------------- " + this.siteID + " SITE STATS -----------");

        info("SITE DOMAIN: " + this.domainOfSite);
        info("Total number of web pages in site: " + pages.size());
        info("Of these, the number of pages in Māori (mri) was: " + this.pagesInMRI.size());

        info("The following pages were detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence:");
        for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
            info("URL: " + mriWebPageInfo.URL);
            info("siteID: " + mriWebPageInfo.siteID);
        }
        info(" ----------- ");
    }
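
    /* Illustrative output of printSiteStats(). The domain, counts, URL and confidence
     * value below are hypothetical placeholders, not real results:
     *
     *    ------------- 00001 SITE STATS -----------
     *    SITE DOMAIN: https://example.org
     *    Total number of web pages in site: 52
     *    Of these, the number of pages in Māori (mri) was: 3
     *    The following pages were detected by OpenNLP as being in Māori with 0.9 confidence:
     *    URL: https://example.org/korero.html
     *    siteID: 00001
     *     -----------
     */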

    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
    public static void info(String msg) {
        System.err.println(msg);
        logger.info(msg);
    }
    public static void debug(String msg) {
        System.err.println(msg);
        logger.debug(msg);
    }
    public static void warn(String msg) {
        System.err.println(msg);
        logger.warn(msg);
    }
    public static void error(String msg) {
        System.err.println(msg);
        logger.error(msg);
    }
    public static void error(String msg, Exception e) {
        logger.error(msg, e);
        System.err.println("\n" + msg);
        e.printStackTrace();
    }

    public static void printUsage() {
        info("Run this program as:");
        info("\tNutchTextDumpProcessor <path to 'sites' folder>");
    }

    public static void main(String[] args) {
        if(args.length != 1) {
            printUsage();
            return;
        }

        File sitesDir = new File(args[0]);
        if(!sitesDir.exists() || !sitesDir.isDirectory()) {
            error("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        NutchTextDumpProcessor.DEBUG_MODE = false;

        try {
            MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
            File[] sites = sitesDir.listFiles();
            for(File siteDir : sites) { // e.g. 00001
                if(siteDir.isDirectory()) {
                    // look for dump.txt
                    File txtDumpFile = new File(siteDir, "dump.txt");
                    if(!txtDumpFile.exists()) {
                        error("Text dump file " + txtDumpFile + " did not exist");
                        continue;
                    }
                    else {
                        String siteID = siteDir.getName();
                        debug("Found siteID: " + siteID);
                        NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
                        // now it's parsed all the web pages in the site's text dump

                        // Let's print stats on each web page's detected language being MRI or not,
                        // and how many pages there were in the site in total.
                        nutchTxtDump.printSiteStats();
                    }
                }
            }

        } catch(Exception e) {
            // can get an exception when instantiating the MaoriTextDetector instance
            error(e.getMessage(), e);
        }
    }
}