Context Navigation

NutchTextDumpProcessor.java@ 33587

Last change on this file since 33587 was 33587, checked in by ak19, 5 years ago

1. Better stats reporting on crawled sites: not just whether a page was in MRI or not, but, for those pages that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later if or when we want to store MRI language sentences/paragraphs. Currently this is only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element of the returned ArrayList.

File size: 11.8 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.lang.ArrayIndexOutOfBoundsException;
5	import java.util.ArrayList;
6	import java.util.Arrays;
7
8	import org.apache.log4j.Logger;
9
10	/**
11	* Class to process the dump text files produced FOR EACH SITE (e.g. site "00001") that
12	* Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
13	* This reads in the dump.txt file contained in each site folder within the input folder.
14	* (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
15	* Each dump.txt could contain the text contents for an entire site, or for individual pages.
16	* This class then uses class TextDumpPage to parse each webpage within a dump.txt,
17	* which parses out the actual text body content of each webpage's section within a dump.txt.
18	* Finally, MaoriTextDetector is run over that to determine whether the full body text is
19	* likely to be in Maori or not.
20	*
21	* Potential issues: since a web page's text is dumped out by nutch with neither paragraph
22	* nor even newline separator, it's hard to be sure that the entire page is in language.
23	* If it's in multiple languages, there's no way to be sure there aren't promising Maori language
24	* paragraphs contained in a page, if the majority/the remainder happen to be in English.
25	*
26	* So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
27	* the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
28	* instead of running it over the entire html body's text.
29	*
30	* TO COMPILE OR RUN, FIRST DO:
31	* cd maori-lang-detection/apache-opennlp-1.9.1
32	* export OPENNLP_HOME=`pwd`
33	* cd maori-lang-detection/src
34	*
35	* TO COMPILE:
36	* maori-lang-detection/src$
37	* javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
38	*
39	* TO RUN:
40	* maori-lang-detection/src$
41	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
42	*
43	* or:
44	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled > ../crawled/bla.txt 2>&1
45	*
46	*/
47	public class NutchTextDumpProcessor {
48	private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
49
50	static boolean DEBUG_MODE = true;
51
52	private final MaoriTextDetector maoriTxtDetector;
53
54	public final String siteID; // is this necessary?
55	private String domainOfSite;
56
57	/** keep a list to store the text of each page */
58	private ArrayList<TextDumpPage> pages;
59
60	/** list of pages in this site which were detected as being in MRI */
61	private ArrayList<MRIWebPageStats> pagesInMRI;
62	/** list of pages in this site which were NOT detected as being in MRI but nevertheless
63	* contain one or more sentences in MRI
64	*/
65	private ArrayList<MRIWebPageStats> pagesContainingMRI;
66
67	private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
68	// The start of a new web page's record in nutch's text dump of an entire site
69	// is denoted by a newline followed by a URL (protocol)
70	// or the very start of the file with a URL (protocol)
71	return ((prevLine == null \|\| prevLine.equals(""))
72	&& (line.startsWith("http://") \|\| line.startsWith("https://")));
73	}
74
75	public void debugPageDump(StringBuilder pageDump) {
76	if(DEBUG_MODE) {
77	// START DEBUG
78	debug("__________________________________________");
79	debug("@@@ Found page entry: ");
80	debug("__________________________________________");
81	debug(pageDump.toString());
82	debug("------------------------------------------");
83	// END DEBUG
84	}
85	}
86
87	public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
88	// siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
89	this.siteID = siteID;
90	this.maoriTxtDetector = maoriTxtDetector;
91
92	pages = new ArrayList<TextDumpPage>();
93
94	String line = null;
95	StringBuilder pageDump = null;
96	try (
97	BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
98	) {
99
100	boolean readingText = false;
101	String prevLine = null;
102
103	while((line = reader.readLine()) != null) { // readLine removes newline separator
104	line = line.trim();
105	// iff outside of a page's body text, then an empty line marks the end of a page
106	// in nutch's text dump of a site.
107	// But note, there can be an empty line (or more?) between the start and end
108	// markers of a page's text, though.
109
110	if(isStartOfNewWebPageRecord(prevLine, line)) {
111
112	if(pageDump != null) { // should also be the case then: if(prevLine != null)
113	// finish old pageDump and begin new one
114
115	//debugPageDump(pageDump);
116
117	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
118	// parses the fields and body text of a webpage in nutch's txt dump of entire site
119	//page.parseFields();
120	//page.getText();
121	pages.add(page);
122	pageDump = null;
123
124	}
125
126	// begin new webpage dump
127	pageDump = new StringBuilder();
128	pageDump.append(line);
129	pageDump.append("\n");
130
131	}
132	else if(!line.equals("")) {
133	pageDump.append(line);
134	pageDump.append("\n");
135
136	}
137	// can throw away any newlines between text start and end markers.
138
139	prevLine = line;
140	}
141
142	// process final webpage record:
143	//debugPageDump(pageDump);
144
145	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
146	pages.add(page);
147	pageDump = null;
148
149	} catch (IOException ioe) {
150	error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
151	}
152
153	// Just do this once: get and store domain of site.
154	// Passing true to get domain with protocol prefix
155	if(pages.size() > 0) {
156	TextDumpPage firstPage = pages.get(0);
157	String url = firstPage.getPageURL();
158	this.domainOfSite = CCWETProcessor.getDomainForURL(url, true);
159	}
160	else {
161	this.domainOfSite = "UNKNOWN";
162	}
163
164	prepareSiteStats();
165	}
166
167	/** pageID: id into pages array */
168	public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
169
170	String text = getTextForPage(pageID);
171
172	// QTODO: what to do when page body text is empty?
173	if(text.equals("")) return false;
174	return maoriTxtDetector.isTextInMaori(text);
175	}
176
177	private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
178	if(pageID < 0 \|\| pageID >= pages.size()) {
179	throw new ArrayIndexOutOfBoundsException();
180	}
181
182	TextDumpPage page = pages.get(pageID);
183	return page;
184	}
185
186	public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
187	TextDumpPage page = getPage(pageID);
188	return page.getPageText();
189	}
190	public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
191	TextDumpPage page = getPage(pageID);
192	return page.getPageURL();
193	}
194
195	public int totalNumPages() {
196	return pages.size();
197	}
198
199	private void prepareSiteStats() {
200	pagesInMRI = new ArrayList<MRIWebPageStats>();
201	pagesContainingMRI = new ArrayList<MRIWebPageStats>();
202
203	TextDumpPage page = null;
204	for(int i = 0; i < pages.size(); i++) {
205
206	page = pages.get(i);
207
208	String text = page.getPageText();
209	if(text.equals("")) {
210	page.addMRILanguageStatus(false);
211	continue;
212	}
213	else {
214	boolean isMRI = maoriTxtDetector.isTextInMaori(text);
215
216	page.addMRILanguageStatus(isMRI);
217
218	// Even if the entire page is not found to be overall in MÄori,
219	// let's sitll inspect the sentences of the page and count how many (if any)
220	// are in te reo.
221	ArrayList<String> mriSentences = maoriTxtDetector.getAllSentencesInMaori(text);
222	// first element is always total num sentences
223	// remaining elements are the actual sentences that were detected as being MÄori
224	int totalSentences = Integer.parseInt(mriSentences.get(0));
225	int numSentencesInMRI = mriSentences.size() - 1;
226
227
228	// Add page to list of MRI pages if the page's body text overall was detected
229	// as MÄori
230	// Add page to list of pages containing MRI if >= 1 sentences in the page
231	// were detected as being in MRI
232	if(isMRI \|\| numSentencesInMRI >= 1) {
233	String url = page.getPageURL();
234	MRIWebPageStats MRIpageStats = new MRIWebPageStats(this.siteID, url, i, isMRI,
235	totalSentences, numSentencesInMRI);
236	if(isMRI) {
237	pagesInMRI.add(MRIpageStats);
238	} else if(numSentencesInMRI >= 1) {
239	pagesContainingMRI.add(MRIpageStats);
240	}
241
242	}
243	}
244	}
245	}
246
247	public void printSiteStats() {
248
249
250	info("------------- " + this.siteID + " SITE STATS -----------");
251
252	info("SITE DOMAIN: " + this.domainOfSite);
253	info("Total number of web pages in site: " + pages.size());
254	info("Of these, the number of pages in MÄori (mri) were: " + this.pagesInMRI.size());
255
256	if(pagesInMRI.size() > 0) {
257	info("The following were the pages detected by OpenNLP as being in MÄori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
258	for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
259	info(mriWebPageInfo.toString());
260	}
261	}
262
263	info(" ----------- ");
264	if(pagesContainingMRI.size() > 0) {
265	info("The following pages weren't detected as primarily being in MÄori");
266	info("But still contained sentences detected as MÄori");
267	for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
268	info(mriWebPageInfo.toString());
269	}
270
271	} else {
272	info("No further pages detected as containing any sentences in MRI");
273	}
274	info(" ----------- ");
275	}
276
277
278	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
279	public static void info(String msg) {
280	System.err.println(msg);
281	logger.info(msg);
282	}
283	public static void debug(String msg) {
284	System.err.println(msg);
285	logger.debug(msg);
286	}
287	public static void warn(String msg) {
288	System.err.println(msg);
289	logger.warn(msg);
290	}
291	public static void error(String msg) {
292	System.err.println(msg);
293	logger.error(msg);
294	}
295	public static void error(String msg, Exception e) {
296	logger.error(msg, e);
297	System.err.println("\n"+msg);
298	e.printStackTrace();
299	}
300
301	public static void printUsage() {
302	info("Run this program as:");
303	info("\tNutchTextDumpProcessor <path to 'sites' folder>");
304	}
305
306	public static void main(String[] args) {
307	if(args.length != 1) {
308	printUsage();
309	return;
310	}
311
312	File sitesDir = new File(args[0]);
313	if(!sitesDir.exists() \|\| !sitesDir.isDirectory()) {
314	error("Error: " + args[0] + " does not exist or is not a directory");
315	return;
316	}
317
318	NutchTextDumpProcessor.DEBUG_MODE = false;
319
320	try {
321	MaoriTextDetector mriTxtDetector = new MaoriTextDetector(true); // true: run silent
322	File[] sites = sitesDir.listFiles();
323
324	// sort site folders in alphabetical order
325	// https://stackoverflow.com/questions/7199911/how-to-file-listfiles-in-alphabetical-order
326	Arrays.sort(sites);
327
328	for(File siteDir : sites) { // e.g. 00001
329	if(siteDir.isDirectory()) {
330	// look for dump.txt
331	File txtDumpFile = new File(siteDir, "dump.txt");
332	if(!txtDumpFile.exists()) {
333	error("Text dump file " + txtDumpFile + " did not exist");
334	continue;
335	}
336
337	else {
338	String siteID = siteDir.getName();
339	debug("Found siteID: " + siteID);
340	NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
341	// now it's parsed all the web pages in the site's text dump
342
343	// Let's print stats on each web page's detected language being MRI or not
344	// and how many pages there were in the site in total.
345
346	nutchTxtDump.printSiteStats();
347	}
348	}
349
350	}
351
352	} catch(Exception e) {
353	// can get an exception when instantiating CCWETProcessor instance
354	error(e.getMessage(), e);
355	}
356	}
357	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33587

Download in other formats: