Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33579

Last change on this file since 33579 was 33579, checked in by ak19, 5 years ago
Debugging. Solved one problem.
File size: 7.0 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.ArrayList;
5	//import java.util.HashMap;
6	//import java.util.Map;
7	import java.lang.ArrayIndexOutOfBoundsException;
8
9	import org.apache.log4j.Logger;
10
11	/**
12	* Class to process the dump text files produced for each site (e.g. site "00001") that
13	* Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
14	* This reads in the dump.txt file contained in each site folder within the input folder.
15	* (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
16	* Each dump.txt could contain the text contents for an entire site, or for individual pages.
17	* This class then uses class TextDumpPage to parse each webpage within a dump.txt,
18	* which parses out the actual text body content of each webpage's section within a dump.txt.
19	* Finally, MaoriTextDetector is run over that to determine whether the full body text is
20	* likely to be in Maori or not.
21	*
22	* Potential issues: since a web page's text is dumped out by nutch with neither paragraph
23	* nor even newline separator, it's hard to be sure that the entire page is in language.
24	* If it's in multiple languages, there's no way to be sure there aren't promising Maori language
25	* paragraphs contained in a page, if the majority/the remainder happen to be in English.
26	*
27	* So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
28	* the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
29	* instead of running it over the entire html body's text.
30	*
31	* TO COMPILE OR RUN, FIRST DO:
32	* cd maori-lang-detection/apache-opennlp-1.9.1
33	* export OPENNLP_HOME=`pwd`
34	* cd maori-lang-detection/src
35	*
36	* TO COMPILE:
37	* maori-lang-detection/src$
38	* javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
39	*
40	* TO RUN:
41	* maori-lang-detection/src$
42	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
43	*
44	*/
45	public class NutchTextDumpProcessor {
46	private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
47
48	private final MaoriTextDetector maoriTxtDetector;
49
50	public final String siteID; // is this necessary?
51
52	/** keep a list to store the text of each page */
53	private ArrayList<TextDumpPage> pages;
54
55
56	public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
57	// siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
58	this.siteID = siteID;
59	this.maoriTxtDetector = maoriTxtDetector;
60
61	pages = new ArrayList<TextDumpPage>();
62
63	String line = null;
64	StringBuilder pageDump = new StringBuilder();
65	try (
66	BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
67	) {
68
69	boolean readingText = false;
70
71	while((line = reader.readLine()) != null) { // readLine removes newline separator
72	line = line.trim();
73	// iff outside of a page's body text, then an empty line marks the end of a page
74	// in nutch's text dump of a site.
75	// But note, there can be an empty line (or more?) between the start and end
76	// markers of a page's text, though.
77
78	if(!readingText && line.equals("")) {
79	pageDump.append(line);
80	pageDump.append("\n");
81
82
83	// START DEBUG
84	debug("__________________________________________");
85	debug("@@@ Found page entry: ");
86	debug("__________________________________________");
87	debug(pageDump.toString());
88	debug("------------------------------------------");
89	// END DEBUG
90
91
92	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
93	// parses the fields and body text of a webpage in nutch's txt dump of entire site
94	//page.parseFields();
95	//page.getText();
96	pages.add(page);
97	pageDump = null;
98
99	pageDump = new StringBuilder();
100	}
101	else if (!line.equals("")) { // empty line
102	if(line.equals(TextDumpPage.TEXT_START_MARKER)) {
103	readingText = true;
104	}
105	if(line.equals(TextDumpPage.TEXT_END_MARKER)) {
106	readingText = false;
107	}
108	pageDump.append(line);
109	pageDump.append("\n");
110	}
111	// can throw away any newlines between text start and end markers.
112	}
113
114	} catch (IOException ioe) {
115	error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
116	}
117
118	}
119
120	/** pageID: id into pages array */
121	public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
122
123	String text = getTextForPage(pageID);
124	// QTODO: what to do when page body text is empty?
125	if(text.equals("")) return false;
126	return maoriTxtDetector.isTextInMaori(text);
127	}
128
129	private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
130	if(pageID < 0 \|\| pageID >= pages.size()) {
131	throw new ArrayIndexOutOfBoundsException();
132	}
133
134	TextDumpPage page = pages.get(pageID);
135	return page;
136	}
137
138	public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
139	TextDumpPage page = getPage(pageID);
140	return page.getPageText();
141	}
142	public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
143	TextDumpPage page = getPage(pageID);
144	return page.getPageURL();
145	}
146
147
148	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
149	public static void info(String msg) {
150	System.err.println(msg);
151	logger.info(msg);
152	}
153	public static void debug(String msg) {
154	System.err.println(msg);
155	logger.debug(msg);
156	}
157	public static void warn(String msg) {
158	System.err.println(msg);
159	logger.warn(msg);
160	}
161	public static void error(String msg) {
162	System.err.println(msg);
163	logger.error(msg);
164	}
165	public static void error(String msg, Exception e) {
166	logger.error(msg, e);
167	System.err.println("\n"+msg);
168	e.printStackTrace();
169	}
170
171	public static void printUsage() {
172	info("Run this program as:");
173	info("\tNutchTextDumpProcessor <path to 'sites' folder>");
174	}
175
176	public static void main(String[] args) {
177	if(args.length != 1) {
178	printUsage();
179	return;
180	}
181
182	File sitesDir = new File(args[0]);
183	if(!sitesDir.exists() \|\| !sitesDir.isDirectory()) {
184	error("Error: " + args[0] + " does not exist or is not a directory");
185	return;
186	}
187
188	try {
189	MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
190	File[] sites = sitesDir.listFiles();
191	for(File siteDir : sites) { // e.g. 00001
192	// look for dump.txt
193	File txtDumpFile = new File(siteDir, "dump.txt");
194	if(!txtDumpFile.exists()) {
195	error("Text dump file " + txtDumpFile + " did not exist");
196	continue;
197	}
198
199	else {
200	String siteID = siteDir.getName();
201	debug("Found siteID: " + siteID);
202	NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
203	}
204
205	}
206
207	} catch(Exception e) {
208	// can get an exception when instantiating CCWETProcessor instance
209	error(e.getMessage(), e);
210	}
211	}
212	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: