Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33578

Last change on this file since 33578 was 33578, checked in by ak19, 5 years ago
Corrections for compiling the 2 new classes.
File size: 6.1 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.ArrayList;
5	//import java.util.HashMap;
6	//import java.util.Map;
7	import java.lang.ArrayIndexOutOfBoundsException;
8
9	import org.apache.log4j.Logger;
10
11	/**
12	* Class to process the dump text files produced for each site (e.g. site "00001") that
13	* Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
14	* This reads in the dump.txt file contained in each site folder within the input folder.
15	* (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
16	* Each dump.txt could contain the text contents for an entire site, or for individual pages.
17	* This class then uses class TextDumpPage to parse each webpage within a dump.txt,
18	* which parses out the actual text body content of each webpage's section within a dump.txt.
19	* Finally, MaoriTextDetector is run over that to determine whether the full body text is
20	* likely to be in Maori or not.
21	*
22	* Potential issues: since a web page's text is dumped out by nutch with neither paragraph
23	* nor even newline separator, it's hard to be sure that the entire page is in a single language.
24	* If it's in multiple languages, there's no way to be sure there aren't promising Maori language
25	* paragraphs contained in a page, if the majority/the remainder happen to be in English.
26	*
27	* So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
28	* the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
29	* instead of running it over the entire html body's text.
30	*
31	* TO COMPILE OR RUN, FIRST DO:
32	* cd maori-lang-detection/apache-opennlp-1.9.1
33	* export OPENNLP_HOME=`pwd`
34	* cd maori-lang-detection/src
35	*
36	* TO COMPILE:
37	* maori-lang-detection/src$
38	* javac -cp ".:../conf:../lib/:$OPENNLP_HOME/lib/" org/greenstone/atea/NutchTextDumpProcessor.java
39	*
40	* TO RUN:
41	* maori-lang-detection/src$
42	* java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor
43	*
44	*/
45	public class NutchTextDumpProcessor {
46	private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
47
48	private final MaoriTextDetector maoriTxtDetector;
49
50	public final String siteID; // is this necessary?
51
52	/** keep a list to store the text of each page */
53	private ArrayList<TextDumpPage> pages;
54
55
56	public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
57	// siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
58	this.siteID = siteID;
59	this.maoriTxtDetector = maoriTxtDetector;
60
61	pages = new ArrayList<TextDumpPage>();
62
63	String line = null;
64	StringBuilder pageDump = new StringBuilder();
65	try (
66	BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
67	) {
68
69	while((line = reader.readLine()) != null) { // readLine removes newline separator
70	line = line.trim();
71	// an empty line marks the end of a page in nutch's text dump of a site
72	if(!line.equals("")) {
73	pageDump.append(line);
74	pageDump.append("\n");
75	} else {
76	TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
77	// parses the fields and body text of a webpage in nutch's txt dump of entire site
78	//page.parseFields();
79	//page.getText();
80	pages.add(page);
81	pageDump = null;
82	pageDump = new StringBuilder();
83	}
84	}
85
86	} catch (IOException ioe) {
87	error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
88	}
89
90	}
91
92	/** pageID: id into pages array */
93	public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
94
95	String text = getTextForPage(pageID);
96	// QTODO: what to do when page body text is empty?
97	if(text.equals("")) return false;
98	return maoriTxtDetector.isTextInMaori(text);
99	}
100
101	private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
102	if(pageID < 0 \|\| pageID >= pages.size()) {
103	throw new ArrayIndexOutOfBoundsException();
104	}
105
106	TextDumpPage page = pages.get(pageID);
107	return page;
108	}
109
110	public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
111	TextDumpPage page = getPage(pageID);
112	return page.getPageText();
113	}
114	public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
115	TextDumpPage page = getPage(pageID);
116	return page.getPageURL();
117	}
118
119
120	// --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
121	public static void info(String msg) {
122	System.err.println(msg);
123	logger.info(msg);
124	}
125	public static void debug(String msg) {
126	System.err.println(msg);
127	logger.debug(msg);
128	}
129	public static void warn(String msg) {
130	System.err.println(msg);
131	logger.warn(msg);
132	}
133	public static void error(String msg) {
134	System.err.println(msg);
135	logger.error(msg);
136	}
137	public static void error(String msg, Exception e) {
138	logger.error(msg, e);
139	System.err.println("\n"+msg);
140	e.printStackTrace();
141	}
142
143	public static void printUsage() {
144	info("Run this program as:");
145	info("\tNutchTextDumpProcessor <path to 'sites' folder>");
146	}
147
148	public static void main(String[] args) {
149	if(args.length != 1) {
150	printUsage();
151	return;
152	}
153
154	File sitesDir = new File(args[0]);
155	if(!sitesDir.exists() \|\| !sitesDir.isDirectory()) {
156	error("Error: " + args[0] + " does not exist or is not a directory");
157	return;
158	}
159
160	try {
161	MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
162	File[] sites = sitesDir.listFiles();
163	for(File siteDir : sites) { // e.g. 00001
164	// look for dump.txt
165	File txtDumpFile = new File(siteDir, "dump.txt");
166	if(!txtDumpFile.exists()) {
167	error("Text dump file " + txtDumpFile + " did not exist");
168	continue;
169	}
170
171	else {
172	String siteID = siteDir.getName();
173	NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
174
175	}
176
177	}
178
179	} catch(Exception e) {
180	// can get an exception when instantiating CCWETProcessor instance
181	error(e.getMessage(), e);
182	}
183	}
184	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: