source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33579

Last change on this file since 33579 was 33579, checked in by ak19, 5 years ago

Debugging. Solved one problem.

File size: 7.0 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.ArrayList;
5//import java.util.HashMap;
6//import java.util.Map;
7import java.lang.ArrayIndexOutOfBoundsException;
8
9import org.apache.log4j.Logger;
10
11/**
12 * Class to process the dump text files produced for each site (e.g. site "00001") that
13 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
14 * This reads in the dump.txt file contained in each site folder within the input folder.
15 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
16 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
17 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
18 * which parses out the actual text body content of each webpage's section within a dump.txt.
19 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
20 * likely to be in Maori or not.
21 *
22 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
23 * nor even newline separator, it's hard to be sure that the entire page is in a single language.
24 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
25 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
26 *
27 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
28 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
29 * instead of running it over the entire html body's text.
30 *
31 * TO COMPILE OR RUN, FIRST DO:
32 * cd maori-lang-detection/apache-opennlp-1.9.1
33 * export OPENNLP_HOME=`pwd`
34 * cd maori-lang-detection/src
35 *
36 * TO COMPILE:
37 * maori-lang-detection/src$
38 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor.java
39 *
40 * TO RUN:
41 * maori-lang-detection/src$
42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor ../crawled
43 *
44*/
45public class NutchTextDumpProcessor {
46 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
47
48 private final MaoriTextDetector maoriTxtDetector;
49
50 public final String siteID; // is this necessary?
51
52 /** keep a list to store the text of each page */
53 private ArrayList<TextDumpPage> pages;
54
55
56 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
57 // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
58 this.siteID = siteID;
59 this.maoriTxtDetector = maoriTxtDetector;
60
61 pages = new ArrayList<TextDumpPage>();
62
63 String line = null;
64 StringBuilder pageDump = new StringBuilder();
65 try (
66 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
67 ) {
68
69 boolean readingText = false;
70
71 while((line = reader.readLine()) != null) { // readLine removes newline separator
72 line = line.trim();
73 // iff outside of a page's body text, then an empty line marks the end of a page
74 // in nutch's text dump of a site.
75 // But note, there can be an empty line (or more?) between the start and end
76 // markers of a page's text, though.
77
78 if(!readingText && line.equals("")) {
79 pageDump.append(line);
80 pageDump.append("\n");
81
82
83 // START DEBUG
84 debug("__________________________________________");
85 debug("@@@ Found page entry: ");
86 debug("__________________________________________");
87 debug(pageDump.toString());
88 debug("------------------------------------------");
89 // END DEBUG
90
91
92 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
93 // parses the fields and body text of a webpage in nutch's txt dump of entire site
94 //page.parseFields();
95 //page.getText();
96 pages.add(page);
97 pageDump = null;
98
99 pageDump = new StringBuilder();
100 }
101 else if (!line.equals("")) { // empty line
102 if(line.equals(TextDumpPage.TEXT_START_MARKER)) {
103 readingText = true;
104 }
105 if(line.equals(TextDumpPage.TEXT_END_MARKER)) {
106 readingText = false;
107 }
108 pageDump.append(line);
109 pageDump.append("\n");
110 }
111 // can throw away any newlines between text start and end markers.
112 }
113
114 } catch (IOException ioe) {
115 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
116 }
117
118 }
119
120 /** pageID: id into pages array */
121 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
122
123 String text = getTextForPage(pageID);
124 // QTODO: what to do when page body text is empty?
125 if(text.equals("")) return false;
126 return maoriTxtDetector.isTextInMaori(text);
127 }
128
129 private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
130 if(pageID < 0 || pageID >= pages.size()) {
131 throw new ArrayIndexOutOfBoundsException();
132 }
133
134 TextDumpPage page = pages.get(pageID);
135 return page;
136 }
137
138 public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
139 TextDumpPage page = getPage(pageID);
140 return page.getPageText();
141 }
142 public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
143 TextDumpPage page = getPage(pageID);
144 return page.getPageURL();
145 }
146
147
148 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
149 public static void info(String msg) {
150 System.err.println(msg);
151 logger.info(msg);
152 }
153 public static void debug(String msg) {
154 System.err.println(msg);
155 logger.debug(msg);
156 }
157 public static void warn(String msg) {
158 System.err.println(msg);
159 logger.warn(msg);
160 }
161 public static void error(String msg) {
162 System.err.println(msg);
163 logger.error(msg);
164 }
165 public static void error(String msg, Exception e) {
166 logger.error(msg, e);
167 System.err.println("\n"+msg);
168 e.printStackTrace();
169 }
170
171 public static void printUsage() {
172 info("Run this program as:");
173 info("\tNutchTextDumpProcessor <path to 'sites' folder>");
174 }
175
176 public static void main(String[] args) {
177 if(args.length != 1) {
178 printUsage();
179 return;
180 }
181
182 File sitesDir = new File(args[0]);
183 if(!sitesDir.exists() || !sitesDir.isDirectory()) {
184 error("Error: " + args[0] + " does not exist or is not a directory");
185 return;
186 }
187
188 try {
189 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
190 File[] sites = sitesDir.listFiles();
191 for(File siteDir : sites) { // e.g. 00001
192 // look for dump.txt
193 File txtDumpFile = new File(siteDir, "dump.txt");
194 if(!txtDumpFile.exists()) {
195 error("Text dump file " + txtDumpFile + " did not exist");
196 continue;
197 }
198
199 else {
200 String siteID = siteDir.getName();
201 debug("Found siteID: " + siteID);
202 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
203 }
204
205 }
206
207 } catch(Exception e) {
208 // can get an exception when instantiating CCWETProcessor instance
209 error(e.getMessage(), e);
210 }
211 }
212}
Note: See TracBrowser for help on using the repository browser.