source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpProcessor.java@ 33578

Last change on this file since 33578 was 33578, checked in by ak19, 5 years ago

Corrections for compiling the 2 new classes.

File size: 6.1 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.ArrayList;
5//import java.util.HashMap;
6//import java.util.Map;
7import java.lang.ArrayIndexOutOfBoundsException;
8
9import org.apache.log4j.Logger;
10
11/**
12 * Class to process the dump text files produced for each site (e.g. site "00001") that
13 * Nutch has finished crawling and whose text has been dumped out to a file called dump.txt.
14 * This reads in the dump.txt file contained in each site folder within the input folder.
15 * (e.g. input folder "crawled" could contain folders 00001 to 01465. Each contains a dump.txt)
16 * Each dump.txt could contain the text contents for an entire site, or for individual pages.
17 * This class then uses class TextDumpPage to parse each webpage within a dump.txt,
18 * which parses out the actual text body content of each webpage's section within a dump.txt.
19 * Finally, MaoriTextDetector is run over that to determine whether the full body text is
20 * likely to be in Maori or not.
21 *
22 * Potential issues: since a web page's text is dumped out by nutch with neither paragraph
23 * nor even newline separator, it's hard to be sure that the entire page is in language.
24 * If it's in multiple languages, there's no way to be sure there aren't promising Maori language
25 * paragraphs contained in a page, if the majority/the remainder happen to be in English.
26 *
27 * So if we're looking for any paragraphs in Maori to store in a DB, perhaps it's better to run
28 * the MaoriTextDetector.isTextInMaori(BufferedReader reader) over two "lines" at a time,
29 * instead of running it over the entire html body's text.
30 *
31 * TO COMPILE OR RUN, FIRST DO:
32 * cd maori-lang-detection/apache-opennlp-1.9.1
33 * export OPENNLP_HOME=`pwd`
34 * cd maori-lang-detection/src
35 *
36 * TO COMPILE:
37 * maori-lang-detection/src$
38 * javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/*" org/greenstone/atea/NutchTextDumpProcessor.java
39 *
40 * TO RUN:
41 * maori-lang-detection/src$
42 * java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/NutchTextDumpProcessor
43 *
44*/
45public class NutchTextDumpProcessor {
46 private static Logger logger = Logger.getLogger(org.greenstone.atea.NutchTextDumpProcessor.class.getName());
47
48 private final MaoriTextDetector maoriTxtDetector;
49
50 public final String siteID; // is this necessary?
51
52 /** keep a list to store the text of each page */
53 private ArrayList<TextDumpPage> pages;
54
55
56 public NutchTextDumpProcessor(MaoriTextDetector maoriTxtDetector, String siteID, File txtDumpFile) {
57 // siteID is of the form %5d (e.g. 00020) and is just the name of a site folder
58 this.siteID = siteID;
59 this.maoriTxtDetector = maoriTxtDetector;
60
61 pages = new ArrayList<TextDumpPage>();
62
63 String line = null;
64 StringBuilder pageDump = new StringBuilder();
65 try (
66 BufferedReader reader = new BufferedReader(new FileReader(txtDumpFile));
67 ) {
68
69 while((line = reader.readLine()) != null) { // readLine removes newline separator
70 line = line.trim();
71 // an empty line marks the end of a page in nutch's text dump of a site
72 if(!line.equals("")) {
73 pageDump.append(line);
74 pageDump.append("\n");
75 } else {
76 TextDumpPage page = new TextDumpPage(siteID, pageDump.toString());
77 // parses the fields and body text of a webpage in nutch's txt dump of entire site
78 //page.parseFields();
79 //page.getText();
80 pages.add(page);
81 pageDump = null;
82 pageDump = new StringBuilder();
83 }
84 }
85
86 } catch (IOException ioe) {
87 error("@@@@@@@@@ Error reading in nutch txtdump file " + txtDumpFile, ioe);
88 }
89
90 }
91
92 /** pageID: id into pages array */
93 public boolean isPageInMaori(int pageID) throws ArrayIndexOutOfBoundsException {
94
95 String text = getTextForPage(pageID);
96 // QTODO: what to do when page body text is empty?
97 if(text.equals("")) return false;
98 return maoriTxtDetector.isTextInMaori(text);
99 }
100
101 private TextDumpPage getPage(int pageID) throws ArrayIndexOutOfBoundsException {
102 if(pageID < 0 || pageID >= pages.size()) {
103 throw new ArrayIndexOutOfBoundsException();
104 }
105
106 TextDumpPage page = pages.get(pageID);
107 return page;
108 }
109
110 public String getTextForPage(int pageID) throws ArrayIndexOutOfBoundsException {
111 TextDumpPage page = getPage(pageID);
112 return page.getPageText();
113 }
114 public String getURLForPage(int pageID) throws ArrayIndexOutOfBoundsException {
115 TextDumpPage page = getPage(pageID);
116 return page.getPageURL();
117 }
118
119
120 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
121 public static void info(String msg) {
122 System.err.println(msg);
123 logger.info(msg);
124 }
125 public static void debug(String msg) {
126 System.err.println(msg);
127 logger.debug(msg);
128 }
129 public static void warn(String msg) {
130 System.err.println(msg);
131 logger.warn(msg);
132 }
133 public static void error(String msg) {
134 System.err.println(msg);
135 logger.error(msg);
136 }
137 public static void error(String msg, Exception e) {
138 logger.error(msg, e);
139 System.err.println("\n"+msg);
140 e.printStackTrace();
141 }
142
143 public static void printUsage() {
144 info("Run this program as:");
145 info("\tNutchTextDumpProcessor <path to 'sites' folder>");
146 }
147
148 public static void main(String[] args) {
149 if(args.length != 1) {
150 printUsage();
151 return;
152 }
153
154 File sitesDir = new File(args[0]);
155 if(!sitesDir.exists() || !sitesDir.isDirectory()) {
156 error("Error: " + args[0] + " does not exist or is not a directory");
157 return;
158 }
159
160 try {
161 MaoriTextDetector mriTxtDetector = new MaoriTextDetector(false); // false: run non-silent
162 File[] sites = sitesDir.listFiles();
163 for(File siteDir : sites) { // e.g. 00001
164 // look for dump.txt
165 File txtDumpFile = new File(siteDir, "dump.txt");
166 if(!txtDumpFile.exists()) {
167 error("Text dump file " + txtDumpFile + " did not exist");
168 continue;
169 }
170
171 else {
172 String siteID = siteDir.getName();
173 NutchTextDumpProcessor nutchTxtDump = new NutchTextDumpProcessor(mriTxtDetector, siteID, txtDumpFile);
174
175 }
176
177 }
178
179 } catch(Exception e) {
180 // can get an exception when instantiating CCWETProcessor instance
181 error(e.getMessage(), e);
182 }
183 }
184}
Note: See TracBrowser for help on using the repository browser.