source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33405

Last change on this file since 33405 was 33405, checked in by ak19, 5 years ago

Even though we're probably not going to use this code after all, will commit my fixes and current state. Got location of wget downloaded site working, also filefilter. Added recursive traversal of downloaded folder to identify files to process.

File size: 11.1 KB
package org.greenstone.atea;

import java.io.*;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.



/**
 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
 * Then go into the src folder of this extension before compiling or running.
 *
 * Compile as:
 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
 *
 * Run as:
 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
 */
public class NZTLDProcessor
{

    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());

    private Properties configProps = null;
    private boolean DEBUG_MODE = true;

    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
    private final String sitesDir = "../dwn-sites";

    private void log(String msg) {
        System.err.println(msg);
        logger.info(msg);
    }

    private void debug(String msg) {
        System.err.println(msg);
        logger.debug(msg);
    }

    private void error(String msg) {
        System.err.println("ERROR: " + msg);
        logger.error(msg);
    }

    private void warn(String msg) {
        System.err.println("WARN: " + msg);
        logger.warn(msg);
    }

    public NZTLDProcessor(File seedURLsFile) throws Exception {
        log("In NZTLDProcessor constructor");

        if(DEBUG_MODE) {
            warn("**** Debug mode is turned ON!!!");
        }

        // TODO: why do I have to provide the path to config.props when this path is already on
        // the classpath?
        // In fact, I don't need it on the classpath for the following to work.
        // How do I get it to work by specifying its path on the classpath and not here?
        // Note: getResourceAsStream() below locates config.properties by searching the classpath
        // (the run command above puts ../conf and ../lib/* on the classpath), so no filesystem
        // path is needed here.
        //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
        //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
        try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
            if(infile == null) {
                throw new FileNotFoundException("config.properties was not found on the classpath");
            }
            configProps = new Properties();
            configProps.load(infile);
            // no explicit close() needed: try-with-resources closes infile

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            //e.printStackTrace();
            throw e;
        }


        try (BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
            String url = null;

            int maxURLs = 1;
            int urlCounter = 0;

            while((url = br.readLine()) != null) {

                log("Got URL: " + url);

                processURL(url);

                if(DEBUG_MODE) {
                    urlCounter++;
                    if(urlCounter == maxURLs) break;
                }

            }
            // no explicit close() needed: try-with-resources closes br

        } catch (IOException ioe) {
            System.err.println("Exception attempting to read from seedURLsFile.");
            //ioe.printStackTrace();
            throw ioe;
        }
    }

    public boolean processURL(String nzDomainURL) {
        // launch wget on URL
        // when download done, recurse through downloaded dir
        // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
        // For each para, run MaoriTextDetector and store results in db
        // Remove download directory (or only remove if site doesn't contain Maori txt)?


        if(DEBUG_MODE) {
            nzDomainURL = "www.waikato.ac.nz";
        }
        log("Processing seed URL: " + nzDomainURL);

        //if(wgetURL(nzDomainURL)) {
        if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition

        processDownloadedDir(nzDomainURL);

        //}
        return true;
    }

    // wget will be launched from the specified directory, ../dwn-sites
    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
    public boolean wgetURL(String nzDomainURL) {
        // use SafeProcess and the wget command in lib/config.properties
        // DONE: set up logging


        String wgetCmd = configProps.getProperty("wget.cmd");
        if(wgetCmd == null || wgetCmd.equals("")) {
            System.err.println("Invalid or empty wget.cmd in config.properties");
            return false;
        }

        // replace the placeholder in the wget cmd with the seed url
        wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);

        log("Will launch wget with the command: " + wgetCmd);

        String[] wgetCommandArgs = wgetCmd.split(" ");
        SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir));

        SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
        SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);

        int success = wgetProcess.runProcess(outLineHandler, errLineHandler);

        if(success != 0) {
            System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
            return false;
            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
        }

        return true;
    }
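
    // The actual wget.cmd value lives in config.properties and is not shown in this file.
    // Purely as an illustrative sketch (a hypothetical value, not the project's real setting),
    // a recursive mirroring command containing the %%BASE_URL%% placeholder might look like:
    //
    //   wget.cmd=wget --recursive --no-parent --convert-links %%BASE_URL%%
    //
    // wgetURL() substitutes the seed URL for %%BASE_URL%% and runs the command from ../dwn-sites,
    // so wget's default host directory (e.g. www.waikato.ac.nz) ends up where
    // processDownloadedDir() expects to find it.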


    // ----------- inner class for SafeProcess to Run Wget ----------
    public class ProcessLineHandler extends SafeProcess.LineByLineHandler
    {
        //static Logger logger = Logger.getLogger(this.getClass().getName());;
        Logger logger;

        public ProcessLineHandler(Logger logger, int src)
        {
            super(src); // will set this.source to STDERR or STDOUT
            //logger = Logger.getLogger(this.getClass().getName());
            this.logger = logger;
        }

        public void gotLine(String line) { // called for each line read from the process stream

            // String form of this.source will be "stderr" or "stdout"
            String msg = SafeProcess.streamToString(source) + ": " + line;
            System.err.println(msg);
            //NZTLDProcessor.this.logger.info(msg);
            logger.info(msg);
        }
        public void gotException(Exception e) {
            String msg = "Error in reading process' " + SafeProcess.streamToString(source);
            //NZTLDProcessor.this.logger.error(msg, e);
            logger.error(msg, e);
        }

    }


    public boolean processDownloadedDir(String nzDomainURL) {
        // recurse through the downloaded directory, then process each file

        File downloadedSiteDir = new File(sitesDir, nzDomainURL);
        log("*** Will process download dir " + downloadedSiteDir);

        if(!downloadedSiteDir.exists()) {
            error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
            return false;
        }
        /*
        if(!downloadedSiteDir.isDirectory()) {
            error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
            return false; // TODO: or redo wget with "www." prefixed to URL???
        }
        */
        recursivelyProcessDir(nzDomainURL, downloadedSiteDir);

        debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);

        return true;
    }

    private void recursivelyProcessDir(String nzDomainURL, File file) {

        if(file.isDirectory()) { // recursive step
            // children array null iff IO Exception OR if file not a directory,
            // which it can't be since we tested isDirectory() on it just above
            File[] children = file.listFiles(extractableTxtFilesFilter);
            if(children == null) {
                error("IO error occurred when trying to list children of " + file);
            } else {
                // children array will be empty if 'file' dir was empty
                for(int i = 0; i < children.length; i++) {
                    recursivelyProcessDir(nzDomainURL, children[i]);
                }
            }

        } else { // base step
            processFile(nzDomainURL, file);
        }
    }

    /*
    public boolean processDownloadedDir(String nzDomainURL) {
        // recurse through the downloaded directory, then process each file

        File downloadedSiteDir = new File(sitesDir, nzDomainURL);
        if(!downloadedSiteDir.exists()) {
            error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
            return false;
        }

        File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
        if(files == null) {
            error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
            return false;
        }
        recursivelyProcessDir(nzDomainURL, files);

        return true;
    }

    private void recursivelyProcessDir(String nzDomainURL, File[] children) {
        for(int i = 0; i < files.length; i++) {
            if(files[i].isDirectory()) { // recursive step

                // children array will be empty if dir empty
                // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
                File[] children = files[i].listFiles(extractableTxtFilesFilter);
                if(children == null) {
                    error("IO error trying to list children of " + files[i]);
                } else {
                    recursivelyProcessDir(nzDomainURL, children);
                }
            }
            else { // base step
                processFile(nzDomainURL, files[i]);
            }
        }
    }*/

    public boolean processFile(String nzDomainURL, File file) {
        // skip if js, css, robots.txt
        // if no-extension or htm or html, call processHTMLFile()
        // else: skip for now, TODO: later call Tika on other file types

        // TODO: update db with url info, page location and para?


        log("*** Processing file " + file);

        return true;
    }
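
    // The comments in processFile() above describe the intended dispatch: skip js/css/robots.txt,
    // treat htm/html/extensionless files as HTML, and leave other file types to Tika later.
    // The helper below is only an illustrative sketch of that dispatch, not part of the original
    // class; the name looksLikeHTMLFile is hypothetical.
    private boolean looksLikeHTMLFile(File file) {
        String name = file.getName().toLowerCase();
        int dot = name.lastIndexOf('.');
        // files with no extension are assumed to contain html (e.g. many waikato uni pages)
        return dot == -1 || name.endsWith(".htm") || name.endsWith(".html");
    }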

    private class ExtractableTextFileFilter implements FilenameFilter {
        // skip if js, css, robots.txt

        // For now also skip the image files. Later, with Tika, may be able to extract text from
        // images through OCR of those imgs representing text? We accept files with no file extension,
        // e.g. waikato uni has lots of files without extension that contain html.
        // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
        public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tiff?)($|\\?.*)";
        //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);


        public boolean accept(File dir, String name) {

            return !(name.equals("robots.txt") || name.matches(unacceptableRegex));
            //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
            //return name.matches(".*\\.html$"); // works
            //return name.matches(".*\\.png($|\\?.*)"); // works
        }
    }


    public boolean processHTMLFile() {
        // Use JSoup to get paras

        // call processParas(text);

        return true;
    }
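
    // Sketch only: one way the JSoup step described in processHTMLFile() could look.
    // Assumptions not confirmed by this file: a jsoup jar being available under ../lib,
    // and the method name processHTMLFileSketch, which is hypothetical.
    private boolean processHTMLFileSketch(String nzDomainURL, File htmlFile) {
        try {
            // parse the downloaded page and hand each paragraph's text to processParas()
            org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(htmlFile, "UTF-8");
            for(org.jsoup.nodes.Element para : doc.select("p")) {
                processParas(para.text());
            }
            return true;
        } catch(IOException ioe) {
            error("Failed to parse html file " + htmlFile);
            return false;
        }
    }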

    public boolean processNonHTMLFile() {
        // Use Tika to get text

        // call processParas(text)

        return true;
    }
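
    // Sketch only: one way the Tika step described in processNonHTMLFile() could look.
    // Assumptions not confirmed by this file: the Apache Tika jars being on the classpath,
    // and the method name processNonHTMLFileSketch, which is hypothetical.
    private boolean processNonHTMLFileSketch(String nzDomainURL, File file) {
        try {
            // Tika's facade extracts plain text from pdf/doc/etc; pass it on to processParas()
            org.apache.tika.Tika tika = new org.apache.tika.Tika();
            String text = tika.parseToString(file);
            return processParas(text);
        } catch(Exception e) {
            error("Tika could not extract text from file " + file);
            return false;
        }
    }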

    public boolean processParas(String text) {

        // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?

        // for each para, call MaoriTextDetector.java

        return true;
    }
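
    // Sketch only: a minimal version of the paragraph splitting described in processParas(),
    // without the "merge short paras" refinement. MaoriTextDetector belongs to this extension
    // but its API is not shown in this file, so the maoriTextDetector field and its
    // isTextInMaori(String) method referenced below are assumptions used purely for illustration.
    private void processParasSketch(String text) {
        String[] paras = text.split("\\n\\s*\\n"); // split on blank lines between paragraphs
        for(String para : paras) {
            para = para.trim();
            if(para.isEmpty()) continue;
            // boolean isMaori = maoriTextDetector.isTextInMaori(para); // hypothetical call
            // TODO: store (url, file, para, isMaori) in the db as per processURL()'s comments
            debug("Para: " + para);
        }
    }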
    // --------------------------- End inner class --------------------

    public static void main(String[] args) {
        if(args.length <= 0) {
            System.err.println("ERROR: Must provide an input file of unique nz website addresses");
            System.exit(-1);
        }

        String infilePath = args[0];

        System.err.println("Requested to process URLs file: " + infilePath);

        File infile = new File(infilePath);
        if(!infile.exists()) {
            System.err.println("ERROR: File " + infilePath + " does not exist");
            System.exit(-1);
        }


        try {
            NZTLDProcessor processor = new NZTLDProcessor(infile);
        } catch(Exception e) {
            e.printStackTrace();
        }

    }
}