package org.greenstone.atea;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.

11 | /**
|
---|
12 | * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
|
---|
13 | * Then go into the src folder of this extension before compiling or running.
|
---|
14 | *
|
---|
15 | * Compile as:
|
---|
16 | * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
|
---|
17 | *
|
---|
18 | * Run as:
|
---|
19 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
|
---|
20 | */
|
---|
21 | public class NZTLDProcessor
|
---|
22 | {
|
---|
23 |
|
---|
24 | private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
|
---|
25 |
|
---|
26 | private Properties configProps = null;
|
---|
27 | private boolean DEBUG_MODE = true;
|
---|
28 |
|
---|
29 | private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
|
---|
30 | private final String sitesDir = "../dwn-sites";
|
---|
31 |
|
---|
32 | private void log(String msg) {
|
---|
33 | System.err.println(msg);
|
---|
34 | logger.info(msg);
|
---|
35 | }
|
---|
36 |
|
---|
37 | private void debug(String msg) {
|
---|
38 | System.err.println(msg);
|
---|
39 | logger.debug(msg);
|
---|
40 | }
|
---|
41 |
|
---|
42 | private void error(String msg) {
|
---|
43 | System.err.println("ERROR: " + msg);
|
---|
44 | logger.error(msg);
|
---|
45 | }
|
---|
46 |
|
---|
47 | private void warn(String msg) {
|
---|
48 | System.err.println("WARN: " + msg);
|
---|
49 | logger.warn(msg);
|
---|
50 | }
|
---|
51 |
|
---|
52 | public NZTLDProcessor(File seedURLsFile) throws Exception {
|
---|
53 | log("In NZTLDProcessor constructor");
|
---|
54 |
|
---|
55 | if(DEBUG_MODE) {
|
---|
56 | warn("**** Debugger is turned ON!!!");
|
---|
57 | }
|
---|
58 |
|
---|
59 | // TODO: why do I have to provide the path to config.props when this path is already on
|
---|
60 | // the classpath?
|
---|
61 | // In fact, I don't need it on the classpath for the following to work.
|
---|
62 | // How do I get it to work by specifying its path on the classpath and not here?
|
---|
63 | //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
|
---|
64 | //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
|
---|
65 | try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
|
---|
66 | configProps = new Properties();
|
---|
67 | configProps.load(infile);
|
---|
68 | infile.close();
|
---|
69 |
|
---|
70 | } catch(Exception e) {
|
---|
71 | System.err.println("Exception attempting to read properties from config.properties.");
|
---|
72 | //e.printStackTrace();
|
---|
73 | throw e;
|
---|
74 | }
|
---|
75 |
|
---|
76 |
|
---|
77 | try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
|
---|
78 | String url = null;
|
---|
79 |
|
---|
80 | int maxURLs = 1;
|
---|
81 | int urlCounter = 0;
|
---|
82 |
|
---|
83 | while((url = br.readLine()) != null) {
|
---|
84 |
|
---|
85 | log("Got URL: " + url);
|
---|
86 |
|
---|
87 | processURL(url);
|
---|
88 |
|
---|
89 | if(DEBUG_MODE) {
|
---|
90 | urlCounter++;
|
---|
91 | if(urlCounter == maxURLs) break;
|
---|
92 | }
|
---|
93 |
|
---|
94 | }
|
---|
95 |
|
---|
96 | br.close();
|
---|
97 |
|
---|
98 | } catch (IOException ioe) {
|
---|
99 | System.err.println("Exception attempting to read from seedURLsFile.");
|
---|
100 | //ioe.printStackTrace();
|
---|
101 | throw ioe;
|
---|
102 | }
|
---|
103 | }
|
---|
104 |
|
---|
105 | public boolean processURL(String nzDomainURL) {
|
---|
106 | // launch wget on URL
|
---|
107 | // when download done, recurse through downloaded dir
|
---|
108 | // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
|
---|
109 | // For each para, run MaoriTextDetector and store results in db
|
---|
110 | // Remove download directory (or only remove if site doesn't contain Maori txt)?
|
---|
111 |
|
---|
112 |
|
---|
113 | if(DEBUG_MODE) {
|
---|
114 | nzDomainURL = "www.waikato.ac.nz";
|
---|
115 | }
|
---|
116 | log("Processing seed URL: " + nzDomainURL);
|
---|
117 |
|
---|
118 | //if(wgetURL(nzDomainURL)) {
|
---|
119 | if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition
|
---|
120 |
|
---|
121 | processDownloadedDir(nzDomainURL);
|
---|
122 |
|
---|
123 | //}
|
---|
124 | return true;
|
---|
125 | }
|
---|
126 |
|
---|
127 | // wget will be launched from the specified directory, ../dwn_sites
|
---|
128 | // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
|
---|
129 | public boolean wgetURL(String nzDomainURL) {
|
---|
130 | // use SafeProcess and wget command in lib/config.properties
|
---|
131 | // DONE: set up logging
|
---|
132 |
|
---|
133 |
|
---|
134 | String wgetCmd = configProps.getProperty("wget.cmd");
|
---|
135 | if(wgetCmd == null || wgetCmd.equals("")) {
|
---|
136 | System.err.println("Invalid or empty wget.cmd in config.properties");
|
---|
137 | return false;
|
---|
138 | }
|
---|
139 |
|
---|
140 | // replace the placeholder in the wget cmd for the seed url
|
---|
141 | wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
|
---|
142 |
|
---|
143 | log("Will launch wget with the command: " + wgetCmd);
|
---|
144 |
|
---|
145 | String[] wgetCommandArgs = wgetCmd.split(" ");
|
---|
146 | SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir));
|
---|
147 |
|
---|
148 | SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
|
---|
149 | SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
|
---|
150 |
|
---|
151 | int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
|
---|
152 |
|
---|
153 | if(success != 0) {
|
---|
154 | System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
|
---|
155 | return false;
|
---|
156 | // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
|
---|
157 | }
|
---|
158 |
|
---|
159 | return true;
|
---|
160 | }
|
---|
161 |
|
---|
162 |
|
---|
163 | // ----------- inner class for SafeProcess to Run Wget ----------
|
---|
164 | public class ProcessLineHandler extends SafeProcess.LineByLineHandler
|
---|
165 | {
|
---|
166 | //static Logger logger = Logger.getLogger(this.getClass().getName());;
|
---|
167 | Logger logger;
|
---|
168 |
|
---|
169 | public ProcessLineHandler(Logger logger, int src)
|
---|
170 | {
|
---|
171 | super(src); // will set this.source to STDERR or STDOUT
|
---|
172 | //logger = Logger.getLogger(this.getClass().getName());
|
---|
173 | this.logger = logger;
|
---|
174 | }
|
---|
175 |
|
---|
176 | public void gotLine(String line) { // first non-null line
|
---|
177 |
|
---|
178 | // String form of this.source will be "stderr" or "stdout"
|
---|
179 | String msg = SafeProcess.streamToString(source) + ": " + line;
|
---|
180 | System.err.println(msg);
|
---|
181 | //NZTLDProcessor.this.logger.info(msg);
|
---|
182 | logger.info(msg);
|
---|
183 | }
|
---|
184 | public void gotException(Exception e) {
|
---|
185 | String msg = "Error in reading process' " + SafeProcess.streamToString(source);
|
---|
186 | //NZTLDProcessor.this.logger.error(msg, e);
|
---|
187 | logger.error(msg, e);
|
---|
188 | }
|
---|
189 |
|
---|
190 | }
|
---|
191 |
|
---|
192 |
|
---|
193 | public boolean processDownloadedDir(String nzDomainURL) {
|
---|
194 | // recurse through the downloaded directory, then process each file
|
---|
195 |
|
---|
196 | File downloadedSiteDir = new File(sitesDir, nzDomainURL);
|
---|
197 | log("*** Will process download dir " + downloadedSiteDir);
|
---|
198 |
|
---|
199 | if(!downloadedSiteDir.exists()) {
|
---|
200 | error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
|
---|
201 | return false;
|
---|
202 | }
|
---|
203 | /*
|
---|
204 | if(!downloadedSiteDir.isDirectory()) {
|
---|
205 | error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
|
---|
206 | return false; // TODO: or redo wget with "www." prefixed to URL???
|
---|
207 | }
|
---|
208 | */
|
---|
209 | recursivelyProcessDir(nzDomainURL, downloadedSiteDir);
|
---|
210 |
|
---|
211 | debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
|
---|
212 |
|
---|
213 | return true;
|
---|
214 | }
|
---|
215 |
|
---|
216 | private void recursivelyProcessDir(String nzDomainURL, File file) {
|
---|
217 |
|
---|
218 | if(file.isDirectory()) { // recursive step
|
---|
219 | // children array null iff IO Exception OR if file not a directory,
|
---|
220 | // which it can't be since we tested isDirectory() on it just above
|
---|
221 | File[] children = file.listFiles(extractableTxtFilesFilter);
|
---|
222 | if(children == null) {
|
---|
223 | error("IO error occurred when trying to list children of " + file);
|
---|
224 | } else {
|
---|
225 | // children array will be empty if 'file' dir was empty
|
---|
226 | for(int i = 0; i < children.length; i++) {
|
---|
227 | recursivelyProcessDir(nzDomainURL, children[i]);
|
---|
228 | }
|
---|
229 | }
|
---|
230 |
|
---|
231 | } else { // base step
|
---|
232 | processFile(nzDomainURL, file);
|
---|
233 | }
|
---|
234 | }
|
---|
235 |
|
---|
236 | /*
|
---|
237 | public boolean processDownloadedDir(String nzDomainURL) {
|
---|
238 | // recurse through the downloaded directory, then process each file
|
---|
239 |
|
---|
240 | File downloadedSiteDir = new File(sitesDir, nzDomainURL);
|
---|
241 | if(!downloadedSiteDir.exists()) {
|
---|
242 | error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
|
---|
243 | return false;
|
---|
244 | }
|
---|
245 |
|
---|
246 | File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
|
---|
247 | if(files == null) {
|
---|
248 | error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
|
---|
249 | return false;
|
---|
250 | }
|
---|
251 | recursivelyProcessDir(nzDomainURL, files);
|
---|
252 |
|
---|
253 | return true;
|
---|
254 | }
|
---|
255 |
|
---|
256 | private void recursivelyProcessDir(String nzDomainURL, File[] children) {
|
---|
257 | for(int i = 0; i < files.length; i++) {
|
---|
258 | if(files[i].isDirectory()) { // recursive step
|
---|
259 |
|
---|
260 | // children array will be empty if dir empty
|
---|
261 | // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
|
---|
262 | File[] children = files[i].listFiles(extractableTxtFilesFilter);
|
---|
263 | if(children == null) {
|
---|
264 | error("IO error trying to list children of " + files[i]);
|
---|
265 | } else {
|
---|
266 | recursivelyProcessDir(nzDomainURL, children);
|
---|
267 | }
|
---|
268 | }
|
---|
269 | else { // base step
|
---|
270 | processFile(nzDomainURL, files[i]);
|
---|
271 | }
|
---|
272 | }
|
---|
273 | }*/
|
---|
274 |
|
---|
275 | public boolean processFile(String nzDomainURL, File file) {
|
---|
276 | // skip if js, css, robots.txt
|
---|
277 | // if no-extension or htm or html, call processHTMLFile()
|
---|
278 | // else: skip for now, TODO: later call Tika on other file types
|
---|
279 |
|
---|
280 | // TODO: update db with url info, page location and para?
|
---|
281 |
|
---|
282 |
|
---|
283 | log("*** Processing file " + file);
|
---|
284 |
|
---|
285 | return true;
|
---|
286 | }
|
---|
287 |
|
---|
288 | private class ExtractableTextFileFilter implements FilenameFilter {
|
---|
289 | // skip if js, css, robots.txt
|
---|
290 |
|
---|
291 | // For now also skip the image files. Later, with Tika, may be able to extract text from
|
---|
292 | // images though OCR of those imgs representing text? We accept files with no file extension,
|
---|
293 | // e.g. waikato uni has lots of files without extension that contain html.
|
---|
294 | // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
|
---|
295 | public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)";
|
---|
296 | //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
|
---|
297 |
|
---|
298 |
|
---|
299 | public boolean accept(File dir, String name) {
|
---|
300 |
|
---|
301 | return !(name.matches("robots.txt") || name.matches(unacceptableRegex));
|
---|
302 | //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
|
---|
303 | //return name.matches(".*\\.html$"); // works
|
---|
304 | //return name.matches(".*\\.png($|\\?.*)"); // works
|
---|
305 | }
|
---|
306 | }
|
---|
307 |
|
---|
308 |
|
---|
309 | public boolean processHTMLFile() {
|
---|
310 | // Use JSoup to get paras
|
---|
311 |
|
---|
312 | // call processParas(text);
|
---|
313 |
|
---|
314 | return true;
|
---|
315 | }
|
---|
316 |
|
---|
317 | public boolean processNonHTMLFile() {
|
---|
318 | // Use Tika to get text
|
---|
319 |
|
---|
320 | // call processParas(text)
|
---|
321 |
|
---|
322 | return true;
|
---|
323 | }
|
---|
324 |
|
---|
325 | public boolean processParas(String text) {
|
---|
326 |
|
---|
327 | // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
|
---|
328 |
|
---|
329 | // for each para, call MaoriTextDetector.java
|
---|
330 |
|
---|
331 | return true;
|
---|
332 | }
|
---|
333 | // --------------------------- End inner class --------------------
|
---|
334 |
|
---|
335 | public static void main(String[] args) {
|
---|
336 | if(args.length <= 0) {
|
---|
337 | System.err.println("ERROR: Must provide input file of unique nz website addresses");
|
---|
338 | System.exit(-1);
|
---|
339 | }
|
---|
340 |
|
---|
341 | String infilePath = args[0];
|
---|
342 |
|
---|
343 | System.err.println("Requested to process URLS file: " + infilePath);
|
---|
344 |
|
---|
345 | File infile = new File(infilePath);
|
---|
346 | if(!infile.exists()) {
|
---|
347 | System.err.println("ERROR: File " + infilePath + " did not exist");
|
---|
348 | System.exit(-1);
|
---|
349 | }
|
---|
350 |
|
---|
351 |
|
---|
352 | try {
|
---|
353 | NZTLDProcessor processor = new NZTLDProcessor(infile);
|
---|
354 | } catch(Exception e) {
|
---|
355 | e.printStackTrace();
|
---|
356 | }
|
---|
357 |
|
---|
358 | }
|
---|
359 | }
|
---|
360 |
|
---|