1 | package org.greenstone.atea;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.util.Properties;
|
---|
5 | import java.util.zip.GZIPInputStream;
|
---|
6 |
|
---|
7 | import org.apache.log4j.Logger;
|
---|
8 | import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
|
---|
9 |
|
---|
10 |
|
---|
11 |
|
---|
12 | /**
|
---|
13 | * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
|
---|
14 | * Then go into the src folder of this extension before compiling or running.
|
---|
15 | *
|
---|
16 | * Compile as:
|
---|
17 | * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
|
---|
18 | *
|
---|
19 | * Run as:
|
---|
20 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
|
---|
21 | */
|
---|
22 | public class NZTLDProcessor
|
---|
23 | {
|
---|
24 |
|
---|
25 | private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
|
---|
26 |
|
---|
27 | private Properties configProps = null;
|
---|
28 | private boolean DEBUG_MODE = true;
|
---|
29 |
|
---|
30 | private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
|
---|
31 | private final String SITES_DIR = "../dwn-sites";
|
---|
32 | private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
|
---|
33 |
|
---|
34 |
|
---|
35 | private void log(String msg) {
|
---|
36 | System.err.println(msg);
|
---|
37 | logger.info(msg);
|
---|
38 | }
|
---|
39 |
|
---|
40 | private void debug(String msg) {
|
---|
41 | System.err.println(msg);
|
---|
42 | logger.debug(msg);
|
---|
43 | }
|
---|
44 |
|
---|
45 | private void error(String msg) {
|
---|
46 | System.err.println("### ERROR: " + msg);
|
---|
47 | logger.error(msg);
|
---|
48 | }
|
---|
49 |
|
---|
50 | private void warn(String msg) {
|
---|
51 | System.err.println("*** WARN: " + msg);
|
---|
52 | logger.warn(msg);
|
---|
53 | }
|
---|
54 |
|
---|
55 | public NZTLDProcessor(File seedURLsFile) throws Exception {
|
---|
56 | log("In NZTLDProcessor constructor");
|
---|
57 |
|
---|
58 | if(DEBUG_MODE) {
|
---|
59 | warn("Debugger is turned ON!!!");
|
---|
60 | }
|
---|
61 |
|
---|
62 | // TODO: why do I have to provide the path to config.props when this path is already on
|
---|
63 | // the classpath?
|
---|
64 | // In fact, I don't need it on the classpath for the following to work.
|
---|
65 | // How do I get it to work by specifying its path on the classpath and not here?
|
---|
66 | //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
|
---|
67 | //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
|
---|
68 | try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
|
---|
69 | configProps = new Properties();
|
---|
70 | configProps.load(infile);
|
---|
71 | infile.close();
|
---|
72 |
|
---|
73 | } catch(Exception e) {
|
---|
74 | System.err.println("Exception attempting to read properties from config.properties.");
|
---|
75 | //e.printStackTrace();
|
---|
76 | throw e;
|
---|
77 | }
|
---|
78 |
|
---|
79 |
|
---|
80 | try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
|
---|
81 | String url = null;
|
---|
82 |
|
---|
83 | int maxURLs = 1;
|
---|
84 | int urlCounter = 0;
|
---|
85 |
|
---|
86 | while((url = br.readLine()) != null) {
|
---|
87 |
|
---|
88 | log("Got URL: " + url);
|
---|
89 |
|
---|
90 | // skip urls containing "/crawldiagnostics/" and "/robotstxt/"
|
---|
91 | if(url.indexOf("/robotstxt/") != -1) continue;
|
---|
92 | if(url.indexOf("/crawldiagnostics/") != -1) continue;
|
---|
93 |
|
---|
94 | // convert the remaining WARC urls to WET urls
|
---|
95 | url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
|
---|
96 | url = url.replace(".warc.gz", ".warc.wet.gz");
|
---|
97 |
|
---|
98 | // add the prefix
|
---|
99 | url = COMMONCRAWL_DATA_PREFIX + url;
|
---|
100 |
|
---|
101 | log("Final WET URL: " + url);
|
---|
102 |
|
---|
103 | processURL(url);
|
---|
104 |
|
---|
105 | if(DEBUG_MODE) {
|
---|
106 | urlCounter++;
|
---|
107 | if(urlCounter == maxURLs) break;
|
---|
108 | }
|
---|
109 |
|
---|
110 | }
|
---|
111 |
|
---|
112 | br.close();
|
---|
113 |
|
---|
114 | } catch (IOException ioe) {
|
---|
115 | System.err.println("Exception attempting to read from seedURLsFile.");
|
---|
116 | //ioe.printStackTrace();
|
---|
117 | throw ioe;
|
---|
118 | }
|
---|
119 | }
|
---|
120 |
|
---|
121 | public boolean processURL(String ccWETfileURL) {
|
---|
122 | // launch wget on URL
|
---|
123 | // when download done, recurse through downloaded dir
|
---|
124 | // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
|
---|
125 | // For each para, run MaoriTextDetector and store results in db
|
---|
126 | // Remove download directory (or only remove if site doesn't contain Maori txt)?
|
---|
127 |
|
---|
128 |
|
---|
129 | if(DEBUG_MODE) {
|
---|
130 | ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
|
---|
131 | }
|
---|
132 | log("Processing WET file URL: " + ccWETfileURL);
|
---|
133 |
|
---|
134 | /*
|
---|
135 | //if(wgetMirrorSite(ccWETfileURL)) {
|
---|
136 | if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
|
---|
137 |
|
---|
138 | processDownloadedDir(ccWETfileURL);
|
---|
139 |
|
---|
140 | //}
|
---|
141 | */
|
---|
142 |
|
---|
143 |
|
---|
144 | processWETfile(ccWETfileURL);
|
---|
145 |
|
---|
146 |
|
---|
147 | return true;
|
---|
148 | }
|
---|
149 |
|
---|
150 | public boolean processWETfile(String ccWETfileURL) {
|
---|
151 | String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
|
---|
152 | String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
|
---|
153 |
|
---|
154 | File inZipFile = new File(SITES_DIR, zippedWETfileName);
|
---|
155 | File WETfile = new File(SITES_DIR, unzippedWETfileName);
|
---|
156 |
|
---|
157 | if(WETfile.exists()) {
|
---|
158 | log("Unzipped WET file " + WETfile + " already exists");
|
---|
159 | }
|
---|
160 | else {
|
---|
161 |
|
---|
162 | if(inZipFile.exists()) {
|
---|
163 | log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
|
---|
164 | }
|
---|
165 | else if(!wgetWETfile(ccWETfileURL)) {
|
---|
166 | return false;
|
---|
167 | }
|
---|
168 |
|
---|
169 | // don't have the WET file yet. Get it from the zip file, which we know we should have by now
|
---|
170 |
|
---|
171 | boolean success = unzipFile(inZipFile, WETfile);
|
---|
172 | // whether we succeeded or not, get rid of the zipped file:
|
---|
173 | if(!inZipFile.delete()) {
|
---|
174 | warn("Unable to delete zipped WET file: " + zippedWETfileName);
|
---|
175 | }
|
---|
176 |
|
---|
177 | if(!success) {
|
---|
178 | return false;
|
---|
179 | }
|
---|
180 | }
|
---|
181 |
|
---|
182 | // read in the giant WET file and
|
---|
183 |
|
---|
184 | return true;
|
---|
185 | }
|
---|
186 |
|
---|
187 | // Run gunzip
|
---|
188 | // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
|
---|
189 | // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
|
---|
190 | public boolean unzipFile(File inZipFile, File outFile) {
|
---|
191 |
|
---|
192 |
|
---|
193 | byte[] buffer = new byte[1024];
|
---|
194 |
|
---|
195 | // try-with-resources will safely close streams/dispose resources on success or error and exceptions
|
---|
196 | try (
|
---|
197 | GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
|
---|
198 | FileOutputStream out = new FileOutputStream(outFile);
|
---|
199 | ) {
|
---|
200 | int len;
|
---|
201 | while ((len = gzis.read(buffer)) > 0) {
|
---|
202 | out.write(buffer, 0, len);
|
---|
203 | }
|
---|
204 |
|
---|
205 | //gzis.close();
|
---|
206 | //out.close();
|
---|
207 |
|
---|
208 | log("Unzipped " + inZipFile + " to " + outFile);
|
---|
209 |
|
---|
210 | } catch(IOException ex) {
|
---|
211 | error("Failed to unzip " + inZipFile);
|
---|
212 | ex.printStackTrace();
|
---|
213 | return false;
|
---|
214 | }
|
---|
215 |
|
---|
216 | return true;
|
---|
217 | }
|
---|
218 |
|
---|
219 | // wget will be launched from the specified directory, SITES_DIR
|
---|
220 | // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
|
---|
221 | public boolean wgetWETfile(String ccWETfileURL) {
|
---|
222 | // use SafeProcess and wget command in lib/config.properties
|
---|
223 | // DONE: set up logging
|
---|
224 |
|
---|
225 | String wgetCmd = configProps.getProperty("wget.file.cmd");
|
---|
226 | if(wgetCmd == null || wgetCmd.equals("")) {
|
---|
227 | System.err.println("Invalid or empty wget.file.cmd in config.properties");
|
---|
228 | return false;
|
---|
229 | }
|
---|
230 |
|
---|
231 | // replace the placeholder in the wget cmd for the seed url
|
---|
232 | wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
|
---|
233 |
|
---|
234 | log("Will launch wget with the command: " + wgetCmd);
|
---|
235 |
|
---|
236 | String[] wgetCommandArgs = wgetCmd.split(" ");
|
---|
237 | SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
|
---|
238 |
|
---|
239 | SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
|
---|
240 | SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
|
---|
241 |
|
---|
242 | int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
|
---|
243 |
|
---|
244 | if(success != 0) {
|
---|
245 | System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
|
---|
246 | return false;
|
---|
247 | // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
|
---|
248 | }
|
---|
249 |
|
---|
250 | return true;
|
---|
251 | }
|
---|
252 |
|
---|
253 |
|
---|
254 | // wget will be launched from the specified directory, SITES_DIR
|
---|
255 | // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
|
---|
256 | public boolean wgetMirrorSite(String nzDomainURL) {
|
---|
257 | // use SafeProcess and wget command in lib/config.properties
|
---|
258 | // DONE: set up logging
|
---|
259 |
|
---|
260 |
|
---|
261 | String wgetCmd = configProps.getProperty("wget.mirror.cmd");
|
---|
262 | if(wgetCmd == null || wgetCmd.equals("")) {
|
---|
263 | System.err.println("Invalid or empty wget.mirror.cmd in config.properties");
|
---|
264 | return false;
|
---|
265 | }
|
---|
266 |
|
---|
267 | // replace the placeholder in the wget cmd for the seed url
|
---|
268 | wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
|
---|
269 |
|
---|
270 | log("Will launch wget with the command: " + wgetCmd);
|
---|
271 |
|
---|
272 | String[] wgetCommandArgs = wgetCmd.split(" ");
|
---|
273 | SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
|
---|
274 |
|
---|
275 | SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
|
---|
276 | SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
|
---|
277 |
|
---|
278 | int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
|
---|
279 |
|
---|
280 | if(success != 0) {
|
---|
281 | System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
|
---|
282 | return false;
|
---|
283 | // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
|
---|
284 | }
|
---|
285 |
|
---|
286 | return true;
|
---|
287 | }
|
---|
288 |
|
---|
289 |
|
---|
290 | // ----------- inner class for SafeProcess to Run Wget ----------
|
---|
291 | public class ProcessLineHandler extends SafeProcess.LineByLineHandler
|
---|
292 | {
|
---|
293 | //static Logger logger = Logger.getLogger(this.getClass().getName());;
|
---|
294 | Logger logger;
|
---|
295 |
|
---|
296 | public ProcessLineHandler(Logger logger, int src)
|
---|
297 | {
|
---|
298 | super(src); // will set this.source to STDERR or STDOUT
|
---|
299 | //logger = Logger.getLogger(this.getClass().getName());
|
---|
300 | this.logger = logger;
|
---|
301 | }
|
---|
302 |
|
---|
303 | public void gotLine(String line) { // first non-null line
|
---|
304 |
|
---|
305 | // String form of this.source will be "stderr" or "stdout"
|
---|
306 | String msg = SafeProcess.streamToString(source) + ": " + line;
|
---|
307 | System.err.println(msg);
|
---|
308 | //NZTLDProcessor.this.logger.info(msg);
|
---|
309 | logger.info(msg);
|
---|
310 | }
|
---|
311 | public void gotException(Exception e) {
|
---|
312 | String msg = "Error in reading process' " + SafeProcess.streamToString(source);
|
---|
313 | //NZTLDProcessor.this.logger.error(msg, e);
|
---|
314 | logger.error(msg, e);
|
---|
315 | }
|
---|
316 |
|
---|
317 | }
|
---|
318 |
|
---|
319 |
|
---|
320 | public boolean processDownloadedDir(String ccWETfileURL) {
|
---|
321 | // recurse through the downloaded directory, then process each file
|
---|
322 |
|
---|
323 | File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
|
---|
324 | log("*** Will process download dir " + downloadedSiteDir);
|
---|
325 |
|
---|
326 | if(!downloadedSiteDir.exists()) {
|
---|
327 | error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
|
---|
328 | return false;
|
---|
329 | }
|
---|
330 | /*
|
---|
331 | if(!downloadedSiteDir.isDirectory()) {
|
---|
332 | error("Downloaded site " + downloadedSiteDir + " is not a directory!");
|
---|
333 | return false; // TODO: or redo wget with "www." prefixed to URL???
|
---|
334 | }
|
---|
335 | */
|
---|
336 | recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
|
---|
337 |
|
---|
338 | debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
|
---|
339 |
|
---|
340 | return true;
|
---|
341 | }
|
---|
342 |
|
---|
343 | private void recursivelyProcessDir(String ccWETfileURL, File file) {
|
---|
344 |
|
---|
345 | if(file.isDirectory()) { // recursive step
|
---|
346 | // children array null iff IO Exception OR if file not a directory,
|
---|
347 | // which it can't be since we tested isDirectory() on it just above
|
---|
348 | File[] children = file.listFiles(extractableTxtFilesFilter);
|
---|
349 | if(children == null) {
|
---|
350 | error("IO error occurred when trying to list children of " + file);
|
---|
351 | } else {
|
---|
352 | // children array will be empty if 'file' dir was empty
|
---|
353 | for(int i = 0; i < children.length; i++) {
|
---|
354 | recursivelyProcessDir(ccWETfileURL, children[i]);
|
---|
355 | }
|
---|
356 | }
|
---|
357 |
|
---|
358 | } else { // base step
|
---|
359 | processFile(ccWETfileURL, file);
|
---|
360 | }
|
---|
361 | }
|
---|
362 |
|
---|
363 | /*
|
---|
364 | public boolean processDownloadedDir(String ccWETfileURL) {
|
---|
365 | // recurse through the downloaded directory, then process each file
|
---|
366 |
|
---|
367 | File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
|
---|
368 | if(!downloadedSiteDir.exists()) {
|
---|
369 | error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
|
---|
370 | return false;
|
---|
371 | }
|
---|
372 |
|
---|
373 | File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
|
---|
374 | if(files == null) {
|
---|
375 | error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
|
---|
376 | return false;
|
---|
377 | }
|
---|
378 | recursivelyProcessDir(ccWETfileURL, files);
|
---|
379 |
|
---|
380 | return true;
|
---|
381 | }
|
---|
382 |
|
---|
383 | private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
|
---|
384 | for(int i = 0; i < files.length; i++) {
|
---|
385 | if(files[i].isDirectory()) { // recursive step
|
---|
386 |
|
---|
387 | // children array will be empty if dir empty
|
---|
388 | // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
|
---|
389 | File[] children = files[i].listFiles(extractableTxtFilesFilter);
|
---|
390 | if(children == null) {
|
---|
391 | error("IO error trying to list children of " + files[i]);
|
---|
392 | } else {
|
---|
393 | recursivelyProcessDir(ccWETfileURL, children);
|
---|
394 | }
|
---|
395 | }
|
---|
396 | else { // base step
|
---|
397 | processFile(ccWETfileURL, files[i]);
|
---|
398 | }
|
---|
399 | }
|
---|
400 | }*/
|
---|
401 |
|
---|
402 | public boolean processFile(String ccWETfileURL, File file) {
|
---|
403 | // skip if js, css, robots.txt
|
---|
404 | // if no-extension or htm or html, call processHTMLFile()
|
---|
405 | // else: skip for now, TODO: later call Tika on other file types
|
---|
406 |
|
---|
407 | // TODO: update db with url info, page location and para?
|
---|
408 |
|
---|
409 |
|
---|
410 | log("*** Processing file " + file);
|
---|
411 |
|
---|
412 | return true;
|
---|
413 | }
|
---|
414 |
|
---|
415 | private class ExtractableTextFileFilter implements FilenameFilter {
|
---|
416 | // skip if js, css, robots.txt
|
---|
417 |
|
---|
418 | // For now also skip the image files. Later, with Tika, may be able to extract text from
|
---|
419 | // images though OCR of those imgs representing text? We accept files with no file extension,
|
---|
420 | // e.g. waikato uni has lots of files without extension that contain html.
|
---|
421 | // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
|
---|
422 | public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)";
|
---|
423 | //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
|
---|
424 |
|
---|
425 |
|
---|
426 | public boolean accept(File dir, String name) {
|
---|
427 |
|
---|
428 | return !(name.matches("robots.txt") || name.matches(unacceptableRegex));
|
---|
429 | //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
|
---|
430 | //return name.matches(".*\\.html$"); // works
|
---|
431 | //return name.matches(".*\\.png($|\\?.*)"); // works
|
---|
432 | }
|
---|
433 | }
|
---|
434 |
|
---|
435 |
|
---|
436 | public boolean processHTMLFile() {
|
---|
437 | // Use JSoup to get paras
|
---|
438 |
|
---|
439 | // call processParas(text);
|
---|
440 |
|
---|
441 | return true;
|
---|
442 | }
|
---|
443 |
|
---|
444 | public boolean processNonHTMLFile() {
|
---|
445 | // Use Tika to get text
|
---|
446 |
|
---|
447 | // call processParas(text)
|
---|
448 |
|
---|
449 | return true;
|
---|
450 | }
|
---|
451 |
|
---|
452 | public boolean processParas(String text) {
|
---|
453 |
|
---|
454 | // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
|
---|
455 |
|
---|
456 | // for each para, call MaoriTextDetector.java
|
---|
457 |
|
---|
458 | return true;
|
---|
459 | }
|
---|
460 | // --------------------------- End inner class --------------------
|
---|
461 |
|
---|
462 | public static void main(String[] args) {
|
---|
463 | if(args.length <= 0) {
|
---|
464 | System.err.println("ERROR: Must provide input file of unique nz website addresses");
|
---|
465 | System.exit(-1);
|
---|
466 | }
|
---|
467 |
|
---|
468 | String infilePath = args[0];
|
---|
469 |
|
---|
470 | System.err.println("Requested to process URLS file: " + infilePath);
|
---|
471 |
|
---|
472 | File infile = new File(infilePath);
|
---|
473 | if(!infile.exists()) {
|
---|
474 | System.err.println("ERROR: File " + infilePath + " did not exist");
|
---|
475 | System.exit(-1);
|
---|
476 | }
|
---|
477 |
|
---|
478 |
|
---|
479 | try {
|
---|
480 | NZTLDProcessor processor = new NZTLDProcessor(infile);
|
---|
481 | } catch(Exception e) {
|
---|
482 | e.printStackTrace();
|
---|
483 | }
|
---|
484 |
|
---|
485 | }
|
---|
486 | }
|
---|
487 |
|
---|