source: other-projects/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33652

Last change on this file since 33652 was 33466, checked in by ak19, 5 years ago
  1. WETProcessor.main() now processes a folder of *.warc.wet(.gz) files. Each of a file's WET records is written out into an individual file and put into either the keep folder or the discard folder, based on the amount of content (number of lines and/or content-length). 2. Moved unzipFile() from NZTLDProcessor.java into the new Utility.java class as a static method.
File size: 15.3 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.Properties;
5import java.util.zip.GZIPInputStream;
6
7import org.apache.log4j.Logger;
8import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
9
10
11
12/**
13 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
14 * Then go into the src folder of this extension before compiling or running.
15 *
16 * Compile as:
17 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
18 *
19 * Run as:
20 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
21 */
22public class NZTLDProcessor
23{
24
25 private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
26
27 private Properties configProps = null;
28 private boolean DEBUG_MODE = true;
29
30 private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
31 private final String SITES_DIR = "../dwn-sites";
32 private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
33
34
35 private void log(String msg) {
36 System.err.println(msg);
37 logger.info(msg);
38 }
39
40 private void debug(String msg) {
41 System.err.println(msg);
42 logger.debug(msg);
43 }
44
45 private void error(String msg) {
46 System.err.println("### ERROR: " + msg);
47 logger.error(msg);
48 }
49
50 private void warn(String msg) {
51 System.err.println("*** WARN: " + msg);
52 logger.warn(msg);
53 }
54
55 public NZTLDProcessor(File seedURLsFile) throws Exception {
56 log("In NZTLDProcessor constructor");
57
58 if(DEBUG_MODE) {
59 warn("Debugger is turned ON!!!");
60 }
61
62 // TODO: why do I have to provide the path to config.props when this path is already on
63 // the classpath?
64 // In fact, I don't need it on the classpath for the following to work.
65 // How do I get it to work by specifying its path on the classpath and not here?
66 //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
67 //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
68 try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
69 configProps = new Properties();
70 configProps.load(infile);
71 infile.close();
72
73 } catch(Exception e) {
74 System.err.println("Exception attempting to read properties from config.properties.");
75 //e.printStackTrace();
76 throw e;
77 }
78
79
80 try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
81 String url = null;
82
83 int maxURLs = 1;
84 int urlCounter = 0;
85
86 while((url = br.readLine()) != null) {
87
88 log("Got URL: " + url);
89
90 // skip urls containing "/crawldiagnostics/" and "/robotstxt/"
91 if(url.indexOf("/robotstxt/") != -1) continue;
92 if(url.indexOf("/crawldiagnostics/") != -1) continue;
93
94 // convert the remaining WARC urls to WET urls
95 url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
96 url = url.replace(".warc.gz", ".warc.wet.gz");
97
98 // add the prefix
99 url = COMMONCRAWL_DATA_PREFIX + url;
100
101 log("Final WET URL: " + url);
102
103 processURL(url);
104
105 if(DEBUG_MODE) {
106 urlCounter++;
107 if(urlCounter == maxURLs) break;
108 }
109
110 }
111
112 br.close();
113
114 } catch (IOException ioe) {
115 System.err.println("Exception attempting to read from seedURLsFile.");
116 //ioe.printStackTrace();
117 throw ioe;
118 }
119 }
120
121 public boolean processURL(String ccWETfileURL) {
122 // launch wget on URL
123 // when download done, recurse through downloaded dir
124 // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
125 // For each para, run MaoriTextDetector and store results in db
126 // Remove download directory (or only remove if site doesn't contain Maori txt)?
127
128
129 if(DEBUG_MODE) {
130 ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
131 }
132 log("Processing WET file URL: " + ccWETfileURL);
133
134 /*
135 //if(wgetMirrorSite(ccWETfileURL)) {
136 if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
137
138 processDownloadedDir(ccWETfileURL);
139
140 //}
141 */
142
143
144 processWETfile(ccWETfileURL);
145
146
147 return true;
148 }
149
150 public boolean processWETfile(String ccWETfileURL) {
151 String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
152 String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
153
154 File inZipFile = new File(SITES_DIR, zippedWETfileName);
155 File WETfile = new File(SITES_DIR, unzippedWETfileName);
156
157 if(WETfile.exists()) {
158 log("Unzipped WET file " + WETfile + " already exists");
159 }
160 else {
161
162 if(inZipFile.exists()) {
163 log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
164 }
165 else if(!wgetWETfile(ccWETfileURL)) {
166 return false;
167 }
168
169 // don't have the WET file yet. Get it from the zip file, which we know we should have by now
170
171 boolean success = Utility.unzipFile(inZipFile, WETfile);
172 log("Unzipped " + inZipFile + " to " + WETfile);
173
174 // whether we succeeded or not, get rid of the zipped file:
175 if(!inZipFile.delete()) {
176 warn("Unable to delete zipped WET file: " + zippedWETfileName);
177 }
178
179 if(!success) {
180 return false;
181 }
182 }
183
184 // read in the giant WET file and
185
186 return true;
187 }
188
189 /*
190 // Run gunzip
191 // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
192 // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
193 public boolean unzipFile(File inZipFile, File outFile) {
194
195
196 byte[] buffer = new byte[1024];
197
198 // try-with-resources will safely close streams/dispose resources on success or error and exceptions
199 try (
200 GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
201 FileOutputStream out = new FileOutputStream(outFile);
202 ) {
203 int len;
204 while ((len = gzis.read(buffer)) > 0) {
205 out.write(buffer, 0, len);
206 }
207
208 //gzis.close();
209 //out.close();
210
211 log("Unzipped " + inZipFile + " to " + outFile);
212
213 } catch(IOException ex) {
214 error("Failed to unzip " + inZipFile);
215 ex.printStackTrace();
216 return false;
217 }
218
219 return true;
220 }
221 */
222
223 // wget will be launched from the specified directory, SITES_DIR
224 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
225 public boolean wgetWETfile(String ccWETfileURL) {
226 // use SafeProcess and wget command in lib/config.properties
227 // DONE: set up logging
228
229 String wgetCmd = configProps.getProperty("wget.file.cmd");
230 if(wgetCmd == null || wgetCmd.equals("")) {
231 System.err.println("Invalid or empty wget.file.cmd in config.properties");
232 return false;
233 }
234
235 // replace the placeholder in the wget cmd for the seed url
236 wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
237
238 log("Will launch wget with the command: " + wgetCmd);
239
240 String[] wgetCommandArgs = wgetCmd.split(" ");
241 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
242
243 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
244 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
245
246 int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
247
248 if(success != 0) {
249 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
250 return false;
251 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
252 }
253
254 return true;
255 }
256
257
258 // wget will be launched from the specified directory, SITES_DIR
259 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
260 public boolean wgetMirrorSite(String nzDomainURL) {
261 // use SafeProcess and wget command in lib/config.properties
262 // DONE: set up logging
263
264
265 String wgetCmd = configProps.getProperty("wget.mirror.cmd");
266 if(wgetCmd == null || wgetCmd.equals("")) {
267 System.err.println("Invalid or empty wget.mirror.cmd in config.properties");
268 return false;
269 }
270
271 // replace the placeholder in the wget cmd for the seed url
272 wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
273
274 log("Will launch wget with the command: " + wgetCmd);
275
276 String[] wgetCommandArgs = wgetCmd.split(" ");
277 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
278
279 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
280 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
281
282 int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
283
284 if(success != 0) {
285 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
286 return false;
287 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
288 }
289
290 return true;
291 }
292
293
294 // ----------- inner class for SafeProcess to Run Wget ----------
295 public class ProcessLineHandler extends SafeProcess.LineByLineHandler
296 {
297 //static Logger logger = Logger.getLogger(this.getClass().getName());;
298 Logger logger;
299
300 public ProcessLineHandler(Logger logger, int src)
301 {
302 super(src); // will set this.source to STDERR or STDOUT
303 //logger = Logger.getLogger(this.getClass().getName());
304 this.logger = logger;
305 }
306
307 public void gotLine(String line) { // first non-null line
308
309 // String form of this.source will be "stderr" or "stdout"
310 String msg = SafeProcess.streamToString(source) + ": " + line;
311 System.err.println(msg);
312 //NZTLDProcessor.this.logger.info(msg);
313 logger.info(msg);
314 }
315 public void gotException(Exception e) {
316 String msg = "Error in reading process' " + SafeProcess.streamToString(source);
317 //NZTLDProcessor.this.logger.error(msg, e);
318 logger.error(msg, e);
319 }
320
321 }
322
323
324 public boolean processDownloadedDir(String ccWETfileURL) {
325 // recurse through the downloaded directory, then process each file
326
327 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
328 log("*** Will process download dir " + downloadedSiteDir);
329
330 if(!downloadedSiteDir.exists()) {
331 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
332 return false;
333 }
334 /*
335 if(!downloadedSiteDir.isDirectory()) {
336 error("Downloaded site " + downloadedSiteDir + " is not a directory!");
337 return false; // TODO: or redo wget with "www." prefixed to URL???
338 }
339 */
340 recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
341
342 debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
343
344 return true;
345 }
346
347 private void recursivelyProcessDir(String ccWETfileURL, File file) {
348
349 if(file.isDirectory()) { // recursive step
350 // children array null iff IO Exception OR if file not a directory,
351 // which it can't be since we tested isDirectory() on it just above
352 File[] children = file.listFiles(extractableTxtFilesFilter);
353 if(children == null) {
354 error("IO error occurred when trying to list children of " + file);
355 } else {
356 // children array will be empty if 'file' dir was empty
357 for(int i = 0; i < children.length; i++) {
358 recursivelyProcessDir(ccWETfileURL, children[i]);
359 }
360 }
361
362 } else { // base step
363 processFile(ccWETfileURL, file);
364 }
365 }
366
367 /*
368 public boolean processDownloadedDir(String ccWETfileURL) {
369 // recurse through the downloaded directory, then process each file
370
371 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
372 if(!downloadedSiteDir.exists()) {
373 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
374 return false;
375 }
376
377 File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
378 if(files == null) {
379 error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
380 return false;
381 }
382 recursivelyProcessDir(ccWETfileURL, files);
383
384 return true;
385 }
386
387 private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
388 for(int i = 0; i < files.length; i++) {
389 if(files[i].isDirectory()) { // recursive step
390
391 // children array will be empty if dir empty
392 // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
393 File[] children = files[i].listFiles(extractableTxtFilesFilter);
394 if(children == null) {
395 error("IO error trying to list children of " + files[i]);
396 } else {
397 recursivelyProcessDir(ccWETfileURL, children);
398 }
399 }
400 else { // base step
401 processFile(ccWETfileURL, files[i]);
402 }
403 }
404 }*/
405
406 public boolean processFile(String ccWETfileURL, File file) {
407 // skip if js, css, robots.txt
408 // if no-extension or htm or html, call processHTMLFile()
409 // else: skip for now, TODO: later call Tika on other file types
410
411 // TODO: update db with url info, page location and para?
412
413
414 log("*** Processing file " + file);
415
416 return true;
417 }
418
419 private class ExtractableTextFileFilter implements FilenameFilter {
420 // skip if js, css, robots.txt
421
422 // For now also skip the image files. Later, with Tika, may be able to extract text from
423 // images though OCR of those imgs representing text? We accept files with no file extension,
424 // e.g. waikato uni has lots of files without extension that contain html.
425 // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
426 public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)";
427 //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
428
429
430 public boolean accept(File dir, String name) {
431
432 return !(name.matches("robots.txt") || name.matches(unacceptableRegex));
433 //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
434 //return name.matches(".*\\.html$"); // works
435 //return name.matches(".*\\.png($|\\?.*)"); // works
436 }
437 }
438
439
440 public boolean processHTMLFile() {
441 // Use JSoup to get paras
442
443 // call processParas(text);
444
445 return true;
446 }
447
448 public boolean processNonHTMLFile() {
449 // Use Tika to get text
450
451 // call processParas(text)
452
453 return true;
454 }
455
456 public boolean processParas(String text) {
457
458 // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
459
460 // for each para, call MaoriTextDetector.java
461
462 return true;
463 }
464 // --------------------------- End inner class --------------------
465
466 public static void main(String[] args) {
467 if(args.length <= 0) {
468 System.err.println("ERROR: Must provide input file of unique nz website addresses");
469 System.exit(-1);
470 }
471
472 String infilePath = args[0];
473
474 System.err.println("Requested to process URLS file: " + infilePath);
475
476 File infile = new File(infilePath);
477 if(!infile.exists()) {
478 System.err.println("ERROR: File " + infilePath + " did not exist");
479 System.exit(-1);
480 }
481
482
483 try {
484 NZTLDProcessor processor = new NZTLDProcessor(infile);
485 } catch(Exception e) {
486 e.printStackTrace();
487 }
488
489 }
490}
491
Note: See TracBrowser for help on using the repository browser.