source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33411

Last change on this file since 33411 was 33411, checked in by ak19, 5 years ago

Newer version now doesn't mirror sites with wget but gets WET files and unzips them instead. Didn't yet get to the part where I can process the actual contents inside each 400MB WET file.

File size: 15.2 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.Properties;
5import java.util.zip.GZIPInputStream;
6
7import org.apache.log4j.Logger;
8import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
9
10
11
12/**
13 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
14 * Then go into the src folder of this extension before compiling or running.
15 *
16 * Compile as:
17 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
18 *
19 * Run as:
20 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
21 */
22public class NZTLDProcessor
23{
24
    // log4j logger for this class; the log/debug/error/warn helpers below mirror
    // every message to stderr as well.
    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());

    // Loaded in the constructor from config.properties on the classpath; holds the
    // wget command templates (wget.file.cmd, wget.mirror.cmd).
    private Properties configProps = null;
    // When true: the constructor stops after 1 seed URL and processURL() substitutes
    // a fixed, known WET file URL instead of the one passed in.
    private boolean DEBUG_MODE = true;

    // Filter used when recursing through a downloaded site dir: rejects js/css/image
    // files and robots.txt (see ExtractableTextFileFilter below).
    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
    // Working directory wget downloads into and where WET files are unzipped.
    private final String SITES_DIR = "../dwn-sites";
    // Prefix turning a CommonCrawl WARC/WET path (as listed in the seed URLs file)
    // into a full download URL.
    private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
34
35 private void log(String msg) {
36 System.err.println(msg);
37 logger.info(msg);
38 }
39
40 private void debug(String msg) {
41 System.err.println(msg);
42 logger.debug(msg);
43 }
44
45 private void error(String msg) {
46 System.err.println("### ERROR: " + msg);
47 logger.error(msg);
48 }
49
50 private void warn(String msg) {
51 System.err.println("*** WARN: " + msg);
52 logger.warn(msg);
53 }
54
55 public NZTLDProcessor(File seedURLsFile) throws Exception {
56 log("In NZTLDProcessor constructor");
57
58 if(DEBUG_MODE) {
59 warn("Debugger is turned ON!!!");
60 }
61
62 // TODO: why do I have to provide the path to config.props when this path is already on
63 // the classpath?
64 // In fact, I don't need it on the classpath for the following to work.
65 // How do I get it to work by specifying its path on the classpath and not here?
66 //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
67 //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
68 try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
69 configProps = new Properties();
70 configProps.load(infile);
71 infile.close();
72
73 } catch(Exception e) {
74 System.err.println("Exception attempting to read properties from config.properties.");
75 //e.printStackTrace();
76 throw e;
77 }
78
79
80 try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
81 String url = null;
82
83 int maxURLs = 1;
84 int urlCounter = 0;
85
86 while((url = br.readLine()) != null) {
87
88 log("Got URL: " + url);
89
90 // skip urls containing "/crawldiagnostics/" and "/robotstxt/"
91 if(url.indexOf("/robotstxt/") != -1) continue;
92 if(url.indexOf("/crawldiagnostics/") != -1) continue;
93
94 // convert the remaining WARC urls to WET urls
95 url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
96 url = url.replace(".warc.gz", ".warc.wet.gz");
97
98 // add the prefix
99 url = COMMONCRAWL_DATA_PREFIX + url;
100
101 log("Final WET URL: " + url);
102
103 processURL(url);
104
105 if(DEBUG_MODE) {
106 urlCounter++;
107 if(urlCounter == maxURLs) break;
108 }
109
110 }
111
112 br.close();
113
114 } catch (IOException ioe) {
115 System.err.println("Exception attempting to read from seedURLsFile.");
116 //ioe.printStackTrace();
117 throw ioe;
118 }
119 }
120
121 public boolean processURL(String ccWETfileURL) {
122 // launch wget on URL
123 // when download done, recurse through downloaded dir
124 // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
125 // For each para, run MaoriTextDetector and store results in db
126 // Remove download directory (or only remove if site doesn't contain Maori txt)?
127
128
129 if(DEBUG_MODE) {
130 ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
131 }
132 log("Processing WET file URL: " + ccWETfileURL);
133
134 /*
135 //if(wgetMirrorSite(ccWETfileURL)) {
136 if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
137
138 processDownloadedDir(ccWETfileURL);
139
140 //}
141 */
142
143
144 processWETfile(ccWETfileURL);
145
146
147 return true;
148 }
149
150 public boolean processWETfile(String ccWETfileURL) {
151 String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
152 String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
153
154 File inZipFile = new File(SITES_DIR, zippedWETfileName);
155 File WETfile = new File(SITES_DIR, unzippedWETfileName);
156
157 if(WETfile.exists()) {
158 log("Unzipped WET file " + WETfile + " already exists");
159 }
160 else {
161
162 if(inZipFile.exists()) {
163 log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
164 }
165 else if(!wgetWETfile(ccWETfileURL)) {
166 return false;
167 }
168
169 // don't have the WET file yet. Get it from the zip file, which we know we should have by now
170
171 boolean success = unzipFile(inZipFile, WETfile);
172 // whether we succeeded or not, get rid of the zipped file:
173 if(!inZipFile.delete()) {
174 warn("Unable to delete zipped WET file: " + zippedWETfileName);
175 }
176
177 if(!success) {
178 return false;
179 }
180 }
181
182 // read in the giant WET file and
183
184 return true;
185 }
186
187 // Run gunzip
188 // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
189 // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
190 public boolean unzipFile(File inZipFile, File outFile) {
191
192
193 byte[] buffer = new byte[1024];
194
195 // try-with-resources will safely close streams/dispose resources on success or error and exceptions
196 try (
197 GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
198 FileOutputStream out = new FileOutputStream(outFile);
199 ) {
200 int len;
201 while ((len = gzis.read(buffer)) > 0) {
202 out.write(buffer, 0, len);
203 }
204
205 //gzis.close();
206 //out.close();
207
208 log("Unzipped " + inZipFile + " to " + outFile);
209
210 } catch(IOException ex) {
211 error("Failed to unzip " + inZipFile);
212 ex.printStackTrace();
213 return false;
214 }
215
216 return true;
217 }
218
219 // wget will be launched from the specified directory, SITES_DIR
220 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
221 public boolean wgetWETfile(String ccWETfileURL) {
222 // use SafeProcess and wget command in lib/config.properties
223 // DONE: set up logging
224
225 String wgetCmd = configProps.getProperty("wget.file.cmd");
226 if(wgetCmd == null || wgetCmd.equals("")) {
227 System.err.println("Invalid or empty wget.file.cmd in config.properties");
228 return false;
229 }
230
231 // replace the placeholder in the wget cmd for the seed url
232 wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
233
234 log("Will launch wget with the command: " + wgetCmd);
235
236 String[] wgetCommandArgs = wgetCmd.split(" ");
237 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
238
239 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
240 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
241
242 int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
243
244 if(success != 0) {
245 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
246 return false;
247 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
248 }
249
250 return true;
251 }
252
253
254 // wget will be launched from the specified directory, SITES_DIR
255 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
256 public boolean wgetMirrorSite(String nzDomainURL) {
257 // use SafeProcess and wget command in lib/config.properties
258 // DONE: set up logging
259
260
261 String wgetCmd = configProps.getProperty("wget.mirror.cmd");
262 if(wgetCmd == null || wgetCmd.equals("")) {
263 System.err.println("Invalid or empty wget.mirror.cmd in config.properties");
264 return false;
265 }
266
267 // replace the placeholder in the wget cmd for the seed url
268 wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
269
270 log("Will launch wget with the command: " + wgetCmd);
271
272 String[] wgetCommandArgs = wgetCmd.split(" ");
273 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
274
275 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
276 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
277
278 int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
279
280 if(success != 0) {
281 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
282 return false;
283 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
284 }
285
286 return true;
287 }
288
289
290 // ----------- inner class for SafeProcess to Run Wget ----------
291 public class ProcessLineHandler extends SafeProcess.LineByLineHandler
292 {
293 //static Logger logger = Logger.getLogger(this.getClass().getName());;
294 Logger logger;
295
296 public ProcessLineHandler(Logger logger, int src)
297 {
298 super(src); // will set this.source to STDERR or STDOUT
299 //logger = Logger.getLogger(this.getClass().getName());
300 this.logger = logger;
301 }
302
303 public void gotLine(String line) { // first non-null line
304
305 // String form of this.source will be "stderr" or "stdout"
306 String msg = SafeProcess.streamToString(source) + ": " + line;
307 System.err.println(msg);
308 //NZTLDProcessor.this.logger.info(msg);
309 logger.info(msg);
310 }
311 public void gotException(Exception e) {
312 String msg = "Error in reading process' " + SafeProcess.streamToString(source);
313 //NZTLDProcessor.this.logger.error(msg, e);
314 logger.error(msg, e);
315 }
316
317 }
318
319
320 public boolean processDownloadedDir(String ccWETfileURL) {
321 // recurse through the downloaded directory, then process each file
322
323 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
324 log("*** Will process download dir " + downloadedSiteDir);
325
326 if(!downloadedSiteDir.exists()) {
327 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
328 return false;
329 }
330 /*
331 if(!downloadedSiteDir.isDirectory()) {
332 error("Downloaded site " + downloadedSiteDir + " is not a directory!");
333 return false; // TODO: or redo wget with "www." prefixed to URL???
334 }
335 */
336 recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
337
338 debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
339
340 return true;
341 }
342
343 private void recursivelyProcessDir(String ccWETfileURL, File file) {
344
345 if(file.isDirectory()) { // recursive step
346 // children array null iff IO Exception OR if file not a directory,
347 // which it can't be since we tested isDirectory() on it just above
348 File[] children = file.listFiles(extractableTxtFilesFilter);
349 if(children == null) {
350 error("IO error occurred when trying to list children of " + file);
351 } else {
352 // children array will be empty if 'file' dir was empty
353 for(int i = 0; i < children.length; i++) {
354 recursivelyProcessDir(ccWETfileURL, children[i]);
355 }
356 }
357
358 } else { // base step
359 processFile(ccWETfileURL, file);
360 }
361 }
362
363 /*
364 public boolean processDownloadedDir(String ccWETfileURL) {
365 // recurse through the downloaded directory, then process each file
366
367 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
368 if(!downloadedSiteDir.exists()) {
369 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
370 return false;
371 }
372
373 File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
374 if(files == null) {
375 error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
376 return false;
377 }
378 recursivelyProcessDir(ccWETfileURL, files);
379
380 return true;
381 }
382
383 private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
384 for(int i = 0; i < files.length; i++) {
385 if(files[i].isDirectory()) { // recursive step
386
387 // children array will be empty if dir empty
388 // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
389 File[] children = files[i].listFiles(extractableTxtFilesFilter);
390 if(children == null) {
391 error("IO error trying to list children of " + files[i]);
392 } else {
393 recursivelyProcessDir(ccWETfileURL, children);
394 }
395 }
396 else { // base step
397 processFile(ccWETfileURL, files[i]);
398 }
399 }
400 }*/
401
402 public boolean processFile(String ccWETfileURL, File file) {
403 // skip if js, css, robots.txt
404 // if no-extension or htm or html, call processHTMLFile()
405 // else: skip for now, TODO: later call Tika on other file types
406
407 // TODO: update db with url info, page location and para?
408
409
410 log("*** Processing file " + file);
411
412 return true;
413 }
414
415 private class ExtractableTextFileFilter implements FilenameFilter {
416 // skip if js, css, robots.txt
417
418 // For now also skip the image files. Later, with Tika, may be able to extract text from
419 // images though OCR of those imgs representing text? We accept files with no file extension,
420 // e.g. waikato uni has lots of files without extension that contain html.
421 // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
422 public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)";
423 //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
424
425
426 public boolean accept(File dir, String name) {
427
428 return !(name.matches("robots.txt") || name.matches(unacceptableRegex));
429 //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
430 //return name.matches(".*\\.html$"); // works
431 //return name.matches(".*\\.png($|\\?.*)"); // works
432 }
433 }
434
435
436 public boolean processHTMLFile() {
437 // Use JSoup to get paras
438
439 // call processParas(text);
440
441 return true;
442 }
443
444 public boolean processNonHTMLFile() {
445 // Use Tika to get text
446
447 // call processParas(text)
448
449 return true;
450 }
451
452 public boolean processParas(String text) {
453
454 // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
455
456 // for each para, call MaoriTextDetector.java
457
458 return true;
459 }
    // --------------------------- End processing stubs --------------------
    // (Note: the inner classes actually end further above, at ExtractableTextFileFilter.)
461
462 public static void main(String[] args) {
463 if(args.length <= 0) {
464 System.err.println("ERROR: Must provide input file of unique nz website addresses");
465 System.exit(-1);
466 }
467
468 String infilePath = args[0];
469
470 System.err.println("Requested to process URLS file: " + infilePath);
471
472 File infile = new File(infilePath);
473 if(!infile.exists()) {
474 System.err.println("ERROR: File " + infilePath + " did not exist");
475 System.exit(-1);
476 }
477
478
479 try {
480 NZTLDProcessor processor = new NZTLDProcessor(infile);
481 } catch(Exception e) {
482 e.printStackTrace();
483 }
484
485 }
486}
487
Note: See TracBrowser for help on using the repository browser.