Context Navigation

NZTLDProcessor.java@ 33652

Last change on this file since 33652 was 33466, checked in by ak19, 5 years ago
1. WETProcessor.main() now processes a folder of *.warc.wet(.gz) files. Each of a file's WET records is written out into an individual file and put into either the keep folder or the discard folder, based on the amount of content (number of lines and/or content-length). 2. Moved unzipFile() from NZTLDProcessor.java into the new Utility.java class as a static method.
File size: 15.3 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.Properties;
5	import java.util.zip.GZIPInputStream;
6
7	import org.apache.log4j.Logger;
8	import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
9
10
11
12	/**
13	* Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
14	* Then go into the src folder of this extension before compiling or running.
15	*
16	* Compile as:
17	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
18	*
19	* Run as:
20	* maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
21	*/
22	public class NZTLDProcessor
23	{
24
25	private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
26
27	private Properties configProps = null;
28	private boolean DEBUG_MODE = true;
29
30	private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
31	private final String SITES_DIR = "../dwn-sites";
32	private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
33
34
35	private void log(String msg) {
36	System.err.println(msg);
37	logger.info(msg);
38	}
39
40	private void debug(String msg) {
41	System.err.println(msg);
42	logger.debug(msg);
43	}
44
45	private void error(String msg) {
46	System.err.println("### ERROR: " + msg);
47	logger.error(msg);
48	}
49
50	private void warn(String msg) {
51	System.err.println("*** WARN: " + msg);
52	logger.warn(msg);
53	}
54
55	public NZTLDProcessor(File seedURLsFile) throws Exception {
56	log("In NZTLDProcessor constructor");
57
58	if(DEBUG_MODE) {
59	warn("Debugger is turned ON!!!");
60	}
61
62	// TODO: why do I have to provide the path to config.props when this path is already on
63	// the classpath?
64	// In fact, I don't need it on the classpath for the following to work.
65	// How do I get it to work by specifying its path on the classpath and not here?
66	//try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
67	//try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
68	try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
69	configProps = new Properties();
70	configProps.load(infile);
71	infile.close();
72
73	} catch(Exception e) {
74	System.err.println("Exception attempting to read properties from config.properties.");
75	//e.printStackTrace();
76	throw e;
77	}
78
79
80	try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
81	String url = null;
82
83	int maxURLs = 1;
84	int urlCounter = 0;
85
86	while((url = br.readLine()) != null) {
87
88	log("Got URL: " + url);
89
90	// skip urls containing "/crawldiagnostics/" and "/robotstxt/"
91	if(url.indexOf("/robotstxt/") != -1) continue;
92	if(url.indexOf("/crawldiagnostics/") != -1) continue;
93
94	// convert the remaining WARC urls to WET urls
95	url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
96	url = url.replace(".warc.gz", ".warc.wet.gz");
97
98	// add the prefix
99	url = COMMONCRAWL_DATA_PREFIX + url;
100
101	log("Final WET URL: " + url);
102
103	processURL(url);
104
105	if(DEBUG_MODE) {
106	urlCounter++;
107	if(urlCounter == maxURLs) break;
108	}
109
110	}
111
112	br.close();
113
114	} catch (IOException ioe) {
115	System.err.println("Exception attempting to read from seedURLsFile.");
116	//ioe.printStackTrace();
117	throw ioe;
118	}
119	}
120
121	public boolean processURL(String ccWETfileURL) {
122	// launch wget on URL
123	// when download done, recurse through downloaded dir
124	// for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
125	// For each para, run MaoriTextDetector and store results in db
126	// Remove download directory (or only remove if site doesn't contain Maori txt)?
127
128
129	if(DEBUG_MODE) {
130	ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
131	}
132	log("Processing WET file URL: " + ccWETfileURL);
133
134	/*
135	//if(wgetMirrorSite(ccWETfileURL)) {
136	if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
137
138	processDownloadedDir(ccWETfileURL);
139
140	//}
141	*/
142
143
144	processWETfile(ccWETfileURL);
145
146
147	return true;
148	}
149
150	public boolean processWETfile(String ccWETfileURL) {
151	String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
152	String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
153
154	File inZipFile = new File(SITES_DIR, zippedWETfileName);
155	File WETfile = new File(SITES_DIR, unzippedWETfileName);
156
157	if(WETfile.exists()) {
158	log("Unzipped WET file " + WETfile + " already exists");
159	}
160	else {
161
162	if(inZipFile.exists()) {
163	log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
164	}
165	else if(!wgetWETfile(ccWETfileURL)) {
166	return false;
167	}
168
169	// don't have the WET file yet. Get it from the zip file, which we know we should have by now
170
171	boolean success = Utility.unzipFile(inZipFile, WETfile);
172	log("Unzipped " + inZipFile + " to " + WETfile);
173
174	// whether we succeeded or not, get rid of the zipped file:
175	if(!inZipFile.delete()) {
176	warn("Unable to delete zipped WET file: " + zippedWETfileName);
177	}
178
179	if(!success) {
180	return false;
181	}
182	}
183
184	// read in the giant WET file and
185
186	return true;
187	}
188
189	/*
190	// Run gunzip
191	// To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
192	// https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
193	public boolean unzipFile(File inZipFile, File outFile) {
194
195
196	byte[] buffer = new byte[1024];
197
198	// try-with-resources will safely close streams/dispose resources on success or error and exceptions
199	try (
200	GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
201	FileOutputStream out = new FileOutputStream(outFile);
202	) {
203	int len;
204	while ((len = gzis.read(buffer)) > 0) {
205	out.write(buffer, 0, len);
206	}
207
208	//gzis.close();
209	//out.close();
210
211	log("Unzipped " + inZipFile + " to " + outFile);
212
213	} catch(IOException ex) {
214	error("Failed to unzip " + inZipFile);
215	ex.printStackTrace();
216	return false;
217	}
218
219	return true;
220	}
221	*/
222
223	// wget will be launched from the specified directory, SITES_DIR
224	// Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
225	public boolean wgetWETfile(String ccWETfileURL) {
226	// use SafeProcess and wget command in lib/config.properties
227	// DONE: set up logging
228
229	String wgetCmd = configProps.getProperty("wget.file.cmd");
230	if(wgetCmd == null \|\| wgetCmd.equals("")) {
231	System.err.println("Invalid or empty wget.file.cmd in config.properties");
232	return false;
233	}
234
235	// replace the placeholder in the wget cmd for the seed url
236	wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
237
238	log("Will launch wget with the command: " + wgetCmd);
239
240	String[] wgetCommandArgs = wgetCmd.split(" ");
241	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
242
243	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
244	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
245
246	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
247
248	if(success != 0) {
249	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
250	return false;
251	// TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
252	}
253
254	return true;
255	}
256
257
258	// wget will be launched from the specified directory, SITES_DIR
259	// Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
260	public boolean wgetMirrorSite(String nzDomainURL) {
261	// use SafeProcess and wget command in lib/config.properties
262	// DONE: set up logging
263
264
265	String wgetCmd = configProps.getProperty("wget.mirror.cmd");
266	if(wgetCmd == null \|\| wgetCmd.equals("")) {
267	System.err.println("Invalid or empty wget.mirror.cmd in config.properties");
268	return false;
269	}
270
271	// replace the placeholder in the wget cmd for the seed url
272	wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
273
274	log("Will launch wget with the command: " + wgetCmd);
275
276	String[] wgetCommandArgs = wgetCmd.split(" ");
277	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
278
279	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
280	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
281
282	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
283
284	if(success != 0) {
285	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
286	return false;
287	// TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
288	}
289
290	return true;
291	}
292
293
294	// ----------- inner class for SafeProcess to Run Wget ----------
295	public class ProcessLineHandler extends SafeProcess.LineByLineHandler
296	{
297	//static Logger logger = Logger.getLogger(this.getClass().getName());;
298	Logger logger;
299
300	public ProcessLineHandler(Logger logger, int src)
301	{
302	super(src); // will set this.source to STDERR or STDOUT
303	//logger = Logger.getLogger(this.getClass().getName());
304	this.logger = logger;
305	}
306
307	public void gotLine(String line) { // first non-null line
308
309	// String form of this.source will be "stderr" or "stdout"
310	String msg = SafeProcess.streamToString(source) + ": " + line;
311	System.err.println(msg);
312	//NZTLDProcessor.this.logger.info(msg);
313	logger.info(msg);
314	}
315	public void gotException(Exception e) {
316	String msg = "Error in reading process' " + SafeProcess.streamToString(source);
317	//NZTLDProcessor.this.logger.error(msg, e);
318	logger.error(msg, e);
319	}
320
321	}
322
323
324	public boolean processDownloadedDir(String ccWETfileURL) {
325	// recurse through the downloaded directory, then process each file
326
327	File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
328	log("*** Will process download dir " + downloadedSiteDir);
329
330	if(!downloadedSiteDir.exists()) {
331	error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
332	return false;
333	}
334	/*
335	if(!downloadedSiteDir.isDirectory()) {
336	error("Downloaded site " + downloadedSiteDir + " is not a directory!");
337	return false; // TODO: or redo wget with "www." prefixed to URL???
338	}
339	*/
340	recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
341
342	debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
343
344	return true;
345	}
346
347	private void recursivelyProcessDir(String ccWETfileURL, File file) {
348
349	if(file.isDirectory()) { // recursive step
350	// children array null iff IO Exception OR if file not a directory,
351	// which it can't be since we tested isDirectory() on it just above
352	File[] children = file.listFiles(extractableTxtFilesFilter);
353	if(children == null) {
354	error("IO error occurred when trying to list children of " + file);
355	} else {
356	// children array will be empty if 'file' dir was empty
357	for(int i = 0; i < children.length; i++) {
358	recursivelyProcessDir(ccWETfileURL, children[i]);
359	}
360	}
361
362	} else { // base step
363	processFile(ccWETfileURL, file);
364	}
365	}
366
367	/*
368	public boolean processDownloadedDir(String ccWETfileURL) {
369	// recurse through the downloaded directory, then process each file
370
371	File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
372	if(!downloadedSiteDir.exists()) {
373	error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
374	return false;
375	}
376
377	File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
378	if(files == null) {
379	error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
380	return false;
381	}
382	recursivelyProcessDir(ccWETfileURL, files);
383
384	return true;
385	}
386
387	private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
388	for(int i = 0; i < files.length; i++) {
389	if(files[i].isDirectory()) { // recursive step
390
391	// children array will be empty if dir empty
392	// children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
393	File[] children = files[i].listFiles(extractableTxtFilesFilter);
394	if(children == null) {
395	error("IO error trying to list children of " + files[i]);
396	} else {
397	recursivelyProcessDir(ccWETfileURL, children);
398	}
399	}
400	else { // base step
401	processFile(ccWETfileURL, files[i]);
402	}
403	}
404	}*/
405
406	public boolean processFile(String ccWETfileURL, File file) {
407	// skip if js, css, robots.txt
408	// if no-extension or htm or html, call processHTMLFile()
409	// else: skip for now, TODO: later call Tika on other file types
410
411	// TODO: update db with url info, page location and para?
412
413
414	log("*** Processing file " + file);
415
416	return true;
417	}
418
419	private class ExtractableTextFileFilter implements FilenameFilter {
420	// skip if js, css, robots.txt
421
422	// For now also skip the image files. Later, with Tika, may be able to extract text from
423	// images though OCR of those imgs representing text? We accept files with no file extension,
424	// e.g. waikato uni has lots of files without extension that contain html.
425	// Examples: Files like .png but also .png?v=QE50XMk2oY should be rejected
426	public static final String unacceptableRegex = ".\\.(json\|js\|css\|svg\|ico\|jpe?g\|png\|gif\|tif?)($\|\\?.)";
427	//Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
428
429
430	public boolean accept(File dir, String name) {
431
432	return !(name.matches("robots.txt") \|\| name.matches(unacceptableRegex));
433	//return !(name.matches("robots.txt") \|\| name.matches("/\\.png/")); // doesn't work
434	//return name.matches(".*\\.html$"); // works
435	//return name.matches(".\\.png($\|\\?.)"); // works
436	}
437	}
438
439
440	public boolean processHTMLFile() {
441	// Use JSoup to get paras
442
443	// call processParas(text);
444
445	return true;
446	}
447
448	public boolean processNonHTMLFile() {
449	// Use Tika to get text
450
451	// call processParas(text)
452
453	return true;
454	}
455
456	public boolean processParas(String text) {
457
458	// Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
459
460	// for each para, call MaoriTextDetector.java
461
462	return true;
463	}
464	// --------------------------- End inner class --------------------
465
466	public static void main(String[] args) {
467	if(args.length <= 0) {
468	System.err.println("ERROR: Must provide input file of unique nz website addresses");
469	System.exit(-1);
470	}
471
472	String infilePath = args[0];
473
474	System.err.println("Requested to process URLS file: " + infilePath);
475
476	File infile = new File(infilePath);
477	if(!infile.exists()) {
478	System.err.println("ERROR: File " + infilePath + " did not exist");
479	System.exit(-1);
480	}
481
482
483	try {
484	NZTLDProcessor processor = new NZTLDProcessor(infile);
485	} catch(Exception e) {
486	e.printStackTrace();
487	}
488
489	}
490	}
491

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33652

Download in other formats: