Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33411

Last change on this file since 33411 was 33411, checked in by ak19, 5 years ago
Newer version now doesn't mirror sites with wget but gets WET files and unzips them instead. Didn't yet get to the part where I can process the actual contents inside each 400MB WET file.
File size: 15.2 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.Properties;
5	import java.util.zip.GZIPInputStream;
6
7	import org.apache.log4j.Logger;
8	import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
9
10
11
12	/**
13	* Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
14	* Then go into the src folder of this extension before compiling or running.
15	*
16	* Compile as:
17	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
18	*
19	* Run as:
20	* maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
21	*/
22	public class NZTLDProcessor
23	{
24
25	private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
26
27	private Properties configProps = null;
28	private boolean DEBUG_MODE = true;
29
30	private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
31	private final String SITES_DIR = "../dwn-sites";
32	private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
33
34
35	private void log(String msg) {
36	System.err.println(msg);
37	logger.info(msg);
38	}
39
40	private void debug(String msg) {
41	System.err.println(msg);
42	logger.debug(msg);
43	}
44
45	private void error(String msg) {
46	System.err.println("### ERROR: " + msg);
47	logger.error(msg);
48	}
49
50	private void warn(String msg) {
51	System.err.println("*** WARN: " + msg);
52	logger.warn(msg);
53	}
54
55	public NZTLDProcessor(File seedURLsFile) throws Exception {
56	log("In NZTLDProcessor constructor");
57
58	if(DEBUG_MODE) {
59	warn("Debugger is turned ON!!!");
60	}
61
62	// TODO: why do I have to provide the path to config.props when this path is already on
63	// the classpath?
64	// In fact, I don't need it on the classpath for the following to work.
65	// How do I get it to work by specifying its path on the classpath and not here?
66	//try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
67	//try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
68	try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
69	configProps = new Properties();
70	configProps.load(infile);
71	infile.close();
72
73	} catch(Exception e) {
74	System.err.println("Exception attempting to read properties from config.properties.");
75	//e.printStackTrace();
76	throw e;
77	}
78
79
80	try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
81	String url = null;
82
83	int maxURLs = 1;
84	int urlCounter = 0;
85
86	while((url = br.readLine()) != null) {
87
88	log("Got URL: " + url);
89
90	// skip urls containing "/crawldiagnostics/" and "/robotstxt/"
91	if(url.indexOf("/robotstxt/") != -1) continue;
92	if(url.indexOf("/crawldiagnostics/") != -1) continue;
93
94	// convert the remaining WARC urls to WET urls
95	url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
96	url = url.replace(".warc.gz", ".warc.wet.gz");
97
98	// add the prefix
99	url = COMMONCRAWL_DATA_PREFIX + url;
100
101	log("Final WET URL: " + url);
102
103	processURL(url);
104
105	if(DEBUG_MODE) {
106	urlCounter++;
107	if(urlCounter == maxURLs) break;
108	}
109
110	}
111
112	br.close();
113
114	} catch (IOException ioe) {
115	System.err.println("Exception attempting to read from seedURLsFile.");
116	//ioe.printStackTrace();
117	throw ioe;
118	}
119	}
120
121	public boolean processURL(String ccWETfileURL) {
122	// launch wget on URL
123	// when download done, recurse through downloaded dir
124	// for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
125	// For each para, run MaoriTextDetector and store results in db
126	// Remove download directory (or only remove if site doesn't contain Maori txt)?
127
128
129	if(DEBUG_MODE) {
130	ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
131	}
132	log("Processing WET file URL: " + ccWETfileURL);
133
134	/*
135	//if(wgetMirrorSite(ccWETfileURL)) {
136	if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
137
138	processDownloadedDir(ccWETfileURL);
139
140	//}
141	*/
142
143
144	processWETfile(ccWETfileURL);
145
146
147	return true;
148	}
149
150	public boolean processWETfile(String ccWETfileURL) {
151	String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
152	String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
153
154	File inZipFile = new File(SITES_DIR, zippedWETfileName);
155	File WETfile = new File(SITES_DIR, unzippedWETfileName);
156
157	if(WETfile.exists()) {
158	log("Unzipped WET file " + WETfile + " already exists");
159	}
160	else {
161
162	if(inZipFile.exists()) {
163	log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
164	}
165	else if(!wgetWETfile(ccWETfileURL)) {
166	return false;
167	}
168
169	// don't have the WET file yet. Get it from the zip file, which we know we should have by now
170
171	boolean success = unzipFile(inZipFile, WETfile);
172	// whether we succeeded or not, get rid of the zipped file:
173	if(!inZipFile.delete()) {
174	warn("Unable to delete zipped WET file: " + zippedWETfileName);
175	}
176
177	if(!success) {
178	return false;
179	}
180	}
181
182	// read in the giant WET file and
183
184	return true;
185	}
186
187	// Run gunzip
188	// To avoid making this linux specific, use Java to unzip, instead of running gunzip as process
189	// https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
190	public boolean unzipFile(File inZipFile, File outFile) {
191
192
193	byte[] buffer = new byte[1024];
194
195	// try-with-resources will safely close streams/dispose resources on success or error and exceptions
196	try (
197	GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
198	FileOutputStream out = new FileOutputStream(outFile);
199	) {
200	int len;
201	while ((len = gzis.read(buffer)) > 0) {
202	out.write(buffer, 0, len);
203	}
204
205	//gzis.close();
206	//out.close();
207
208	log("Unzipped " + inZipFile + " to " + outFile);
209
210	} catch(IOException ex) {
211	error("Failed to unzip " + inZipFile);
212	ex.printStackTrace();
213	return false;
214	}
215
216	return true;
217	}
218
219	// wget will be launched from the specified directory, SITES_DIR
220	// Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
221	public boolean wgetWETfile(String ccWETfileURL) {
222	// use SafeProcess and wget command in lib/config.properties
223	// DONE: set up logging
224
225	String wgetCmd = configProps.getProperty("wget.file.cmd");
226	if(wgetCmd == null \|\| wgetCmd.equals("")) {
227	System.err.println("Invalid or empty wget.file.cmd in config.properties");
228	return false;
229	}
230
231	// replace the placeholder in the wget cmd for the seed url
232	wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
233
234	log("Will launch wget with the command: " + wgetCmd);
235
236	String[] wgetCommandArgs = wgetCmd.split(" ");
237	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
238
239	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
240	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
241
242	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
243
244	if(success != 0) {
245	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
246	return false;
247	// TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
248	}
249
250	return true;
251	}
252
253
254	// wget will be launched from the specified directory, SITES_DIR
255	// Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
256	public boolean wgetMirrorSite(String nzDomainURL) {
257	// use SafeProcess and wget command in lib/config.properties
258	// DONE: set up logging
259
260
261	String wgetCmd = configProps.getProperty("wget.mirror.cmd");
262	if(wgetCmd == null \|\| wgetCmd.equals("")) {
263	System.err.println("Invalid or empty wget.mirror.cmd in config.properties");
264	return false;
265	}
266
267	// replace the placeholder in the wget cmd for the seed url
268	wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
269
270	log("Will launch wget with the command: " + wgetCmd);
271
272	String[] wgetCommandArgs = wgetCmd.split(" ");
273	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
274
275	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
276	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
277
278	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
279
280	if(success != 0) {
281	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
282	return false;
283	// TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
284	}
285
286	return true;
287	}
288
289
290	// ----------- inner class for SafeProcess to Run Wget ----------
291	public class ProcessLineHandler extends SafeProcess.LineByLineHandler
292	{
293	//static Logger logger = Logger.getLogger(this.getClass().getName());;
294	Logger logger;
295
296	public ProcessLineHandler(Logger logger, int src)
297	{
298	super(src); // will set this.source to STDERR or STDOUT
299	//logger = Logger.getLogger(this.getClass().getName());
300	this.logger = logger;
301	}
302
303	public void gotLine(String line) { // first non-null line
304
305	// String form of this.source will be "stderr" or "stdout"
306	String msg = SafeProcess.streamToString(source) + ": " + line;
307	System.err.println(msg);
308	//NZTLDProcessor.this.logger.info(msg);
309	logger.info(msg);
310	}
311	public void gotException(Exception e) {
312	String msg = "Error in reading process' " + SafeProcess.streamToString(source);
313	//NZTLDProcessor.this.logger.error(msg, e);
314	logger.error(msg, e);
315	}
316
317	}
318
319
320	public boolean processDownloadedDir(String ccWETfileURL) {
321	// recurse through the downloaded directory, then process each file
322
323	File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
324	log("*** Will process download dir " + downloadedSiteDir);
325
326	if(!downloadedSiteDir.exists()) {
327	error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
328	return false;
329	}
330	/*
331	if(!downloadedSiteDir.isDirectory()) {
332	error("Downloaded site " + downloadedSiteDir + " is not a directory!");
333	return false; // TODO: or redo wget with "www." prefixed to URL???
334	}
335	*/
336	recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
337
338	debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
339
340	return true;
341	}
342
343	private void recursivelyProcessDir(String ccWETfileURL, File file) {
344
345	if(file.isDirectory()) { // recursive step
346	// children array null iff IO Exception OR if file not a directory,
347	// which it can't be since we tested isDirectory() on it just above
348	File[] children = file.listFiles(extractableTxtFilesFilter);
349	if(children == null) {
350	error("IO error occurred when trying to list children of " + file);
351	} else {
352	// children array will be empty if 'file' dir was empty
353	for(int i = 0; i < children.length; i++) {
354	recursivelyProcessDir(ccWETfileURL, children[i]);
355	}
356	}
357
358	} else { // base step
359	processFile(ccWETfileURL, file);
360	}
361	}
362
363	/*
364	public boolean processDownloadedDir(String ccWETfileURL) {
365	// recurse through the downloaded directory, then process each file
366
367	File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
368	if(!downloadedSiteDir.exists()) {
369	error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
370	return false;
371	}
372
373	File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
374	if(files == null) {
375	error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
376	return false;
377	}
378	recursivelyProcessDir(ccWETfileURL, files);
379
380	return true;
381	}
382
383	private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
384	for(int i = 0; i < files.length; i++) {
385	if(files[i].isDirectory()) { // recursive step
386
387	// children array will be empty if dir empty
388	// children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
389	File[] children = files[i].listFiles(extractableTxtFilesFilter);
390	if(children == null) {
391	error("IO error trying to list children of " + files[i]);
392	} else {
393	recursivelyProcessDir(ccWETfileURL, children);
394	}
395	}
396	else { // base step
397	processFile(ccWETfileURL, files[i]);
398	}
399	}
400	}*/
401
402	public boolean processFile(String ccWETfileURL, File file) {
403	// skip if js, css, robots.txt
404	// if no-extension or htm or html, call processHTMLFile()
405	// else: skip for now, TODO: later call Tika on other file types
406
407	// TODO: update db with url info, page location and para?
408
409
410	log("*** Processing file " + file);
411
412	return true;
413	}
414
415	private class ExtractableTextFileFilter implements FilenameFilter {
416	// skip if js, css, robots.txt
417
418	// For now also skip the image files. Later, with Tika, may be able to extract text from
419	// images though OCR of those imgs representing text? We accept files with no file extension,
420	// e.g. waikato uni has lots of files without extension that contain html.
421	// Examples: Files like .png but also .png?v=QE50XMk2oY should be rejected
422	public static final String unacceptableRegex = ".\\.(json\|js\|css\|svg\|ico\|jpe?g\|png\|gif\|tif?)($\|\\?.)";
423	//Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
424
425
426	public boolean accept(File dir, String name) {
427
428	return !(name.matches("robots.txt") \|\| name.matches(unacceptableRegex));
429	//return !(name.matches("robots.txt") \|\| name.matches("/\\.png/")); // doesn't work
430	//return name.matches(".*\\.html$"); // works
431	//return name.matches(".\\.png($\|\\?.)"); // works
432	}
433	}
434
435
436	public boolean processHTMLFile() {
437	// Use JSoup to get paras
438
439	// call processParas(text);
440
441	return true;
442	}
443
444	public boolean processNonHTMLFile() {
445	// Use Tika to get text
446
447	// call processParas(text)
448
449	return true;
450	}
451
452	public boolean processParas(String text) {
453
454	// Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
455
456	// for each para, call MaoriTextDetector.java
457
458	return true;
459	}
460	// --------------------------- End of file-processing stubs --------------------
461
462	public static void main(String[] args) {
463	if(args.length <= 0) {
464	System.err.println("ERROR: Must provide input file of unique nz website addresses");
465	System.exit(-1);
466	}
467
468	String infilePath = args[0];
469
470	System.err.println("Requested to process URLS file: " + infilePath);
471
472	File infile = new File(infilePath);
473	if(!infile.exists()) {
474	System.err.println("ERROR: File " + infilePath + " did not exist");
475	System.exit(-1);
476	}
477
478
479	try {
480	NZTLDProcessor processor = new NZTLDProcessor(infile);
481	} catch(Exception e) {
482	e.printStackTrace();
483	}
484
485	}
486	}
487

Note: See TracBrowser for help on using the repository browser.

Download in other formats: