Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33405

Last change on this file since 33405 was 33405, checked in by ak19, 5 years ago
Even though we're probably not going to use this code after all, will commit my fixes and current state. Got location of wget downloaded site working, also filefilter. Added recursive traversal of downloaded folder to identify files to process.
File size: 11.1 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.Properties;
5
6	import org.apache.log4j.Logger;
7	import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
8
9
10
11	/**
12	* Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
13	* Then go into the src folder of this extension before compiling or running.
14	*
15	* Compile as:
16	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
17	*
18	* Run as:
19	* maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
20	*/
21	public class NZTLDProcessor
22	{
23
24	private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
25
26	private Properties configProps = null;
27	private boolean DEBUG_MODE = true;
28
29	private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
30	private final String sitesDir = "../dwn-sites";
31
32	private void log(String msg) {
33	System.err.println(msg);
34	logger.info(msg);
35	}
36
37	private void debug(String msg) {
38	System.err.println(msg);
39	logger.debug(msg);
40	}
41
42	private void error(String msg) {
43	System.err.println("ERROR: " + msg);
44	logger.error(msg);
45	}
46
47	private void warn(String msg) {
48	System.err.println("WARN: " + msg);
49	logger.warn(msg);
50	}
51
52	public NZTLDProcessor(File seedURLsFile) throws Exception {
53	log("In NZTLDProcessor constructor");
54
55	if(DEBUG_MODE) {
56	warn("**** Debugger is turned ON!!!");
57	}
58
59	// TODO: why do I have to provide the path to config.props when this path is already on
60	// the classpath?
61	// In fact, I don't need it on the classpath for the following to work.
62	// How do I get it to work by specifying its path on the classpath and not here?
63	//try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
64	//try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
65	try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
66	configProps = new Properties();
67	configProps.load(infile);
68	infile.close();
69
70	} catch(Exception e) {
71	System.err.println("Exception attempting to read properties from config.properties.");
72	//e.printStackTrace();
73	throw e;
74	}
75
76
77	try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
78	String url = null;
79
80	int maxURLs = 1;
81	int urlCounter = 0;
82
83	while((url = br.readLine()) != null) {
84
85	log("Got URL: " + url);
86
87	processURL(url);
88
89	if(DEBUG_MODE) {
90	urlCounter++;
91	if(urlCounter == maxURLs) break;
92	}
93
94	}
95
96	br.close();
97
98	} catch (IOException ioe) {
99	System.err.println("Exception attempting to read from seedURLsFile.");
100	//ioe.printStackTrace();
101	throw ioe;
102	}
103	}
104
105	public boolean processURL(String nzDomainURL) {
106	// launch wget on URL
107	// when download done, recurse through downloaded dir
108	// for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
109	// For each para, run MaoriTextDetector and store results in db
110	// Remove download directory (or only remove if site doesn't contain Maori txt)?
111
112
113	if(DEBUG_MODE) {
114	nzDomainURL = "www.waikato.ac.nz";
115	}
116	log("Processing seed URL: " + nzDomainURL);
117
118	//if(wgetURL(nzDomainURL)) {
119	if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition
120
121	processDownloadedDir(nzDomainURL);
122
123	//}
124	return true;
125	}
126
127	// wget will be launched from the specified directory, ../dwn_sites
128	// Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
129	public boolean wgetURL(String nzDomainURL) {
130	// use SafeProcess and wget command in lib/config.properties
131	// DONE: set up logging
132
133
134	String wgetCmd = configProps.getProperty("wget.cmd");
135	if(wgetCmd == null \|\| wgetCmd.equals("")) {
136	System.err.println("Invalid or empty wget.cmd in config.properties");
137	return false;
138	}
139
140	// replace the placeholder in the wget cmd for the seed url
141	wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
142
143	log("Will launch wget with the command: " + wgetCmd);
144
145	String[] wgetCommandArgs = wgetCmd.split(" ");
146	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir));
147
148	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
149	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
150
151	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
152
153	if(success != 0) {
154	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
155	return false;
156	// TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
157	}
158
159	return true;
160	}
161
162
163	// ----------- inner class for SafeProcess to Run Wget ----------
164	public class ProcessLineHandler extends SafeProcess.LineByLineHandler
165	{
166	//static Logger logger = Logger.getLogger(this.getClass().getName());;
167	Logger logger;
168
169	public ProcessLineHandler(Logger logger, int src)
170	{
171	super(src); // will set this.source to STDERR or STDOUT
172	//logger = Logger.getLogger(this.getClass().getName());
173	this.logger = logger;
174	}
175
176	public void gotLine(String line) { // first non-null line
177
178	// String form of this.source will be "stderr" or "stdout"
179	String msg = SafeProcess.streamToString(source) + ": " + line;
180	System.err.println(msg);
181	//NZTLDProcessor.this.logger.info(msg);
182	logger.info(msg);
183	}
184	public void gotException(Exception e) {
185	String msg = "Error in reading process' " + SafeProcess.streamToString(source);
186	//NZTLDProcessor.this.logger.error(msg, e);
187	logger.error(msg, e);
188	}
189
190	}
191
192
193	public boolean processDownloadedDir(String nzDomainURL) {
194	// recurse through the downloaded directory, then process each file
195
196	File downloadedSiteDir = new File(sitesDir, nzDomainURL);
197	log("*** Will process download dir " + downloadedSiteDir);
198
199	if(!downloadedSiteDir.exists()) {
200	error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
201	return false;
202	}
203	/*
204	if(!downloadedSiteDir.isDirectory()) {
205	error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
206	return false; // TODO: or redo wget with "www." prefixed to URL???
207	}
208	*/
209	recursivelyProcessDir(nzDomainURL, downloadedSiteDir);
210
211	debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
212
213	return true;
214	}
215
216	private void recursivelyProcessDir(String nzDomainURL, File file) {
217
218	if(file.isDirectory()) { // recursive step
219	// children array null iff IO Exception OR if file not a directory,
220	// which it can't be since we tested isDirectory() on it just above
221	File[] children = file.listFiles(extractableTxtFilesFilter);
222	if(children == null) {
223	error("IO error occurred when trying to list children of " + file);
224	} else {
225	// children array will be empty if 'file' dir was empty
226	for(int i = 0; i < children.length; i++) {
227	recursivelyProcessDir(nzDomainURL, children[i]);
228	}
229	}
230
231	} else { // base step
232	processFile(nzDomainURL, file);
233	}
234	}
235
236	/*
237	public boolean processDownloadedDir(String nzDomainURL) {
238	// recurse through the downloaded directory, then process each file
239
240	File downloadedSiteDir = new File(sitesDir, nzDomainURL);
241	if(!downloadedSiteDir.exists()) {
242	error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
243	return false;
244	}
245
246	File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
247	if(files == null) {
248	error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
249	return false;
250	}
251	recursivelyProcessDir(nzDomainURL, files);
252
253	return true;
254	}
255
256	private void recursivelyProcessDir(String nzDomainURL, File[] children) {
257	for(int i = 0; i < files.length; i++) {
258	if(files[i].isDirectory()) { // recursive step
259
260	// children array will be empty if dir empty
261	// children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
262	File[] children = files[i].listFiles(extractableTxtFilesFilter);
263	if(children == null) {
264	error("IO error trying to list children of " + files[i]);
265	} else {
266	recursivelyProcessDir(nzDomainURL, children);
267	}
268	}
269	else { // base step
270	processFile(nzDomainURL, files[i]);
271	}
272	}
273	}*/
274
275	public boolean processFile(String nzDomainURL, File file) {
276	// skip if js, css, robots.txt
277	// if no-extension or htm or html, call processHTMLFile()
278	// else: skip for now, TODO: later call Tika on other file types
279
280	// TODO: update db with url info, page location and para?
281
282
283	log("*** Processing file " + file);
284
285	return true;
286	}
287
/**
 * Filename filter that keeps only entries text might be extracted from.
 *
 * Rejected: robots.txt, and any name whose extension marks it as json, js, css or an image
 * (svg, ico, jpg/jpeg, png, gif, tif/tiff), whether the extension ends the name (logo.png)
 * or is followed by a query string (logo.png?v=QE50XMk2oY). Extensions are compared
 * case-insensitively. Everything else is accepted, including names without any extension,
 * e.g. waikato uni has lots of files without extension that contain html.
 *
 * For now the image files are skipped too. Later, with Tika, it may be possible to extract
 * text through OCR of images that represent text.
 *
 * Package-private rather than private so that the filter can be exercised on its own.
 */
class ExtractableTextFileFilter implements FilenameFilter {

    /**
     * Regex a whole file name must match to be rejected. The leading and trailing ".*" are
     * required because the regex is matched against the entire name, not searched for in it.
     */
    public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tiff?)($|\\?.*)";

    // Compiled once per filter instead of on every accept() call.
    private final java.util.regex.Pattern unacceptablePattern =
        java.util.regex.Pattern.compile(unacceptableRegex, java.util.regex.Pattern.CASE_INSENSITIVE);

    /**
     * @param dir  directory containing the entry (not used)
     * @param name name of the entry
     * @return false for robots.txt and for names matching unacceptableRegex, true otherwise
     */
    public boolean accept(File dir, String name) {
        // Literal comparison: as a regex, the '.' in "robots.txt" would match any character
        if (name.equals("robots.txt")) {
            return false;
        }
        return !unacceptablePattern.matcher(name).matches();
    }
}
307
308
309	public boolean processHTMLFile() {
310	// Use JSoup to get paras
311
312	// call processParas(text);
313
314	return true;
315	}
316
317	public boolean processNonHTMLFile() {
318	// Use Tika to get text
319
320	// call processParas(text)
321
322	return true;
323	}
324
325	public boolean processParas(String text) {
326
327	// Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
328
329	// for each para, call MaoriTextDetector.java
330
331	return true;
332	}
333	// --------------------------- End inner class --------------------
334
335	public static void main(String[] args) {
336	if(args.length <= 0) {
337	System.err.println("ERROR: Must provide input file of unique nz website addresses");
338	System.exit(-1);
339	}
340
341	String infilePath = args[0];
342
343	System.err.println("Requested to process URLS file: " + infilePath);
344
345	File infile = new File(infilePath);
346	if(!infile.exists()) {
347	System.err.println("ERROR: File " + infilePath + " did not exist");
348	System.exit(-1);
349	}
350
351
352	try {
353	NZTLDProcessor processor = new NZTLDProcessor(infile);
354	} catch(Exception e) {
355	e.printStackTrace();
356	}
357
358	}
359	}
360

Note: See TracBrowser for help on using the repository browser.

Download in other formats: