Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33402

Last change on this file since 33402 was 33402, checked in by ak19, 5 years ago
Beginnings of the Java class to wget sites and process their pages to detect if any contain Maori language text. So far, wget runs, but it can't yet be launched from a specified directory (it annoyingly runs inside src, ignoring the launch dir parameter to SafeProcess.runProcess()).
File size: 6.8 KB

Line
1	package org.greenstone.atea;
2
3	import java.io.*;
4	import java.util.Properties;
5
6	import org.apache.log4j.Logger;
7	import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
8
9
10
11	/**
12	* Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
13	*
14	*
15	* Compile as:
16	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
17	*
18	* Run as:
19	* maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
20	*/
21	public class NZTLDProcessor
22	{
23
24	static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
25
26	Properties configProps = null;
27
28	boolean DEBUG_MODE = true;
29
30
31	private void log(String msg) {
32	System.err.println(msg);
33	logger.info(msg);
34	}
35
36	private void error(String msg) {
37	System.err.println("ERROR: " + msg);
38	logger.error(msg);
39	}
40
41	private void warn(String msg) {
42	System.err.println("WARN: " + msg);
43	logger.warn(msg);
44	}
45
46	public NZTLDProcessor(File seedURLsFile) throws Exception {
47
48	if(DEBUG_MODE) {
49	warn("**** Debugger is turned ON!!!");
50	}
51
52	// TODO: why do I have to provide the path to config.props when this path is already on
53	// the classpath?
54	// In fact, I don't need it on the classpath for the following to work.
55	// How do I get it to work by specifying its path on the classpath and not here?
56	//try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
57	//try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
58	try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
59	configProps = new Properties();
60	configProps.load(infile);
61	infile.close();
62
63	} catch(Exception e) {
64	System.err.println("Exception attempting to read properties from config.properties.");
65	//e.printStackTrace();
66	throw e;
67	}
68
69
70	try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
71	String url = null;
72
73	int maxURLs = 1;
74	int urlCounter = 0;
75
76	while((url = br.readLine()) != null) {
77
78	log("Got URL: " + url);
79
80	processURL(url);
81
82	if(DEBUG_MODE) {
83	urlCounter++;
84	if(urlCounter == maxURLs) break;
85	}
86
87	}
88
89	br.close();
90
91	} catch (IOException ioe) {
92	System.err.println("Exception attempting to read from seedURLsFile.");
93	//ioe.printStackTrace();
94	throw ioe;
95	}
96	}
97
98	public boolean processURL(String nzDomainURL) {
99	// launch wget on URL
100	// when download done, recurse through downloaded dir
101	// for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
102	// For each para, run MaoriTextDetector and store results in db
103	// Remove download directory (or only remove if site doesn't contain Maori txt)?
104
105
106	if(DEBUG_MODE) {
107	nzDomainURL = "www.waikato.ac.nz";
108	}
109	log("Processing seed URL: " + nzDomainURL);
110
111	wgetURL(nzDomainURL);
112
113	return true;
114	}
115
116	// TODO: Still need to get wget to be launched from a specified directory (../dwn_sites).
117	public boolean wgetURL(String nzDomainURL) {
118	// use SafeProcess and wget command in lib/config.properties
119	// DONE: set up logging
120
121
122	String wgetCmd = configProps.getProperty("wget.cmd");
123	if(wgetCmd == null \|\| wgetCmd.equals("")) {
124	System.err.println("Invalid or empty wget.cmd in config.properties");
125	return false;
126	}
127
128	// replace the placeholder in the wget cmd for the seed url
129	wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
130
131	log("Will launch wget with the command: " + wgetCmd);
132
133	String[] wgetCommandArgs = wgetCmd.split(" ");
134	SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites"));
135
136	SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
137	SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
138
139	int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
140
141	if(success != 0) {
142	System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
143	return false;
144	}
145
146	return true;
147	}
148
149	public boolean processDownloadedDir() {
150	// recursion then process each file
151
152	return true;
153	}
154
155	public boolean processFile() {
156	// skip if js, css, robots.txt
157	// if no-extension or htm or html, call processHTMLFile()
158	// else: skip for now, TODO: later call Tika on other file types
159
160	// TODO: update db with url info, page location and para?
161
162	return true;
163	}
164
165
166	public boolean processHTMLFile() {
167	// Use JSoup to get paras
168
169	// call processParas(text);
170
171	return true;
172	}
173
174	public boolean processNonHTMLFile() {
175	// Use Tika to get text
176
177	// call processParas(text)
178
179	return true;
180	}
181
182	public boolean processParas(String text) {
183
184	// Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
185
186	// for each para, call MaoriTextDetector.java
187
188	return true;
189	}
190
191	// ----------- inner class for SafeProcess to Run Wget ----------
192	public class ProcessLineHandler extends SafeProcess.LineByLineHandler
193	{
194	//static Logger logger = Logger.getLogger(this.getClass().getName());;
195	Logger logger;
196
197	public ProcessLineHandler(Logger logger, int src)
198	{
199	super(src); // will set this.source to STDERR or STDOUT
200	//logger = Logger.getLogger(this.getClass().getName());
201	this.logger = logger;
202	}
203
204	public void gotLine(String line) { // first non-null line
205
206	// String form of "src" will be "stderr" or "stdout"
207	String msg = SafeProcess.streamToString(source) + ": " + line;
208	System.err.println(msg);
209	//NZTLDProcessor.this.logger.info(msg);
210	logger.info(msg);
211	}
212	public void gotException(Exception e) {
213	//NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
214	logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
215	}
216
217	}
218	// --------------------------- End inner class --------------------
219
220	public static void main(String[] args) {
221	if(args.length <= 0) {
222	System.err.println("ERROR: Must provide input file of unique nz website addresses");
223	System.exit(-1);
224	}
225
226	String infilePath = args[0];
227
228	System.err.println("Requested to process URLS file: " + infilePath);
229
230	File infile = new File(infilePath);
231	if(!infile.exists()) {
232	System.err.println("ERROR: File " + infilePath + " did not exist");
233	System.exit(-1);
234	}
235
236
237	try {
238	NZTLDProcessor processor = new NZTLDProcessor(infile);
239	} catch(Exception e) {
240	e.printStackTrace();
241	}
242
243	}
244	}
245

Note: See TracBrowser for help on using the repository browser.

Download in other formats: