source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java@ 33402

Last change on this file since 33402 was 33402, checked in by ak19, 5 years ago

Beginnings of the Java class that uses wget to download sites and processes their pages to detect whether any contain Maori language text. So far only the wget step runs, and it cannot yet be launched from a specified directory: it runs inside src, ignoring the launch directory parameter passed to SafeProcess.runProcess().

File size: 6.8 KB
Line 
1package org.greenstone.atea;
2
3import java.io.*;
4import java.util.Properties;
5
6import org.apache.log4j.Logger;
7import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
8
9
10
11/**
12 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
13 *
14 *
15 * Compile as:
16 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
17 *
18 * Run as:
19 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
20 */
21public class NZTLDProcessor
22{
23
24 static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
25
26 Properties configProps = null;
27
28 boolean DEBUG_MODE = true;
29
30
31 private void log(String msg) {
32 System.err.println(msg);
33 logger.info(msg);
34 }
35
36 private void error(String msg) {
37 System.err.println("ERROR: " + msg);
38 logger.error(msg);
39 }
40
41 private void warn(String msg) {
42 System.err.println("WARN: " + msg);
43 logger.warn(msg);
44 }
45
46 public NZTLDProcessor(File seedURLsFile) throws Exception {
47
48 if(DEBUG_MODE) {
49 warn("**** Debugger is turned ON!!!");
50 }
51
52 // TODO: why do I have to provide the path to config.props when this path is already on
53 // the classpath?
54 // In fact, I don't need it on the classpath for the following to work.
55 // How do I get it to work by specifying its path on the classpath and not here?
56 //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
57 //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
58 try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
59 configProps = new Properties();
60 configProps.load(infile);
61 infile.close();
62
63 } catch(Exception e) {
64 System.err.println("Exception attempting to read properties from config.properties.");
65 //e.printStackTrace();
66 throw e;
67 }
68
69
70 try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
71 String url = null;
72
73 int maxURLs = 1;
74 int urlCounter = 0;
75
76 while((url = br.readLine()) != null) {
77
78 log("Got URL: " + url);
79
80 processURL(url);
81
82 if(DEBUG_MODE) {
83 urlCounter++;
84 if(urlCounter == maxURLs) break;
85 }
86
87 }
88
89 br.close();
90
91 } catch (IOException ioe) {
92 System.err.println("Exception attempting to read from seedURLsFile.");
93 //ioe.printStackTrace();
94 throw ioe;
95 }
96 }
97
98 public boolean processURL(String nzDomainURL) {
99 // launch wget on URL
100 // when download done, recurse through downloaded dir
101 // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
102 // For each para, run MaoriTextDetector and store results in db
103 // Remove download directory (or only remove if site doesn't contain Maori txt)?
104
105
106 if(DEBUG_MODE) {
107 nzDomainURL = "www.waikato.ac.nz";
108 }
109 log("Processing seed URL: " + nzDomainURL);
110
111 wgetURL(nzDomainURL);
112
113 return true;
114 }
115
116 // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites).
117 public boolean wgetURL(String nzDomainURL) {
118 // use SafeProcess and wget command in lib/config.properties
119 // DONE: set up logging
120
121
122 String wgetCmd = configProps.getProperty("wget.cmd");
123 if(wgetCmd == null || wgetCmd.equals("")) {
124 System.err.println("Invalid or empty wget.cmd in config.properties");
125 return false;
126 }
127
128 // replace the placeholder in the wget cmd for the seed url
129 wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
130
131 log("Will launch wget with the command: " + wgetCmd);
132
133 String[] wgetCommandArgs = wgetCmd.split(" ");
134 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites"));
135
136 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
137 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
138
139 int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
140
141 if(success != 0) {
142 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
143 return false;
144 }
145
146 return true;
147 }
148
149 public boolean processDownloadedDir() {
150 // recursion then process each file
151
152 return true;
153 }
154
155 public boolean processFile() {
156 // skip if js, css, robots.txt
157 // if no-extension or htm or html, call processHTMLFile()
158 // else: skip for now, TODO: later call Tika on other file types
159
160 // TODO: update db with url info, page location and para?
161
162 return true;
163 }
164
165
166 public boolean processHTMLFile() {
167 // Use JSoup to get paras
168
169 // call processParas(text);
170
171 return true;
172 }
173
174 public boolean processNonHTMLFile() {
175 // Use Tika to get text
176
177 // call processParas(text)
178
179 return true;
180 }
181
182 public boolean processParas(String text) {
183
184 // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
185
186 // for each para, call MaoriTextDetector.java
187
188 return true;
189 }
190
191 // ----------- inner class for SafeProcess to Run Wget ----------
192 public class ProcessLineHandler extends SafeProcess.LineByLineHandler
193 {
194 //static Logger logger = Logger.getLogger(this.getClass().getName());;
195 Logger logger;
196
197 public ProcessLineHandler(Logger logger, int src)
198 {
199 super(src); // will set this.source to STDERR or STDOUT
200 //logger = Logger.getLogger(this.getClass().getName());
201 this.logger = logger;
202 }
203
204 public void gotLine(String line) { // first non-null line
205
206 // String form of "src" will be "stderr" or "stdout"
207 String msg = SafeProcess.streamToString(source) + ": " + line;
208 System.err.println(msg);
209 //NZTLDProcessor.this.logger.info(msg);
210 logger.info(msg);
211 }
212 public void gotException(Exception e) {
213 //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
214 logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
215 }
216
217 }
218 // --------------------------- End inner class --------------------
219
220 public static void main(String[] args) {
221 if(args.length <= 0) {
222 System.err.println("ERROR: Must provide input file of unique nz website addresses");
223 System.exit(-1);
224 }
225
226 String infilePath = args[0];
227
228 System.err.println("Requested to process URLS file: " + infilePath);
229
230 File infile = new File(infilePath);
231 if(!infile.exists()) {
232 System.err.println("ERROR: File " + infilePath + " did not exist");
233 System.exit(-1);
234 }
235
236
237 try {
238 NZTLDProcessor processor = new NZTLDProcessor(infile);
239 } catch(Exception e) {
240 e.printStackTrace();
241 }
242
243 }
244}
245
Note: See TracBrowser for help on using the repository browser.