1 | package org.greenstone.atea;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import java.util.Properties;
|
---|
5 |
|
---|
6 | import org.apache.log4j.Logger;
|
---|
7 | import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java.
|
---|
8 |
|
---|
9 |
|
---|
10 |
|
---|
11 | /**
|
---|
12 | * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
|
---|
13 | *
|
---|
14 | *
|
---|
15 | * Compile as:
|
---|
16 | * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java
|
---|
17 | *
|
---|
18 | * Run as:
|
---|
19 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
|
---|
20 | */
|
---|
21 | public class NZTLDProcessor
|
---|
22 | {
|
---|
23 |
|
---|
24 | static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
|
---|
25 |
|
---|
26 | Properties configProps = null;
|
---|
27 |
|
---|
28 | boolean DEBUG_MODE = true;
|
---|
29 |
|
---|
30 |
|
---|
31 | private void log(String msg) {
|
---|
32 | System.err.println(msg);
|
---|
33 | logger.info(msg);
|
---|
34 | }
|
---|
35 |
|
---|
36 | private void error(String msg) {
|
---|
37 | System.err.println("ERROR: " + msg);
|
---|
38 | logger.error(msg);
|
---|
39 | }
|
---|
40 |
|
---|
41 | private void warn(String msg) {
|
---|
42 | System.err.println("WARN: " + msg);
|
---|
43 | logger.warn(msg);
|
---|
44 | }
|
---|
45 |
|
---|
46 | public NZTLDProcessor(File seedURLsFile) throws Exception {
|
---|
47 |
|
---|
48 | if(DEBUG_MODE) {
|
---|
49 | warn("**** Debugger is turned ON!!!");
|
---|
50 | }
|
---|
51 |
|
---|
52 | // TODO: why do I have to provide the path to config.props when this path is already on
|
---|
53 | // the classpath?
|
---|
54 | // In fact, I don't need it on the classpath for the following to work.
|
---|
55 | // How do I get it to work by specifying its path on the classpath and not here?
|
---|
56 | //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) {
|
---|
57 | //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) {
|
---|
58 | try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) {
|
---|
59 | configProps = new Properties();
|
---|
60 | configProps.load(infile);
|
---|
61 | infile.close();
|
---|
62 |
|
---|
63 | } catch(Exception e) {
|
---|
64 | System.err.println("Exception attempting to read properties from config.properties.");
|
---|
65 | //e.printStackTrace();
|
---|
66 | throw e;
|
---|
67 | }
|
---|
68 |
|
---|
69 |
|
---|
70 | try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) {
|
---|
71 | String url = null;
|
---|
72 |
|
---|
73 | int maxURLs = 1;
|
---|
74 | int urlCounter = 0;
|
---|
75 |
|
---|
76 | while((url = br.readLine()) != null) {
|
---|
77 |
|
---|
78 | log("Got URL: " + url);
|
---|
79 |
|
---|
80 | processURL(url);
|
---|
81 |
|
---|
82 | if(DEBUG_MODE) {
|
---|
83 | urlCounter++;
|
---|
84 | if(urlCounter == maxURLs) break;
|
---|
85 | }
|
---|
86 |
|
---|
87 | }
|
---|
88 |
|
---|
89 | br.close();
|
---|
90 |
|
---|
91 | } catch (IOException ioe) {
|
---|
92 | System.err.println("Exception attempting to read from seedURLsFile.");
|
---|
93 | //ioe.printStackTrace();
|
---|
94 | throw ioe;
|
---|
95 | }
|
---|
96 | }
|
---|
97 |
|
---|
98 | public boolean processURL(String nzDomainURL) {
|
---|
99 | // launch wget on URL
|
---|
100 | // when download done, recurse through downloaded dir
|
---|
101 | // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files)
|
---|
102 | // For each para, run MaoriTextDetector and store results in db
|
---|
103 | // Remove download directory (or only remove if site doesn't contain Maori txt)?
|
---|
104 |
|
---|
105 |
|
---|
106 | if(DEBUG_MODE) {
|
---|
107 | nzDomainURL = "www.waikato.ac.nz";
|
---|
108 | }
|
---|
109 | log("Processing seed URL: " + nzDomainURL);
|
---|
110 |
|
---|
111 | wgetURL(nzDomainURL);
|
---|
112 |
|
---|
113 | return true;
|
---|
114 | }
|
---|
115 |
|
---|
116 | // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites).
|
---|
117 | public boolean wgetURL(String nzDomainURL) {
|
---|
118 | // use SafeProcess and wget command in lib/config.properties
|
---|
119 | // DONE: set up logging
|
---|
120 |
|
---|
121 |
|
---|
122 | String wgetCmd = configProps.getProperty("wget.cmd");
|
---|
123 | if(wgetCmd == null || wgetCmd.equals("")) {
|
---|
124 | System.err.println("Invalid or empty wget.cmd in config.properties");
|
---|
125 | return false;
|
---|
126 | }
|
---|
127 |
|
---|
128 | // replace the placeholder in the wget cmd for the seed url
|
---|
129 | wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL);
|
---|
130 |
|
---|
131 | log("Will launch wget with the command: " + wgetCmd);
|
---|
132 |
|
---|
133 | String[] wgetCommandArgs = wgetCmd.split(" ");
|
---|
134 | SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites"));
|
---|
135 |
|
---|
136 | SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
|
---|
137 | SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
|
---|
138 |
|
---|
139 | int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
|
---|
140 |
|
---|
141 | if(success != 0) {
|
---|
142 | System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\"");
|
---|
143 | return false;
|
---|
144 | }
|
---|
145 |
|
---|
146 | return true;
|
---|
147 | }
|
---|
148 |
|
---|
149 | public boolean processDownloadedDir() {
|
---|
150 | // recursion then process each file
|
---|
151 |
|
---|
152 | return true;
|
---|
153 | }
|
---|
154 |
|
---|
155 | public boolean processFile() {
|
---|
156 | // skip if js, css, robots.txt
|
---|
157 | // if no-extension or htm or html, call processHTMLFile()
|
---|
158 | // else: skip for now, TODO: later call Tika on other file types
|
---|
159 |
|
---|
160 | // TODO: update db with url info, page location and para?
|
---|
161 |
|
---|
162 | return true;
|
---|
163 | }
|
---|
164 |
|
---|
165 |
|
---|
166 | public boolean processHTMLFile() {
|
---|
167 | // Use JSoup to get paras
|
---|
168 |
|
---|
169 | // call processParas(text);
|
---|
170 |
|
---|
171 | return true;
|
---|
172 | }
|
---|
173 |
|
---|
174 | public boolean processNonHTMLFile() {
|
---|
175 | // Use Tika to get text
|
---|
176 |
|
---|
177 | // call processParas(text)
|
---|
178 |
|
---|
179 | return true;
|
---|
180 | }
|
---|
181 |
|
---|
182 | public boolean processParas(String text) {
|
---|
183 |
|
---|
184 | // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
|
---|
185 |
|
---|
186 | // for each para, call MaoriTextDetector.java
|
---|
187 |
|
---|
188 | return true;
|
---|
189 | }
|
---|
190 |
|
---|
191 | // ----------- inner class for SafeProcess to Run Wget ----------
|
---|
192 | public class ProcessLineHandler extends SafeProcess.LineByLineHandler
|
---|
193 | {
|
---|
194 | //static Logger logger = Logger.getLogger(this.getClass().getName());;
|
---|
195 | Logger logger;
|
---|
196 |
|
---|
197 | public ProcessLineHandler(Logger logger, int src)
|
---|
198 | {
|
---|
199 | super(src); // will set this.source to STDERR or STDOUT
|
---|
200 | //logger = Logger.getLogger(this.getClass().getName());
|
---|
201 | this.logger = logger;
|
---|
202 | }
|
---|
203 |
|
---|
204 | public void gotLine(String line) { // first non-null line
|
---|
205 |
|
---|
206 | // String form of "src" will be "stderr" or "stdout"
|
---|
207 | String msg = SafeProcess.streamToString(source) + ": " + line;
|
---|
208 | System.err.println(msg);
|
---|
209 | //NZTLDProcessor.this.logger.info(msg);
|
---|
210 | logger.info(msg);
|
---|
211 | }
|
---|
212 | public void gotException(Exception e) {
|
---|
213 | //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
|
---|
214 | logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
|
---|
215 | }
|
---|
216 |
|
---|
217 | }
|
---|
218 | // --------------------------- End inner class --------------------
|
---|
219 |
|
---|
220 | public static void main(String[] args) {
|
---|
221 | if(args.length <= 0) {
|
---|
222 | System.err.println("ERROR: Must provide input file of unique nz website addresses");
|
---|
223 | System.exit(-1);
|
---|
224 | }
|
---|
225 |
|
---|
226 | String infilePath = args[0];
|
---|
227 |
|
---|
228 | System.err.println("Requested to process URLS file: " + infilePath);
|
---|
229 |
|
---|
230 | File infile = new File(infilePath);
|
---|
231 | if(!infile.exists()) {
|
---|
232 | System.err.println("ERROR: File " + infilePath + " did not exist");
|
---|
233 | System.exit(-1);
|
---|
234 | }
|
---|
235 |
|
---|
236 |
|
---|
237 | try {
|
---|
238 | NZTLDProcessor processor = new NZTLDProcessor(infile);
|
---|
239 | } catch(Exception e) {
|
---|
240 | e.printStackTrace();
|
---|
241 | }
|
---|
242 |
|
---|
243 | }
|
---|
244 | }
|
---|
245 |
|
---|