Changeset 33405 for gs3-extensions
- Timestamp: 2019-08-12T20:37:44+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java
r33402 r33405 11 11 /** 12 12 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path. 13 * 13 * Then go into the src folder of this extension before compiling or running. 14 14 * 15 15 * Compile as: … … 22 22 { 23 23 24 static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 25 26 Properties configProps = null; 27 28 boolean DEBUG_MODE = true; 29 24 private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 25 26 private Properties configProps = null; 27 private boolean DEBUG_MODE = true; 28 29 private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter(); 30 private final String sitesDir = "../dwn-sites"; 30 31 31 32 private void log(String msg) { … … 34 35 } 35 36 37 private void debug(String msg) { 38 System.err.println(msg); 39 logger.debug(msg); 40 } 41 36 42 private void error(String msg) { 37 43 System.err.println("ERROR: " + msg); … … 45 51 46 52 public NZTLDProcessor(File seedURLsFile) throws Exception { 47 53 log("In NZTLDProcessor constructor"); 54 48 55 if(DEBUG_MODE) { 49 56 warn("**** Debugger is turned ON!!!"); … … 109 116 log("Processing seed URL: " + nzDomainURL); 110 117 111 wgetURL(nzDomainURL); 112 113 return true; 114 } 115 116 // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites). 
118 //if(wgetURL(nzDomainURL)) { 119 if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition 120 121 processDownloadedDir(nzDomainURL); 122 123 //} 124 return true; 125 } 126 127 // wget will be launched from the specified directory, ../dwn_sites 128 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 117 129 public boolean wgetURL(String nzDomainURL) { 118 130 // use SafeProcess and wget command in lib/config.properties … … 132 144 133 145 String[] wgetCommandArgs = wgetCmd.split(" "); 134 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File( "../dwn_sites"));146 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir)); 135 147 136 148 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); … … 142 154 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\""); 143 155 return false; 144 } 145 146 return true; 147 } 148 149 public boolean processDownloadedDir() { 150 // recursion then process each file 151 152 return true; 153 } 154 155 public boolean processFile() { 156 // skip if js, css, robots.txt 157 // if no-extension or htm or html, call processHTMLFile() 158 // else: skip for now, TODO: later call Tika on other file types 159 160 // TODO: update db with url info, page location and para? 161 162 return true; 163 } 164 165 166 public boolean processHTMLFile() { 167 // Use JSoup to get paras 168 169 // call processParas(text); 170 171 return true; 172 } 173 174 public boolean processNonHTMLFile() { 175 // Use Tika to get text 176 177 // call processParas(text) 178 179 return true; 180 } 181 182 public boolean processParas(String text) { 183 184 // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? 
185 186 // for each para, call MaoriTextDetector.java 187 188 return true; 189 } 156 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? 157 } 158 159 return true; 160 } 161 190 162 191 163 // ----------- inner class for SafeProcess to Run Wget ---------- … … 204 176 public void gotLine(String line) { // first non-null line 205 177 206 // String form of "src"will be "stderr" or "stdout"178 // String form of this.source will be "stderr" or "stdout" 207 179 String msg = SafeProcess.streamToString(source) + ": " + line; 208 180 System.err.println(msg); … … 211 183 } 212 184 public void gotException(Exception e) { 213 //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 214 logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 215 } 216 185 String msg = "Error in reading process' " + SafeProcess.streamToString(source); 186 //NZTLDProcessor.this.logger.error(msg, e); 187 logger.error(msg, e); 188 } 189 190 } 191 192 193 public boolean processDownloadedDir(String nzDomainURL) { 194 // recurse through the downloaded directory, then process each file 195 196 File downloadedSiteDir = new File(sitesDir, nzDomainURL); 197 log("*** Will process download dir " + downloadedSiteDir); 198 199 if(!downloadedSiteDir.exists()) { 200 error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 201 return false; 202 } 203 /* 204 if(!downloadedSiteDir.isDirectory()) { 205 error("*** Downloaded site " + downloadedSiteDir + " is not a directory!"); 206 return false; // TODO: or redo wget with "www." prefixed to URL??? 
207 } 208 */ 209 recursivelyProcessDir(nzDomainURL, downloadedSiteDir); 210 211 debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); 212 213 return true; 214 } 215 216 private void recursivelyProcessDir(String nzDomainURL, File file) { 217 218 if(file.isDirectory()) { // recursive step 219 // children array null iff IO Exception OR if file not a directory, 220 // which it can't be since we tested isDirectory() on it just above 221 File[] children = file.listFiles(extractableTxtFilesFilter); 222 if(children == null) { 223 error("IO error occurred when trying to list children of " + file); 224 } else { 225 // children array will be empty if 'file' dir was empty 226 for(int i = 0; i < children.length; i++) { 227 recursivelyProcessDir(nzDomainURL, children[i]); 228 } 229 } 230 231 } else { // base step 232 processFile(nzDomainURL, file); 233 } 234 } 235 236 /* 237 public boolean processDownloadedDir(String nzDomainURL) { 238 // recurse through the downloaded directory, then process each file 239 240 File downloadedSiteDir = new File(sitesDir, nzDomainURL); 241 if(!downloadedSiteDir.exists()) { 242 error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 243 return false; 244 } 245 246 File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter); 247 if(files == null) { 248 error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred"); 249 return false; 250 } 251 recursivelyProcessDir(nzDomainURL, files); 252 253 return true; 254 } 255 256 private void recursivelyProcessDir(String nzDomainURL, File[] children) { 257 for(int i = 0; i < files.length; i++) { 258 if(files[i].isDirectory()) { // recursive step 259 260 // children array will be empty if dir empty 261 // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above 262 File[] children = files[i].listFiles(extractableTxtFilesFilter); 263 
if(children == null) { 264 error("IO error trying to list children of " + files[i]); 265 } else { 266 recursivelyProcessDir(nzDomainURL, children); 267 } 268 } 269 else { // base step 270 processFile(nzDomainURL, files[i]); 271 } 272 } 273 }*/ 274 275 public boolean processFile(String nzDomainURL, File file) { 276 // skip if js, css, robots.txt 277 // if no-extension or htm or html, call processHTMLFile() 278 // else: skip for now, TODO: later call Tika on other file types 279 280 // TODO: update db with url info, page location and para? 281 282 283 log("*** Processing file " + file); 284 285 return true; 286 } 287 288 private class ExtractableTextFileFilter implements FilenameFilter { 289 // skip if js, css, robots.txt 290 291 // For now also skip the image files. Later, with Tika, may be able to extract text from 292 // images though OCR of those imgs representing text? We accept files with no file extension, 293 // e.g. waikato uni has lots of files without extension that contain html. 294 // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected 295 public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)"; 296 //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex); 297 298 299 public boolean accept(File dir, String name) { 300 301 return !(name.matches("robots.txt") || name.matches(unacceptableRegex)); 302 //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work 303 //return name.matches(".*\\.html$"); // works 304 //return name.matches(".*\\.png($|\\?.*)"); // works 305 } 306 } 307 308 309 public boolean processHTMLFile() { 310 // Use JSoup to get paras 311 312 // call processParas(text); 313 314 return true; 315 } 316 317 public boolean processNonHTMLFile() { 318 // Use Tika to get text 319 320 // call processParas(text) 321 322 return true; 323 } 324 325 public boolean processParas(String text) { 326 327 // Split on (double) new line for paras? 
If too short, then add text from next para unless there's no more. Else add to previous? 328 329 // for each para, call MaoriTextDetector.java 330 331 return true; 217 332 } 218 333 // --------------------------- End inner class --------------------
Note: See TracChangeset for help on using the changeset viewer.