Changeset 33411
- Timestamp: 2019-08-13T21:50:29+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java
r33410 r33411 3 3 import java.io.*; 4 4 import java.util.Properties; 5 import java.util.zip.GZIPInputStream; 5 6 6 7 import org.apache.log4j.Logger; … … 17 18 * 18 19 * Run as: 19 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz- urls-2019-08-09.txt20 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt 20 21 */ 21 22 public class NZTLDProcessor … … 29 30 private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter(); 30 31 private final String SITES_DIR = "../dwn-sites"; 31 32 private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/"; 33 34 32 35 private void log(String msg) { 33 36 System.err.println(msg); … … 41 44 42 45 private void error(String msg) { 43 System.err.println(" ERROR: " + msg);46 System.err.println("### ERROR: " + msg); 44 47 logger.error(msg); 45 48 } 46 49 47 50 private void warn(String msg) { 48 System.err.println(" WARN: " + msg);51 System.err.println("*** WARN: " + msg); 49 52 logger.warn(msg); 50 53 } … … 54 57 55 58 if(DEBUG_MODE) { 56 warn(" ****Debugger is turned ON!!!");59 warn("Debugger is turned ON!!!"); 57 60 } 58 61 … … 85 88 log("Got URL: " + url); 86 89 90 // skip urls containing "/crawldiagnostics/" and "/robotstxt/" 91 if(url.indexOf("/robotstxt/") != -1) continue; 92 if(url.indexOf("/crawldiagnostics/") != -1) continue; 93 94 // convert the remaining WARC urls to WET urls 95 url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-"); 96 url = url.replace(".warc.gz", ".warc.wet.gz"); 97 98 // add the prefix 99 url = COMMONCRAWL_DATA_PREFIX + url; 100 101 log("Final WET URL: " + url); 102 87 103 processURL(url); 88 104 … … 103 119 } 104 120 105 public boolean processURL(String nzDomainURL) {121 public boolean processURL(String ccWETfileURL) { 106 
122 // launch wget on URL 107 123 // when download done, recurse through downloaded dir … … 112 128 113 129 if(DEBUG_MODE) { 114 nzDomainURL = "www.waikato.ac.nz"; 115 } 116 log("Processing seed URL: " + nzDomainURL); 117 118 //if(wgetMirrorSite(nzDomainURL)) { 119 if(!DEBUG_MODE) { wgetMirrorSite(nzDomainURL); } // TODO: take out debug_mode condition 120 121 processDownloadedDir(nzDomainURL); 130 ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz"; 131 } 132 log("Processing WET file URL: " + ccWETfileURL); 133 134 /* 135 //if(wgetMirrorSite(ccWETfileURL)) { 136 if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition 137 138 processDownloadedDir(ccWETfileURL); 122 139 123 140 //} 124 return true; 125 } 126 141 */ 142 143 144 processWETfile(ccWETfileURL); 145 146 147 return true; 148 } 149 150 public boolean processWETfile(String ccWETfileURL) { 151 String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1); 152 String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz")); 153 154 File inZipFile = new File(SITES_DIR, zippedWETfileName); 155 File WETfile = new File(SITES_DIR, unzippedWETfileName); 156 157 if(WETfile.exists()) { 158 log("Unzipped WET file " + WETfile + " already exists"); 159 } 160 else { 161 162 if(inZipFile.exists()) { 163 log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists"); 164 } 165 else if(!wgetWETfile(ccWETfileURL)) { 166 return false; 167 } 168 169 // don't have the WET file yet. 
Get it from the zip file, which we know we should have by now 170 171 boolean success = unzipFile(inZipFile, WETfile); 172 // whether we succeeded or not, get rid of the zipped file: 173 if(!inZipFile.delete()) { 174 warn("Unable to delete zipped WET file: " + zippedWETfileName); 175 } 176 177 if(!success) { 178 return false; 179 } 180 } 181 182 // read in the giant WET file and 183 184 return true; 185 } 186 187 // Run gunzip 188 // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process 189 // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/ 190 public boolean unzipFile(File inZipFile, File outFile) { 191 192 193 byte[] buffer = new byte[1024]; 194 195 // try-with-resources will safely close streams/dispose resources on success or error and exceptions 196 try ( 197 GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile)); 198 FileOutputStream out = new FileOutputStream(outFile); 199 ) { 200 int len; 201 while ((len = gzis.read(buffer)) > 0) { 202 out.write(buffer, 0, len); 203 } 204 205 //gzis.close(); 206 //out.close(); 207 208 log("Unzipped " + inZipFile + " to " + outFile); 209 210 } catch(IOException ex) { 211 error("Failed to unzip " + inZipFile); 212 ex.printStackTrace(); 213 return false; 214 } 215 216 return true; 217 } 218 219 // wget will be launched from the specified directory, SITES_DIR 220 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 221 public boolean wgetWETfile(String ccWETfileURL) { 222 // use SafeProcess and wget command in lib/config.properties 223 // DONE: set up logging 224 225 String wgetCmd = configProps.getProperty("wget.file.cmd"); 226 if(wgetCmd == null || wgetCmd.equals("")) { 227 System.err.println("Invalid or empty wget.file.cmd in config.properties"); 228 return false; 229 } 230 231 // replace the placeholder in the wget cmd for the seed url 232 wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL); 233 
234 log("Will launch wget with the command: " + wgetCmd); 235 236 String[] wgetCommandArgs = wgetCmd.split(" "); 237 SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR)); 238 239 SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); 240 SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR); 241 242 int success = wgetProcess.runProcess(outLineHandler, errLineHandler); 243 244 if(success != 0) { 245 System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\""); 246 return false; 247 // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? 248 } 249 250 return true; 251 } 252 253 127 254 // wget will be launched from the specified directory, SITES_DIR 128 255 // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html … … 158 285 159 286 return true; 160 } 287 } 161 288 162 289 … … 191 318 192 319 193 public boolean processDownloadedDir(String nzDomainURL) {320 public boolean processDownloadedDir(String ccWETfileURL) { 194 321 // recurse through the downloaded directory, then process each file 195 322 196 File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);323 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); 197 324 log("*** Will process download dir " + downloadedSiteDir); 198 325 199 326 if(!downloadedSiteDir.exists()) { 200 error(" ***Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");327 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 201 328 return false; 202 329 } 203 330 /* 204 331 if(!downloadedSiteDir.isDirectory()) { 205 error(" ***Downloaded site " + downloadedSiteDir + " is not a directory!");332 error("Downloaded site " + downloadedSiteDir + " is not a directory!"); 206 333 return false; // TODO: or redo 
wget with "www." prefixed to URL??? 207 334 } 208 335 */ 209 recursivelyProcessDir( nzDomainURL, downloadedSiteDir);336 recursivelyProcessDir(ccWETfileURL, downloadedSiteDir); 210 337 211 338 debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); … … 214 341 } 215 342 216 private void recursivelyProcessDir(String nzDomainURL, File file) {343 private void recursivelyProcessDir(String ccWETfileURL, File file) { 217 344 218 345 if(file.isDirectory()) { // recursive step … … 225 352 // children array will be empty if 'file' dir was empty 226 353 for(int i = 0; i < children.length; i++) { 227 recursivelyProcessDir( nzDomainURL, children[i]);354 recursivelyProcessDir(ccWETfileURL, children[i]); 228 355 } 229 356 } 230 357 231 358 } else { // base step 232 processFile( nzDomainURL, file);359 processFile(ccWETfileURL, file); 233 360 } 234 361 } 235 362 236 363 /* 237 public boolean processDownloadedDir(String nzDomainURL) {364 public boolean processDownloadedDir(String ccWETfileURL) { 238 365 // recurse through the downloaded directory, then process each file 239 366 240 File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);367 File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); 241 368 if(!downloadedSiteDir.exists()) { 242 error(" ***Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");369 error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 243 370 return false; 244 371 } … … 249 376 return false; 250 377 } 251 recursivelyProcessDir( nzDomainURL, files);252 253 return true; 254 } 255 256 private void recursivelyProcessDir(String nzDomainURL, File[] children) {378 recursivelyProcessDir(ccWETfileURL, files); 379 380 return true; 381 } 382 383 private void recursivelyProcessDir(String ccWETfileURL, File[] children) { 257 384 for(int i = 0; i < files.length; i++) { 258 385 if(files[i].isDirectory()) { // recursive step … … 264 391 error("IO error trying to list children 
of " + files[i]); 265 392 } else { 266 recursivelyProcessDir( nzDomainURL, children);393 recursivelyProcessDir(ccWETfileURL, children); 267 394 } 268 395 } 269 396 else { // base step 270 processFile( nzDomainURL, files[i]);397 processFile(ccWETfileURL, files[i]); 271 398 } 272 399 } 273 400 }*/ 274 401 275 public boolean processFile(String nzDomainURL, File file) {402 public boolean processFile(String ccWETfileURL, File file) { 276 403 // skip if js, css, robots.txt 277 404 // if no-extension or htm or html, call processHTMLFile()
Note:
See TracChangeset
for help on using the changeset viewer.