Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33411

Timestamp:

2019-08-13T21:50:29+12:00 (5 years ago)

Author:

ak19

Message:

Newer version now doesn't mirror sites with wget but gets WET files and unzips them instead. Didn't yet get to the part where I can process the actual contents inside each 400MB WET file.

File:

: 1 edited

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

-              r33410
+              r33411
 import java.io.*;
 import java.util.Properties;
+import java.util.zip.GZIPInputStream;
 import org.apache.log4j.Logger;
 …
+ *
  * Run as:
  *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
+ *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
  */
 public class NZTLDProcessor
 …
     private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();
     private final String SITES_DIR = "../dwn-sites";
+    private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
     private void log(String msg) {
     System.err.println(msg);
 …
     private void error(String msg) {
     System.err.println("ERROR: " + msg);
+    System.err.println("### ERROR: " + msg);
     logger.error(msg);
+    }
     private void warn(String msg) {
     System.err.println("WARN: " + msg);
+    System.err.println("*** WARN: " + msg);
     logger.warn(msg);
+    }
 …
     if(DEBUG_MODE) {
         warn("**** Debugger is turned ON!!!");
+        warn("Debugger is turned ON!!!");
+    }
 …
         log("Got URL: " + url);
+        // skip urls containing "/crawldiagnostics/" and "/robotstxt/"
+        if(url.indexOf("/robotstxt/") != -1) continue;
+        if(url.indexOf("/crawldiagnostics/") != -1) continue;
+        // convert the remaining WARC urls to WET urls
+        url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
+        url = url.replace(".warc.gz", ".warc.wet.gz");
+        // add the prefix
+        url = COMMONCRAWL_DATA_PREFIX + url;
+        log("Final WET URL: " + url);
         processURL(url);
 …
+    }
     public boolean processURL(String nzDomainURL) {
+    public boolean processURL(String ccWETfileURL) {
     // launch wget on URL
     // when download done, recurse through downloaded dir
 …
     if(DEBUG_MODE) {
+        nzDomainURL = "www.waikato.ac.nz";
+    }
+    log("Processing seed URL: " + nzDomainURL);
+    //if(wgetMirrorSite(nzDomainURL)) {
+    if(!DEBUG_MODE) { wgetMirrorSite(nzDomainURL); } // TODO: take out debug_mode condition
+    processDownloadedDir(nzDomainURL);
+        ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
+    }
+    log("Processing WET file URL: " + ccWETfileURL);
+    /*
+    //if(wgetMirrorSite(ccWETfileURL)) {
+    if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
+    processDownloadedDir(ccWETfileURL);
     //}
+    return true;
+    }
+    */
+    processWETfile(ccWETfileURL);
+    return true;
+    }
+    public boolean processWETfile(String ccWETfileURL) {
+    String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
+    String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
+    File inZipFile = new File(SITES_DIR, zippedWETfileName);
+    File WETfile = new File(SITES_DIR, unzippedWETfileName);
+    if(WETfile.exists()) {
+        log("Unzipped WET file " + WETfile + " already exists");
+    }
+    else {
+        if(inZipFile.exists()) {
+        log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");
+        }
+        else if(!wgetWETfile(ccWETfileURL)) {
+        return false;
+        }
+        // don't have the WET file yet. Get it from the zip file, which we know we should have by now
+        boolean success = unzipFile(inZipFile, WETfile);
+        // whether we succeeded or not, get rid of the zipped file:
+        if(!inZipFile.delete()) {
+        warn("Unable to delete zipped WET file: " + zippedWETfileName);
+        }
+        if(!success) {
+        return false;
+        }
+    }
+    // read in the giant WET file and
+    return true;
+    }
+    /**
+     * Gunzips inZipFile into outFile.
+     * To avoid making this linux specific, Java is used to unzip, instead of running
+     * gunzip as a process.
+     * See https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
+     *
+     * If unzipping fails part-way, the incomplete output file is removed, so that a
+     * later existence check on outFile cannot mistake a truncated file for a
+     * complete one.
+     *
+     * @param inZipFile the gzipped input file
+     * @param outFile the file to write the decompressed contents to
+     * @return true if the whole file was unzipped, false if an IOException occurred
+     */
+    public boolean unzipFile(File inZipFile, File outFile) {
+        // WET files are large, so use a bigger buffer than 1KB to cut down on read/write calls
+        byte[] buffer = new byte[8192];
+
+        // try-with-resources will safely close streams/dispose resources on success or error and exceptions
+        try (
+            GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));
+            FileOutputStream out = new FileOutputStream(outFile);
+        ) {
+            int len;
+            // read() returns -1 once the end of the compressed stream is reached
+            while ((len = gzis.read(buffer)) != -1) {
+                out.write(buffer, 0, len);
+            }
+        } catch (IOException ex) {
+            error("Failed to unzip " + inZipFile);
+            ex.printStackTrace();
+
+            // The streams are closed by now: don't leave a partially written file behind
+            if (outFile.exists() && !outFile.delete()) {
+                warn("Unable to delete incompletely unzipped file: " + outFile);
+            }
+            return false;
+        }
+
+        log("Unzipped " + inZipFile + " to " + outFile);
+        return true;
+    }
+    // wget will be launched from the specified directory, SITES_DIR
+    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
+    public boolean wgetWETfile(String ccWETfileURL) {
+    // use SafeProcess and wget command in lib/config.properties
+    // DONE: set up logging
+    String wgetCmd = configProps.getProperty("wget.file.cmd");
+    if(wgetCmd == null || wgetCmd.equals("")) {
+        System.err.println("Invalid or empty wget.file.cmd in config.properties");
+        return false;
+    }
+    // replace the placeholder in the wget cmd for the seed url
+    wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
+    log("Will launch wget with the command: " + wgetCmd);
+    String[] wgetCommandArgs = wgetCmd.split(" ");
+    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
+    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
+    SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
+    int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
+    if(success != 0) {
+        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\"");
+        return false;
+            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
+    }
+    return true;
+    }
     // wget will be launched from the specified directory, SITES_DIR
     // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
 …
     return true;
+    }
+    }
 …
     public boolean processDownloadedDir(String nzDomainURL) {
+    public boolean processDownloadedDir(String ccWETfileURL) {
     // recurse through the downloaded directory, then process each file
     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);
+    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
     log("*** Will process download dir " + downloadedSiteDir);
     if(!downloadedSiteDir.exists()) {
         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
+        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
         return false;
+    }
     /*
     if(!downloadedSiteDir.isDirectory()) {
         error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
+        error("Downloaded site " + downloadedSiteDir + " is not a directory!");
         return false; // TODO: or redo wget with "www." prefixed to URL???
+    }
     */
     recursivelyProcessDir(nzDomainURL, downloadedSiteDir);
+    recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
     debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
 …
+    }
     private void recursivelyProcessDir(String nzDomainURL, File file) {
+    private void recursivelyProcessDir(String ccWETfileURL, File file) {
     if(file.isDirectory()) { // recursive step
 …
         // children array will be empty if 'file' dir was empty
         for(int i = 0; i < children.length; i++) {
             recursivelyProcessDir(nzDomainURL, children[i]);
+            recursivelyProcessDir(ccWETfileURL, children[i]);
+        }
+        }
     } else { // base step
         processFile(nzDomainURL, file);
+        processFile(ccWETfileURL, file);
+    }
+    }
     /*
     public boolean processDownloadedDir(String nzDomainURL) {
+    public boolean processDownloadedDir(String ccWETfileURL) {
     // recurse through the downloaded directory, then process each file
     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);
+    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
     if(!downloadedSiteDir.exists()) {
         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
+        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
         return false;
+    }
 …
         return false;
+    }
     recursivelyProcessDir(nzDomainURL, files);
     return true;
+    }
     private void recursivelyProcessDir(String nzDomainURL, File[] children) {
+    recursivelyProcessDir(ccWETfileURL, files);
+    return true;
+    }
+    private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
     for(int i = 0; i < files.length; i++) {
         if(files[i].isDirectory()) { // recursive step
 …
             error("IO error trying to list children of " + files[i]);
         } else {
             recursivelyProcessDir(nzDomainURL, children);
+            recursivelyProcessDir(ccWETfileURL, children);
+        }
+        }
         else { // base step
         processFile(nzDomainURL, files[i]);
+        processFile(ccWETfileURL, files[i]);
+        }
+    }
     }*/
     public boolean processFile(String nzDomainURL, File file) {
+    public boolean processFile(String ccWETfileURL, File file) {
     // skip if js, css, robots.txt
     // if no-extension or htm or html, call processHTMLFile()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33411

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

Download in other formats: