Changeset 33411


Ignore:
Timestamp:
2019-08-13T21:50:29+12:00 (5 years ago)
Author:
ak19
Message:

Newer version now doesn't mirror sites with wget but gets WET files and unzips them instead. Didn't yet get to the part where I can process the actual contents inside each 400MB WET file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33410 r33411  
    33import java.io.*;
    44import java.util.Properties;
     5import java.util.zip.GZIPInputStream;
    56
    67import org.apache.log4j.Logger;
     
    1718 *
    1819 * Run as:
    19  *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt
     20 *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt
    2021 */
    2122public class NZTLDProcessor
     
    2930    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();   
    3031    private final String SITES_DIR = "../dwn-sites";
    31 
     32    private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/";
     33   
     34   
    3235    private void log(String msg) {
    3336    System.err.println(msg);
     
    4144   
    4245    private void error(String msg) {
    43     System.err.println("ERROR: " + msg);
     46    System.err.println("### ERROR: " + msg);
    4447    logger.error(msg);
    4548    }
    4649   
    4750    private void warn(String msg) {
    48     System.err.println("WARN: " + msg);
     51    System.err.println("*** WARN: " + msg);
    4952    logger.warn(msg);
    5053    }
     
    5457   
    5558    if(DEBUG_MODE) {
    56         warn("**** Debugger is turned ON!!!");
     59        warn("Debugger is turned ON!!!");
    5760    }
    5861   
     
    8588        log("Got URL: " + url);
    8689
     90        // skip urls containing "/crawldiagnostics/" and "/robotstxt/"
     91        if(url.indexOf("/robotstxt/") != -1) continue;
     92        if(url.indexOf("/crawldiagnostics/") != -1) continue;
     93
     94        // convert the remaining WARC urls to WET urls
     95        url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-");
     96        url = url.replace(".warc.gz", ".warc.wet.gz");
     97
     98        // add the prefix
     99        url = COMMONCRAWL_DATA_PREFIX + url;
     100       
     101        log("Final WET URL: " + url);
     102           
    87103        processURL(url);
    88104       
     
    103119    }
    104120
    105     public boolean processURL(String nzDomainURL) {
     121    public boolean processURL(String ccWETfileURL) {
    106122    // launch wget on URL
    107123    // when download done, recurse through downloaded dir
     
    112128
    113129    if(DEBUG_MODE) {
    114         nzDomainURL = "www.waikato.ac.nz";
    115     }
    116     log("Processing seed URL: " + nzDomainURL);
    117 
    118     //if(wgetMirrorSite(nzDomainURL)) {
    119     if(!DEBUG_MODE) { wgetMirrorSite(nzDomainURL); } // TODO: take out debug_mode condition
    120    
    121     processDownloadedDir(nzDomainURL);
     130        ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz";
     131    }
     132    log("Processing WET file URL: " + ccWETfileURL);
     133
     134    /*
     135    //if(wgetMirrorSite(ccWETfileURL)) {
     136    if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition
     137   
     138    processDownloadedDir(ccWETfileURL);
    122139   
    123140    //}
    124     return true;
    125     }
    126 
     141    */
     142
     143   
     144    processWETfile(ccWETfileURL);
     145   
     146   
     147    return true;
     148    }
     149
     150    public boolean processWETfile(String ccWETfileURL) {
     151    String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1);
     152    String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz"));
     153
     154    File inZipFile = new File(SITES_DIR, zippedWETfileName);
     155    File WETfile = new File(SITES_DIR, unzippedWETfileName);
     156   
     157    if(WETfile.exists()) {
     158        log("Unzipped WET file " + WETfile + " already exists");
     159    }
     160    else {
     161
     162        if(inZipFile.exists()) {
     163        log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");   
     164        }
     165        else if(!wgetWETfile(ccWETfileURL)) {
     166        return false;
     167        }   
     168
     169        // The unzipped WET file doesn't exist yet. Extract it from the zipped file, which must exist by now (it was either already present or has just been downloaded)
     170       
     171        boolean success = unzipFile(inZipFile, WETfile);
     172        // whether we succeeded or not, get rid of the zipped file:
     173        if(!inZipFile.delete()) {
     174        warn("Unable to delete zipped WET file: " + zippedWETfileName);
     175        }
     176       
     177        if(!success) {
     178        return false;
     179        }
     180    }
     181
     182    // TODO: read in the giant WET file and process its contents (not yet implemented)
     183   
     184    return true;
     185    }
     186
     187    // Decompress a gzipped file (the equivalent of running gunzip).
     188    // To avoid making this Linux-specific, use Java to unzip instead of running gunzip as a process
     189    // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/
     190    public boolean unzipFile(File inZipFile, File outFile) {
     191   
     192   
     193    byte[] buffer = new byte[1024];
     194
     195    // try-with-resources will safely close streams/dispose resources on success or error and exceptions
     196    try (       
     197        GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));     
     198        FileOutputStream out = new FileOutputStream(outFile);
     199    ) {
     200        int len;
     201        while ((len = gzis.read(buffer)) > 0) {
     202            out.write(buffer, 0, len);
     203        }
     204       
     205        //gzis.close();
     206        //out.close();
     207       
     208        log("Unzipped " + inZipFile + " to " + outFile);
     209       
     210    } catch(IOException ex) {
     211        error("Failed to unzip " + inZipFile);
     212        ex.printStackTrace();
     213        return false;
     214    }
     215
     216    return true;
     217    }
     218   
     219    // wget will be launched from the specified directory, SITES_DIR
     220    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
     221    public boolean wgetWETfile(String ccWETfileURL) {
     222    // use SafeProcess and wget command in lib/config.properties
     223    // DONE: set up logging
     224   
     225    String wgetCmd = configProps.getProperty("wget.file.cmd");
     226    if(wgetCmd == null || wgetCmd.equals("")) {
     227        System.err.println("Invalid or empty wget.file.cmd in config.properties");
     228        return false;
     229    }
     230
     231    // replace the %%FILE_URL%% placeholder in the wget cmd with the WET file's URL
     232    wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL);
     233   
     234    log("Will launch wget with the command: " + wgetCmd);
     235
     236    String[] wgetCommandArgs = wgetCmd.split(" ");
     237    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR));
     238   
     239    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
     240    SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR);
     241       
     242    int success = wgetProcess.runProcess(outLineHandler, errLineHandler);
     243
     244    if(success != 0) {
     245        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\"");
     246        return false;
     247            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
     248    }
     249   
     250    return true;
     251    }
     252   
     253   
    127254    // wget will be launched from the specified directory, SITES_DIR
    128255    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
     
    158285   
    159286    return true;
    160     }
     287    }   
    161288
    162289   
     
    191318
    192319
    193     public boolean processDownloadedDir(String nzDomainURL) {
     320    public boolean processDownloadedDir(String ccWETfileURL) {
    194321    // recurse through the downloaded directory, then process each file
    195322
    196     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);
     323    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
    197324    log("*** Will process download dir " + downloadedSiteDir);
    198325   
    199326    if(!downloadedSiteDir.exists()) {
    200         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
     327        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
    201328        return false;
    202329    }
    203330    /*
    204331    if(!downloadedSiteDir.isDirectory()) {
    205         error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
     332        error("Downloaded site " + downloadedSiteDir + " is not a directory!");
    206333        return false; // TODO: or redo wget with "www." prefixed to URL???
    207334    }
    208335    */
    209     recursivelyProcessDir(nzDomainURL, downloadedSiteDir);
     336    recursivelyProcessDir(ccWETfileURL, downloadedSiteDir);
    210337
    211338    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
     
    214341    }
    215342
    216     private void recursivelyProcessDir(String nzDomainURL, File file) {
     343    private void recursivelyProcessDir(String ccWETfileURL, File file) {
    217344
    218345    if(file.isDirectory()) { // recursive step
     
    225352        // children array will be empty if 'file' dir was empty
    226353        for(int i = 0; i < children.length; i++) {
    227             recursivelyProcessDir(nzDomainURL, children[i]);
     354            recursivelyProcessDir(ccWETfileURL, children[i]);
    228355        }
    229356        }
    230357       
    231358    } else { // base step
    232         processFile(nzDomainURL, file);     
     359        processFile(ccWETfileURL, file);       
    233360    }
    234361    }   
    235362   
    236363    /*
    237     public boolean processDownloadedDir(String nzDomainURL) {
     364    public boolean processDownloadedDir(String ccWETfileURL) {
    238365    // recurse through the downloaded directory, then process each file
    239366
    240     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL);
     367    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL);
    241368    if(!downloadedSiteDir.exists()) {
    242         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
     369        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
    243370        return false;
    244371    }
     
    249376        return false;
    250377    }
    251     recursivelyProcessDir(nzDomainURL, files);
    252    
    253     return true;
    254     }
    255 
    256     private void recursivelyProcessDir(String nzDomainURL, File[] children) {
     378    recursivelyProcessDir(ccWETfileURL, files);
     379   
     380    return true;
     381    }
     382
     383    private void recursivelyProcessDir(String ccWETfileURL, File[] children) {
    257384    for(int i = 0; i < files.length; i++) {
    258385        if(files[i].isDirectory()) { // recursive step
     
    264391            error("IO error trying to list children of " + files[i]);
    265392        } else {
    266             recursivelyProcessDir(nzDomainURL, children);
     393            recursivelyProcessDir(ccWETfileURL, children);
    267394        }
    268395        }
    269396        else { // base step
    270         processFile(nzDomainURL, files[i]);
     397        processFile(ccWETfileURL, files[i]);
    271398        }
    272399    }
    273400    }*/
    274401   
    275     public boolean processFile(String nzDomainURL, File file) {
     402    public boolean processFile(String ccWETfileURL, File file) {
    276403    // skip if js, css, robots.txt
    277404    // if no-extension or htm or html, call processHTMLFile()
Note: See TracChangeset for help on using the changeset viewer.