Changeset 33411

Show
Ignore:
Timestamp:
13.08.2019 21:50:29 (9 days ago)
Author:
ak19
Message:

The newer version no longer mirrors sites with wget; instead it downloads the WET files and unzips them. Processing the actual contents of each 400MB WET file has not been implemented yet.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33410 r33411  
    33import java.io.*; 
    44import java.util.Properties; 
     5import java.util.zip.GZIPInputStream; 
    56 
    67import org.apache.log4j.Logger; 
     
    1718 *  
    1819 * Run as: 
    19  *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-urls-2019-08-09.txt 
     20 *     maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt 
    2021 */ 
    2122public class NZTLDProcessor 
     
    2930    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();     
    3031    private final String SITES_DIR = "../dwn-sites"; 
    31  
     32    private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/"; 
     33     
     34     
    3235    private void log(String msg) { 
    3336    System.err.println(msg); 
     
    4144     
    4245    private void error(String msg) { 
    43     System.err.println("ERROR: " + msg); 
     46    System.err.println("### ERROR: " + msg); 
    4447    logger.error(msg); 
    4548    } 
    4649     
    4750    private void warn(String msg) { 
    48     System.err.println("WARN: " + msg); 
     51    System.err.println("*** WARN: " + msg); 
    4952    logger.warn(msg); 
    5053    } 
     
    5457     
    5558    if(DEBUG_MODE) { 
    56         warn("**** Debugger is turned ON!!!"); 
     59        warn("Debugger is turned ON!!!"); 
    5760    } 
    5861     
     
    8588        log("Got URL: " + url); 
    8689 
     90        // skip urls containing "/crawldiagnostics/" and "/robotstxt/" 
     91        if(url.indexOf("/robotstxt/") != -1) continue; 
     92        if(url.indexOf("/crawldiagnostics/") != -1) continue; 
     93 
     94        // convert the remaining WARC urls to WET urls 
     95        url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-"); 
     96        url = url.replace(".warc.gz", ".warc.wet.gz"); 
     97 
     98        // add the prefix 
     99        url = COMMONCRAWL_DATA_PREFIX + url; 
     100         
     101        log("Final WET URL: " + url); 
     102             
    87103        processURL(url); 
    88104         
     
    103119    } 
    104120 
    105     public boolean processURL(String nzDomainURL) { 
     121    public boolean processURL(String ccWETfileURL) { 
    106122    // launch wget on URL 
    107123    // when download done, recurse through downloaded dir 
     
    112128 
    113129    if(DEBUG_MODE) { 
    114         nzDomainURL = "www.waikato.ac.nz"; 
    115     } 
    116     log("Processing seed URL: " + nzDomainURL); 
    117  
    118     //if(wgetMirrorSite(nzDomainURL)) { 
    119     if(!DEBUG_MODE) { wgetMirrorSite(nzDomainURL); } // TODO: take out debug_mode condition 
    120      
    121     processDownloadedDir(nzDomainURL); 
     130        ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz"; 
     131    } 
     132    log("Processing WET file URL: " + ccWETfileURL); 
     133 
     134    /* 
     135    //if(wgetMirrorSite(ccWETfileURL)) { 
     136    if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition 
     137     
     138    processDownloadedDir(ccWETfileURL); 
    122139     
    123140    //} 
    124     return true; 
    125     } 
    126  
     141    */ 
     142 
     143     
     144    processWETfile(ccWETfileURL); 
     145     
     146     
     147    return true; 
     148    } 
     149 
     150    public boolean processWETfile(String ccWETfileURL) { 
     151    String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1); 
     152    String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz")); 
     153 
     154    File inZipFile = new File(SITES_DIR, zippedWETfileName); 
     155    File WETfile = new File(SITES_DIR, unzippedWETfileName); 
     156     
     157    if(WETfile.exists()) { 
     158        log("Unzipped WET file " + WETfile + " already exists"); 
     159    }  
     160    else { 
     161 
     162        if(inZipFile.exists()) { 
     163        log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists");    
     164        } 
     165        else if(!wgetWETfile(ccWETfileURL)) { 
     166        return false; 
     167        }    
     168 
     169        // don't have the WET file yet. Get it from the zip file, which we know we should have by now 
     170         
     171        boolean success = unzipFile(inZipFile, WETfile); 
     172        // whether we succeeded or not, get rid of the zipped file: 
     173        if(!inZipFile.delete()) { 
     174        warn("Unable to delete zipped WET file: " + zippedWETfileName); 
     175        } 
     176         
     177        if(!success) { 
     178        return false; 
     179        } 
     180    } 
     181 
     182    // TODO: read in the giant WET file and process its contents (not yet implemented) 
     183     
     184    return true; 
     185    } 
     186 
     187    // Gunzip inZipFile into outFile 
     188    // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process 
     189    // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/ 
     190    public boolean unzipFile(File inZipFile, File outFile) { 
     191     
     192     
     193    byte[] buffer = new byte[1024]; 
     194 
     195    // try-with-resources will safely close streams/dispose resources on success or error and exceptions 
     196    try (        
     197        GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile));      
     198        FileOutputStream out = new FileOutputStream(outFile); 
     199    ) { 
     200        int len; 
     201        while ((len = gzis.read(buffer)) > 0) { 
     202            out.write(buffer, 0, len); 
     203        } 
     204         
     205        //gzis.close(); 
     206        //out.close(); 
     207         
     208        log("Unzipped " + inZipFile + " to " + outFile); 
     209         
     210    } catch(IOException ex) { 
     211        error("Failed to unzip " + inZipFile); 
     212        ex.printStackTrace(); 
     213        return false; 
     214    } 
     215 
     216    return true; 
     217    } 
     218     
     219    // wget will be launched from the specified directory, SITES_DIR 
     220    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 
     221    public boolean wgetWETfile(String ccWETfileURL) { 
     222    // use SafeProcess and wget command in lib/config.properties 
     223    // DONE: set up logging  
     224     
     225    String wgetCmd = configProps.getProperty("wget.file.cmd"); 
     226    if(wgetCmd == null || wgetCmd.equals("")) { 
     227        System.err.println("Invalid or empty wget.file.cmd in config.properties"); 
     228        return false; 
     229    } 
     230 
     231    // replace the %%FILE_URL%% placeholder in the wget cmd with the WET file url 
     232    wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL); 
     233     
     234    log("Will launch wget with the command: " + wgetCmd); 
     235 
     236    String[] wgetCommandArgs = wgetCmd.split(" "); 
     237    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR)); 
     238     
     239    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); 
     240    SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR); 
     241         
     242    int success = wgetProcess.runProcess(outLineHandler, errLineHandler); 
     243 
     244    if(success != 0) { 
     245        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\""); 
     246        return false; 
     247            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? 
     248    } 
     249     
     250    return true; 
     251    } 
     252     
     253     
    127254    // wget will be launched from the specified directory, SITES_DIR 
    128255    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 
     
    158285     
    159286    return true; 
    160     } 
     287    }     
    161288 
    162289     
     
    191318 
    192319 
    193     public boolean processDownloadedDir(String nzDomainURL) { 
     320    public boolean processDownloadedDir(String ccWETfileURL) { 
    194321    // recurse through the downloaded directory, then process each file 
    195322 
    196     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL); 
     323    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); 
    197324    log("*** Will process download dir " + downloadedSiteDir); 
    198325     
    199326    if(!downloadedSiteDir.exists()) { 
    200         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     327        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
    201328        return false; 
    202329    } 
    203330    /* 
    204331    if(!downloadedSiteDir.isDirectory()) { 
    205         error("*** Downloaded site " + downloadedSiteDir + " is not a directory!"); 
     332        error("Downloaded site " + downloadedSiteDir + " is not a directory!"); 
    206333        return false; // TODO: or redo wget with "www." prefixed to URL??? 
    207334    } 
    208335    */ 
    209     recursivelyProcessDir(nzDomainURL, downloadedSiteDir); 
     336    recursivelyProcessDir(ccWETfileURL, downloadedSiteDir); 
    210337 
    211338    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); 
     
    214341    } 
    215342 
    216     private void recursivelyProcessDir(String nzDomainURL, File file) { 
     343    private void recursivelyProcessDir(String ccWETfileURL, File file) { 
    217344 
    218345    if(file.isDirectory()) { // recursive step 
     
    225352        // children array will be empty if 'file' dir was empty 
    226353        for(int i = 0; i < children.length; i++) { 
    227             recursivelyProcessDir(nzDomainURL, children[i]); 
     354            recursivelyProcessDir(ccWETfileURL, children[i]); 
    228355        } 
    229356        } 
    230357         
    231358    } else { // base step 
    232         processFile(nzDomainURL, file);      
     359        processFile(ccWETfileURL, file);         
    233360    } 
    234361    }    
    235362     
    236363    /* 
    237     public boolean processDownloadedDir(String nzDomainURL) { 
     364    public boolean processDownloadedDir(String ccWETfileURL) { 
    238365    // recurse through the downloaded directory, then process each file 
    239366 
    240     File downloadedSiteDir = new File(SITES_DIR, nzDomainURL); 
     367    File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); 
    241368    if(!downloadedSiteDir.exists()) { 
    242         error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     369        error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
    243370        return false; 
    244371    } 
     
    249376        return false; 
    250377    } 
    251     recursivelyProcessDir(nzDomainURL, files); 
    252      
    253     return true; 
    254     } 
    255  
    256     private void recursivelyProcessDir(String nzDomainURL, File[] children) { 
     378    recursivelyProcessDir(ccWETfileURL, files); 
     379     
     380    return true; 
     381    } 
     382 
     383    private void recursivelyProcessDir(String ccWETfileURL, File[] children) { 
    257384    for(int i = 0; i < files.length; i++) { 
    258385        if(files[i].isDirectory()) { // recursive step 
     
    264391            error("IO error trying to list children of " + files[i]); 
    265392        } else { 
    266             recursivelyProcessDir(nzDomainURL, children); 
     393            recursivelyProcessDir(ccWETfileURL, children); 
    267394        } 
    268395        } 
    269396        else { // base step 
    270         processFile(nzDomainURL, files[i]); 
     397        processFile(ccWETfileURL, files[i]); 
    271398        } 
    272399    } 
    273400    }*/ 
    274401     
    275     public boolean processFile(String nzDomainURL, File file) { 
     402    public boolean processFile(String ccWETfileURL, File file) { 
    276403    // skip if js, css, robots.txt 
    277404    // if no-extension or htm or html, call processHTMLFile()