Changeset 33466

Show
Ignore:
Timestamp:
12.09.2019 21:37:39 (5 weeks ago)
Author:
ak19
Message:

1. WETProcessor.main() now processes a folder of *.warc.wet(.gz) files. Each WET record in a file is written out to its own individual file and put into either the keep folder or the discard folder, based on the amount of content (number of lines and/or content-length). 2. Moved unzipFile() from NZTLDProcessor.java into the new Utility.java class as a static method.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33411 r33466  
    169169        // don't have the WET file yet. Get it from the zip file, which we know we should have by now 
    170170         
    171         boolean success = unzipFile(inZipFile, WETfile); 
     171        boolean success = Utility.unzipFile(inZipFile, WETfile); 
     172        log("Unzipped " + inZipFile + " to " + WETfile); 
     173         
    172174        // whether we succeeded or not, get rid of the zipped file: 
    173175        if(!inZipFile.delete()) { 
     
    185187    } 
    186188 
     189    /* 
    187190    // Run gunzip 
    188191    // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process 
     
    216219    return true; 
    217220    } 
     221    */ 
    218222     
    219223    // wget will be launched from the specified directory, SITES_DIR 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33465 r33466  
    1  
    2  
    31package org.greenstone.atea; 
    42 
    53 
    64import java.io.*; 
     5import java.util.Properties; 
     6import java.util.zip.GZIPInputStream; 
     7import org.apache.log4j.Logger; 
    78 
    89/** 
    9  * To compile: 
    10  *      maori-lang-detection/src>javac -cp "." org/greenstone/atea/WETProcessor.java 
    11  * To run: 
    12  *      java org.greenstone.atea.WETProcessor <wetfile>.wet.warc <outputFolder> 
     10 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through 
     11 * the WET records in each, putting each WET record into a file. Each file is put into a 
     12 * keep or discard folder, based on content-length and number of lines. 
     13 * A single instance of the WETProcessor class processes a single unzipped warc.wet file. 
     14 * 
     15 * To compile, including the jars in lib/ for compiling. 
     16 *      maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java  
     17 * 
     18 * To run, passing the log4j and other properties files in conf/ folder: 
     19 *      maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor &lt;folder containing warc.wet(.gz) files&gt; &lt;outputFolder&gt; 
     20 * 
    1321 * e.g. 
    14  *    - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
    15  *    - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 
     22 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
     23 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
     24 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2&gt;&amp;1 | less 
     25 * 
    1626*/ 
    1727public class WETProcessor { 
     28    //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 
     29    private Properties configProperties = null; 
     30 
     31     
    1832    // arbitrary cut-off values, TODO: put in .properties file 
    1933    private static final int MIN_CONTENT_LENGTH = 100; 
     
    2135    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500; 
    2236    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;     
    23      
     37 
     38    // keep a count of all the records that all WETProcessors instantiated 
     39    // by our main method have processed 
     40    private static int recordCount = 0; 
     41 
    2442    private final File outputFolder; 
    25     private final String WETFileID;     
    26     private int recordCount = -1; 
    27      
    28     /* 
    29     public WETProcessor(File inFile) { 
    30  
     43    private final String WETFileID; 
     44 
     45 
     46    public static int getRecordCount() { return recordCount; } 
     47 
     48    public WETProcessor(File inFile, File outFolder, Properties configProps) { 
     49    this.outputFolder = outFolder; 
     50    this.configProperties = configProps; 
     51     
    3152    StringBuilder record = null; 
    3253    String line = null;  
    3354    boolean readingRecord = false; 
    34      
    35     // read from WETfile 
    36     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 
    37         while((line = reader.readLine()) != null) { // readLine removes newline separator 
    38  
    39         if(line.startsWith("Content-Length:")) { 
    40             readingRecord = true; 
    41             record = new StringBuilder(); 
    42             continue; 
    43         } 
    44  
    45         if(readingRecord) { 
    46             if(line.startsWith("WARC/1.0")) { // finished previous WET record 
    47             readingRecord = false; 
    48              
    49             System.err.println("WET record:"); 
    50             System.err.println(record); 
    51             System.err.println("--------------------------"); 
    52             record = null; 
    53  
    54             processWETrecord(record); 
    55             } 
    56             else { 
    57             record.append(line + "\n"); // add back (unix style) line ending 
    58             } 
    59         } 
    60         } 
    61     } catch(IOException ioe) { 
    62         ioe.printStackTrace(); 
    63     } 
    64     } 
    65  
    66     public WETProcessor(File inFile, File outFolder) { 
    67     this.outputFolder = outFolder; 
    68      
    69     StringBuilder record = null; 
    70     String line = null;  
    71     boolean readingRecord = false; 
    72  
    73     int recordCount = 0; 
    74      
    75     // read from WETfile 
    76     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 
    77          
    78         while((line = reader.readLine()) != null) { // readLine removes newline separator 
    79  
    80         if(line.startsWith("WARC-Type: warcinfo")) { 
    81             readingRecord = false; 
    82             record = null; // drop this record, which is just an info record not actual web page's text 
    83             recordCount--; 
    84             continue; 
    85         } 
    86          
    87         if(line.startsWith("WARC/1.0")) { // finished previous WET record 
    88             // process any previous record 
    89             if(record != null) { 
    90             processWETrecord(record, recordCount); 
    91             record = null; 
    92             } 
    93              
    94             recordCount++;           
    95             // get ready to start a new record 
    96             readingRecord = true; 
    97             record = new StringBuilder(); 
    98         } 
    99  
    100         if(readingRecord) { // append current line to current record 
    101             record.append(line + "\n"); // add back (unix style) line ending 
    102         } 
    103          
    104         } 
    105  
    106         // flush the last record. If it was a warcinfo record, record would be null here 
    107         if(record != null) { 
    108         processWETrecord(record, recordCount); 
    109         record = null; 
    110         } 
    111          
    112     } catch(IOException ioe) { 
    113         ioe.printStackTrace(); 
    114     } 
    115     } 
    116     */ 
    117  
    118     public WETProcessor(File inFile, File outFolder) { 
    119     this.outputFolder = outFolder; 
    120      
    121     StringBuilder record = null; 
    122     String line = null;  
    123     boolean readingRecord = false; 
    124  
    125     this.recordCount = 0; 
     55 
     56    //this.recordCount = 0; 
     57     
    12658    int contentLength = -1; // of record 
    12759    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers) 
     
    197129    } 
    198130 
    199     /* 
    200     private void processWETrecord(StringBuilder str, int recordID) { 
    201     String record = str.toString(); 
    202     //System.err.println("WET record:"); 
    203     //System.err.println(record); 
    204     //System.err.println("--------------------------"); 
    205  
    206     String paddedFileName = String.format("%04d.txt", recordID); 
    207     File outFile = new File(this.outputFolder, paddedFileName); 
    208     try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) { 
    209         writer.write(record); 
    210         writer.close(); 
    211     } catch(IOException ioe) { 
    212         ioe.printStackTrace(); 
    213         System.err.println("\n@@@@@@@@@ Error writing to file " + outFile); 
    214     } 
    215     } 
    216     */ 
    217  
    218     public int getRecordCount() { return this.recordCount; } 
    219      
    220     private void processWETrecord(int recordID, int contentLength, int lineCount, String record) { 
    221     System.err.println("WET record - contentLength: " + contentLength + " - lineCount: " + lineCount); 
     131     
     132    private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 
     133    { 
     134    System.err.println("WET #" + this.WETFileID + " record #" + recordID 
     135               + " - contentLength: " + contentLength 
     136               + " - lineCount: " + lineCount); 
    222137    //System.err.println(record); 
    223138    //System.err.println("--------------------------"); 
     
    266181    public static void printUsage() { 
    267182    System.err.println("Run this program as:"); 
    268     System.err.println("\tWetProcessor <infile>.warc.wet <output folder path>");     
     183    System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");    
     184    } 
     185 
     186    /** Filename filter that only lists warc.wet files or else warc.wet.gz files 
     187     * for which unzipped warc.wet equivalents don't yet exist. 
     188     */ 
     189    private static class WETFilenameFilter implements FilenameFilter { 
     190     
     191    public boolean accept(File dir, String name) { 
     192        if(name.endsWith(".warc.wet")) { 
     193        System.err.println("Will include " + name + " for processing."); 
     194        return true; 
     195        } 
     196 
     197        if(name.endsWith(".warc.wet.gz")) { 
     198        String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz")); 
     199        File unzippedVersion = new File(dir, nameWithoutGZext); 
     200        if(unzippedVersion.exists()) { 
     201            System.err.println("--- Unzipped version " + unzippedVersion + " exists."); 
     202            System.err.println("Skipping " + name); 
     203            return false; // don't count gzipped version if unzipped version exists. 
     204        } 
     205        else { 
     206            System.err.println("Only zipped version " + name + " exists."); 
     207            return true; // No unzipped version, so have to work with gzipped version 
     208        } 
     209        } 
     210         
     211        System.err.println("Skipping " + name); 
     212 
     213        // we're not even interested in any other file extensions 
     214        return false; 
     215    } 
    269216    } 
    270217     
     
    275222    } 
    276223 
    277     File WETfile = new File(args[0]); 
    278     if(!WETfile.exists() || !WETfile.isFile()) { 
    279         System.err.println("Error: " + args[0] + " does not exist or is not a file"); 
     224     
     225    File WETFileDir = new File(args[0]); 
     226    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 
     227        System.err.println("Error: " + args[0] + " does not exist or is not a directory"); 
    280228        return; 
    281229    } 
     
    285233        System.err.println("Error: " + args[1] + " does not exist or is not a directory."); 
    286234        return; 
    287     } 
    288  
    289      
     235    }    
    290236     
    291237    File discardFolder = new File(outFolder, "discard"); 
     
    296242    if(!keepFolder.exists()) { 
    297243        keepFolder.mkdir(); 
    298     }    
    299      
    300     WETProcessor processor = new WETProcessor(WETfile, outFolder); 
    301     return; 
    302      
     244    } 
     245 
     246    // load up the properties from the config file 
     247    Properties configProps; 
     248    try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) { 
     249        configProps = new Properties(); 
     250        configProps.load(infile); 
     251        infile.close(); 
     252         
     253    } catch(Exception e) { 
     254        System.err.println("Exception attempting to read properties from config.properties."); 
     255        e.printStackTrace(); 
     256        return; 
     257    } 
     258 
     259    // Will list all the warc.wet files in the input directory or else their gzipped versions 
     260    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 
     261 
     262    for(int i = 0; i < WETFiles.length; i++) { 
     263        File WETFile = WETFiles[i];      
     264        System.err.println("Processing WETfile: " + WETFile); 
     265 
     266        // Any .gz files listed means they haven't been unzipped yet. So unzip. 
     267        String WETFilename = WETFile.toString(); 
     268        if(WETFilename.endsWith(".gz")) { 
     269        File GZippedWETFile = WETFile; 
     270        String WETGZippedFilename = WETFilename; 
     271        WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));      
     272 
     273        WETFile = new File(WETFilename); 
     274        Utility.unzipFile(GZippedWETFile, WETFile); 
     275        } 
     276        // hereafter all WETFiles should refer to the unzipped version 
     277        // Check the unzipped WETFile exists         
     278 
     279        if(!WETFile.exists() || !WETFile.isFile()) { 
     280        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 
     281        return; 
     282        } 
     283 
     284        // Finally, we can process this WETFile's records into the keep and discard pile 
     285        System.err.println("Off to process " + WETFile); 
     286        WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps);      
     287         
     288    } 
     289     
     290    return;  
    303291    } 
    304292}