Ignore:
Timestamp:
2019-09-12T21:37:39+12:00 (5 years ago)
Author:
ak19
Message:
  1. WETProcessor.main() now processes a folder of *.warc.wet(.gz) files. Each WET record in a file is written out to its own individual file and put into either the keep folder or the discard folder, based on the amount of content (number of lines and/or content-length). 2. Moved unzipFile() from NZTLDProcessor.java into the new Utility.java class as a static method.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33465 r33466  
    1 
    2 
    31package org.greenstone.atea;
    42
    53
    64import java.io.*;
     5import java.util.Properties;
     6import java.util.zip.GZIPInputStream;
     7import org.apache.log4j.Logger;
    78
    89/**
    9  * To compile:
    10  *      maori-lang-detection/src>javac -cp "." org/greenstone/atea/WETProcessor.java
    11  * To run:
    12  *      java org.greenstone.atea.WETProcessor <wetfile>.wet.warc <outputFolder>
     10 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
     11 * the WET records in each, putting each WET record into a file. Each file is put into a
     12 * keep or discard folder, based on content-length and number of lines.
     13 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
     14 *
     15 * To compile, with the jars in lib/ on the classpath:
     16 *      maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
     17 *
     18 * To run, passing the log4j and other properties files in conf/ folder:
     19 *      maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor &lt;folder containing warc.wet(.gz) files&gt; &lt;outputFolder&gt;
     20 *
    1321 * e.g.
    14  *    - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
    15  *    - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
     22 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
     23 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
     24 *    - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2&gt;&amp;1 | less
     25 *
    1626*/
    1727public class WETProcessor {
     28    //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
     29    private Properties configProperties = null;
     30
     31   
    1832    // arbitrary cut-off values, TODO: put in .properties file
    1933    private static final int MIN_CONTENT_LENGTH = 100;
     
    2135    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500;
    2236    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;   
    23    
     37
     38    // keep a count of all the records that all WETProcessors instantiated
     39    // by our main method have processed
     40    private static int recordCount = 0;
     41
    2442    private final File outputFolder;
    25     private final String WETFileID;   
    26     private int recordCount = -1;
    27    
    28     /*
    29     public WETProcessor(File inFile) {
    30 
     43    private final String WETFileID;
     44
     45
     46    public static int getRecordCount() { return recordCount; }
     47
     48    public WETProcessor(File inFile, File outFolder, Properties configProps) {
     49    this.outputFolder = outFolder;
     50    this.configProperties = configProps;
     51   
    3152    StringBuilder record = null;
    3253    String line = null;
    3354    boolean readingRecord = false;
    34    
    35     // read from WETfile
    36     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
    37         while((line = reader.readLine()) != null) { // readLine removes newline separator
    38 
    39         if(line.startsWith("Content-Length:")) {
    40             readingRecord = true;
    41             record = new StringBuilder();
    42             continue;
    43         }
    44 
    45         if(readingRecord) {
    46             if(line.startsWith("WARC/1.0")) { // finished previous WET record
    47             readingRecord = false;
    48            
    49             System.err.println("WET record:");
    50             System.err.println(record);
    51             System.err.println("--------------------------");
    52             record = null;
    53 
    54             processWETrecord(record);
    55             }
    56             else {
    57             record.append(line + "\n"); // add back (unix style) line ending
    58             }
    59         }
    60         }
    61     } catch(IOException ioe) {
    62         ioe.printStackTrace();
    63     }
    64     }
    65 
    66     public WETProcessor(File inFile, File outFolder) {
    67     this.outputFolder = outFolder;
    68    
    69     StringBuilder record = null;
    70     String line = null;
    71     boolean readingRecord = false;
    72 
    73     int recordCount = 0;
    74    
    75     // read from WETfile
    76     try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
    77        
    78         while((line = reader.readLine()) != null) { // readLine removes newline separator
    79 
    80         if(line.startsWith("WARC-Type: warcinfo")) {
    81             readingRecord = false;
    82             record = null; // drop this record, which is just an info record not actual web page's text
    83             recordCount--;
    84             continue;
    85         }
    86        
    87         if(line.startsWith("WARC/1.0")) { // finished previous WET record
    88             // process any previous record
    89             if(record != null) {
    90             processWETrecord(record, recordCount);
    91             record = null;
    92             }
    93            
    94             recordCount++;         
    95             // get ready to start a new record
    96             readingRecord = true;
    97             record = new StringBuilder();
    98         }
    99 
    100         if(readingRecord) { // append current line to current record
    101             record.append(line + "\n"); // add back (unix style) line ending
    102         }
    103        
    104         }
    105 
    106         // flush the last record. If it was a warcinfo record, record would be null here
    107         if(record != null) {
    108         processWETrecord(record, recordCount);
    109         record = null;
    110         }
    111        
    112     } catch(IOException ioe) {
    113         ioe.printStackTrace();
    114     }
    115     }
    116     */
    117 
    118     public WETProcessor(File inFile, File outFolder) {
    119     this.outputFolder = outFolder;
    120    
    121     StringBuilder record = null;
    122     String line = null;
    123     boolean readingRecord = false;
    124 
    125     this.recordCount = 0;
     55
     56    //this.recordCount = 0;
     57   
    12658    int contentLength = -1; // of record
    12759    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
     
    197129    }
    198130
    199     /*
    200     private void processWETrecord(StringBuilder str, int recordID) {
    201     String record = str.toString();
    202     //System.err.println("WET record:");
    203     //System.err.println(record);
    204     //System.err.println("--------------------------");
    205 
    206     String paddedFileName = String.format("%04d.txt", recordID);
    207     File outFile = new File(this.outputFolder, paddedFileName);
    208     try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
    209         writer.write(record);
    210         writer.close();
    211     } catch(IOException ioe) {
    212         ioe.printStackTrace();
    213         System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
    214     }
    215     }
    216     */
    217 
    218     public int getRecordCount() { return this.recordCount; }
    219    
    220     private void processWETrecord(int recordID, int contentLength, int lineCount, String record) {
    221     System.err.println("WET record - contentLength: " + contentLength + " - lineCount: " + lineCount);
     131   
     132    private void processWETrecord(int recordID, int contentLength, int lineCount, String record)
     133    {
     134    System.err.println("WET #" + this.WETFileID + " record #" + recordID
     135               + " - contentLength: " + contentLength
     136               + " - lineCount: " + lineCount);
    222137    //System.err.println(record);
    223138    //System.err.println("--------------------------");
     
    266181    public static void printUsage() {
    267182    System.err.println("Run this program as:");
    268     System.err.println("\tWetProcessor <infile>.warc.wet <output folder path>");   
     183    System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");   
     184    }
     185
     186    /** Filename filter that only lists warc.wet files or else warc.wet.gz files
     187     * for which unzipped warc.wet equivalents don't yet exist.
     188     */
     189    private static class WETFilenameFilter implements FilenameFilter {
     190   
     191    public boolean accept(File dir, String name) {
     192        if(name.endsWith(".warc.wet")) {
     193        System.err.println("Will include " + name + " for processing.");
     194        return true;
     195        }
     196
     197        if(name.endsWith(".warc.wet.gz")) {
     198        String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
     199        File unzippedVersion = new File(dir, nameWithoutGZext);
     200        if(unzippedVersion.exists()) {
     201            System.err.println("--- Unzipped version " + unzippedVersion + " exists.");
     202            System.err.println("Skipping " + name);
     203            return false; // don't count gzipped version if unzipped version exists.
     204        }
     205        else {
     206            System.err.println("Only zipped version " + name + " exists.");
     207            return true; // No unzipped version, so have to work with gzipped version
     208        }
     209        }
     210       
     211        System.err.println("Skipping " + name);
     212
     213        // we're not even interested in any other file extensions
     214        return false;
     215    }
    269216    }
    270217   
     
    275222    }
    276223
    277     File WETfile = new File(args[0]);
    278     if(!WETfile.exists() || !WETfile.isFile()) {
    279         System.err.println("Error: " + args[0] + " does not exist or is not a file");
     224   
     225    File WETFileDir = new File(args[0]);
     226    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
     227        System.err.println("Error: " + args[0] + " does not exist or is not a directory");
    280228        return;
    281229    }
     
    285233        System.err.println("Error: " + args[1] + " does not exist or is not a directory.");
    286234        return;
    287     }
    288 
    289    
     235    }   
    290236   
    291237    File discardFolder = new File(outFolder, "discard");
     
    296242    if(!keepFolder.exists()) {
    297243        keepFolder.mkdir();
    298     }   
    299    
    300     WETProcessor processor = new WETProcessor(WETfile, outFolder);
    301     return;
    302    
     244    }
     245
     246    // load up the properties from the config file
     247    Properties configProps;
     248    try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) {
     249        configProps = new Properties();
     250        configProps.load(infile);
     251        infile.close();
     252       
     253    } catch(Exception e) {
     254        System.err.println("Exception attempting to read properties from config.properties.");
     255        e.printStackTrace();
     256        return;
     257    }
     258
     259    // Will list all the warc.wet files in the input directory or else their gzipped versions
     260    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
     261
     262    for(int i = 0; i < WETFiles.length; i++) {
     263        File WETFile = WETFiles[i];     
     264        System.err.println("Processing WETfile: " + WETFile);
     265
     266        // Any .gz files listed means they haven't been unzipped yet. So unzip.
     267        String WETFilename = WETFile.toString();
     268        if(WETFilename.endsWith(".gz")) {
     269        File GZippedWETFile = WETFile;
     270        String WETGZippedFilename = WETFilename;
     271        WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));     
     272
     273        WETFile = new File(WETFilename);
     274        Utility.unzipFile(GZippedWETFile, WETFile);
     275        }
     276        // hereafter all WETFiles should refer to the unzipped version
     277        // Check the unzipped WETFile exists       
     278
     279        if(!WETFile.exists() || !WETFile.isFile()) {
     280        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
     281        return;
     282        }
     283
     284        // Finally, we can process this WETFile's records into the keep and discard pile
     285        System.err.println("Off to process " + WETFile);
     286        WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps);     
     287       
     288    }
     289   
     290    return;
    303291    }
    304292}
Note: See TracChangeset for help on using the changeset viewer.