Changeset 33467


Ignore:
Timestamp:
2019-09-13T17:44:41+12:00 (5 years ago)
Author:
ak19
Message:

Improved the code to use a static block to load the needed properties from config.properties and initialise some static final ints from there. Code now uses the logger for debugging. New properties in config.properties. Returned the code to using a counter, recordCount, that is re-zeroed for each WETProcessor, since the count was used for unique filenames and filename prefixes are unique for each warc.wet file. So by combining these prefixes with the recordCount kept per warc.wet file, each WET record written out to a file is assigned a unique filename. (A running total of all WET records across the processed warc.wet files is no longer needed to ensure uniqueness of filenames.) All appears to still work similarly to the previous commit in creating the discard and keep subfolders.

Location:
gs3-extensions/maori-lang-detection
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/CommonCrawl.txt

    r33457 r33467  
    4747Sebastian
    4848
     49====================
     50wharariki:[239]/Scratch/ak19/gs3-extensions/maori-lang-detection/src>java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
     51
     52wharariki:[188]/Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET>ls keep | wc
     53   4090    4090   65440
     54wharariki:[189]/Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET>ls discard | wc
     55   1515    1515   24240
     56
     57We keep 4090 WET records and are discarding 1515.
     58
    4959=======================
    5060Latest version of the index's schema:
  • gs3-extensions/maori-lang-detection/MoreReading/Vagrant-Spark-Hadoop.txt

    r33457 r33467  
    221221 
    222222
    223 
    224223vagrant@node1:~$ locate guava.jar
    225224/usr/share/java/guava.jar
     
    243242vagrant@node1:~/ia-hadoop-tools$ hdfs dfs -put /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
    244243put: `/usr/local/hadoop/share/hadoop/common/.': No such file or directory
    245 # hadoop classpath locations are not hdfs filesystem
     244# hadoop classpath locations are not on the hdfs filesystem, but on the regular fs
    246245
    247246vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/.
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33412 r33467  
    99# for downloading a single file
    1010wget.file.cmd=wget %%FILE_URL%%
     11
     12# Arbitrary cutoff values for WETProcessor.java
     13WETprocessor.min.content.length=100
     14WETprocessor.min.line.count=2
     15WETprocessor.min.content.length.wrapped.line=500
     16WETprocessor.min.spaces.per.wrapped.line=10
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33466 r33467  
    44import java.util.zip.GZIPInputStream;
    55
     6import org.apache.log4j.Logger;
     7
    68public class Utility {
     9    private static Logger logger = Logger.getLogger(org.greenstone.atea.Utility.class.getName());
    710   
    811    // Run gunzip
     
    2629        //out.close();
    2730       
    28         //log("Unzipped " + inZipFile + " to " + outFile);
     31        logger.debug("Unzipped " + inZipFile + " to " + outFile);
    2932       
    3033    } catch(IOException ex) {
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33466 r33467  
    55import java.util.Properties;
    66import java.util.zip.GZIPInputStream;
     7
    78import org.apache.log4j.Logger;
    89
     
    2627*/
    2728public class WETProcessor {
    28     //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
    29     private Properties configProperties = null;
    30 
     29    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());   
     30    private static Properties configProperties = new Properties();
     31
     32    // In Java, can initialize static final variables inside a static block
     33    // But the unavoidable try/catch in this static block prevents initialization of
     34    // the static final int variables further below that therefore need to be declared
     35    // and initialized thereafter.
     36    static {   
     37   
     38    // load up the properties from the config file
     39    try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
     40        configProperties = new Properties();
     41        configProperties.load(infile);     
     42        //infile.close();
     43       
     44    } catch(Exception e) {
     45        System.err.println("Exception attempting to read properties from config.properties.");
     46        logger.error("Exception attempting to read properties from config.properties.");
     47        e.printStackTrace();
     48    }
     49    }
    3150   
    32     // arbitrary cut-off values, TODO: put in .properties file
    33     private static final int MIN_CONTENT_LENGTH = 100;
    34     private static final int MIN_LINE_COUNT = 2;
    35     private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500;
    36     private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;   
    37 
    38     // keep a count of all the records that all WETProcessors instantiated
    39     // by our main method have processed
    40     private static int recordCount = 0;
     51    // Providing fall-back cutoff values if config.properties doesn't load
     52    // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
     53    private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
     54    private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
     55    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
     56    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
     57   
     58    // Keep a count of all the records that all WETProcessors instantiated
     59    // by our main method combined have processed
     60    //private static int recordCount = 0;
    4161
    4262    private final File outputFolder;
     
    4464
    4565
    46     public static int getRecordCount() { return recordCount; }
    47 
    48     public WETProcessor(File inFile, File outFolder, Properties configProps) {
     66    /**
     67     * WET processor processes a single warc.wet file containing multiple WET records
     68     * containing text identified as primary langcode=mri. Each individual WET record is written
     69     * out to a uniquely named file in either the keep or discard folder depending on the WET
     70     * record's content length and number of lines of actual content (excluding WARC headers).
     71     */
     72    public WETProcessor(File inFile, File outFolder) {
    4973    this.outputFolder = outFolder;
    50     this.configProperties = configProps;
    5174   
    5275    StringBuilder record = null;
    5376    String line = null;
    5477    boolean readingRecord = false;
    55 
    56     //this.recordCount = 0;
     78   
     79    int recordCount = 0;
    5780   
    5881    int contentLength = -1; // of record
     
    6285    // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
    6386    // the prefix will be everything after the last hyphen and without file extension,
    64     // so "000000" in our example.
     87    // so "000000" in our example. Then suffix the recordCount (keeping track of the current
     88    // WET record) to get a unique filename to store each WET record into.
    6589
    6690    String fileID = inFile.getName();
     
    129153    }
    130154
    131    
     155    /**
     156     * Determines if a WET record belongs in the keep or discard pile depending on if it
     157     * contains enough text, based on contentLength and line count of the record body.
     158     * Then writes out the WET record to a uniquely named file in the keep or discard folder.
     159     */
    132160    private void processWETrecord(int recordID, int contentLength, int lineCount, String record)
    133161    {
     
    171199    try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
    172200        writer.write(record);
    173         writer.close();
     201        // Try-with-resources examples don't call close() explicitly:
     202        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
     203        //writer.close();
    174204    } catch(IOException ioe) {
    175205        ioe.printStackTrace();
     
    178208    }
    179209   
     210
     211    //public static int getRecordCount() { return recordCount; }
    180212   
    181213    public static void printUsage() {
     
    184216    }
    185217
    186     /** Filename filter that only lists warc.wet files or else warc.wet.gz files
     218    /** Filename filter to only list warc.wet files or else warc.wet.gz files
    187219     * for which unzipped warc.wet equivalents don't yet exist.
    188220     */
     
    191223    public boolean accept(File dir, String name) {
    192224        if(name.endsWith(".warc.wet")) {
    193         System.err.println("Will include " + name + " for processing.");
     225        logger.debug("Will include " + name + " for processing.");
    194226        return true;
    195227        }
     
    199231        File unzippedVersion = new File(dir, nameWithoutGZext);
    200232        if(unzippedVersion.exists()) {
    201             System.err.println("--- Unzipped version " + unzippedVersion + " exists.");
    202             System.err.println("Skipping " + name);
     233            logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
     234            logger.debug("Skipping " + name);
    203235            return false; // don't count gzipped version if unzipped version exists.
    204236        }
    205237        else {
    206             System.err.println("Only zipped version " + name + " exists.");
     238            logger.debug("Only zipped version " + name + " exists.");
    207239            return true; // No unzipped version, so have to work with gzipped version
    208240        }
    209241        }
    210        
    211         System.err.println("Skipping " + name);
    212242
    213243        // we're not even interested in any other file extensions
     244        logger.debug("Not a WET file. Skipping " + name);
    214245        return false;
    215246    }
     
    225256    File WETFileDir = new File(args[0]);
    226257    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
    227         System.err.println("Error: " + args[0] + " does not exist or is not a directory");
     258        System.out.println("Error: " + args[0] + " does not exist or is not a directory");
    228259        return;
    229260    }
     
    231262    File outFolder = new File(args[1]);
    232263    if(!outFolder.exists() || !outFolder.isDirectory()) {
    233         System.err.println("Error: " + args[1] + " does not exist or is not a directory.");
     264        System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
    234265        return;
    235266    }   
     
    244275    }
    245276
    246     // load up the properties from the config file
    247     Properties configProps;
    248     try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) {
    249         configProps = new Properties();
    250         configProps.load(infile);
    251         infile.close();
    252        
    253     } catch(Exception e) {
    254         System.err.println("Exception attempting to read properties from config.properties.");
    255         e.printStackTrace();
    256         return;
    257     }
    258 
    259277    // Will list all the warc.wet files in the input directory or else their gzipped versions
    260278    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
     
    262280    for(int i = 0; i < WETFiles.length; i++) {
    263281        File WETFile = WETFiles[i];     
    264         System.err.println("Processing WETfile: " + WETFile);
     282        logger.debug("Processing WETfile: " + WETFile);
    265283
    266284        // Any .gz files listed means they haven't been unzipped yet. So unzip.
     
    279297        if(!WETFile.exists() || !WETFile.isFile()) {
    280298        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
     299        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    281300        return;
    282301        }
    283302
    284303        // Finally, we can process this WETFile's records into the keep and discard pile
    285         System.err.println("Off to process " + WETFile);
    286         WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps);     
     304        logger.debug("Off to process " + WETFile);
     305        WETProcessor processor = new WETProcessor(WETFile, outFolder);     
    287306       
    288307    }
Note: See TracChangeset for help on using the changeset viewer.