Changeset 33467

Show
Ignore:
Timestamp:
13.09.2019 17:44:41 (5 weeks ago)
Author:
ak19
Message:

Improved the code to use a static block to load the needed properties from config.properties and initialise some static final ints from there. Code now uses the logger for debugging. New properties in config.properties. Returned the code to using a counter, recordCount, that is re-zeroed for each WETProcessor, since the count was only used for unique filenames and filename prefixes are unique for each warc.wet file. Combining these prefixes with the recordCount kept per warc.wet file means each WET record written out to a file is assigned a unique filename. (A running total of all WET records across the warc.wet files processed is no longer needed to ensure uniqueness of filenames.) All appears to still work similarly to the previous commit in creating discard and keep subfolders.

Location:
gs3-extensions/maori-lang-detection
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/MoreReading/CommonCrawl.txt

    r33457 r33467  
    4747Sebastian 
    4848 
     49==================== 
     50wharariki:[239]/Scratch/ak19/gs3-extensions/maori-lang-detection/src>java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 
     51 
     52wharariki:[188]/Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET>ls keep | wc 
     53   4090    4090   65440 
     54wharariki:[189]/Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET>ls discard | wc 
     55   1515    1515   24240 
     56 
     57We keep 4090 WET records and are discarding 1515. 
     58 
    4959======================= 
    5060Latest version of the index's schema: 
  • gs3-extensions/maori-lang-detection/MoreReading/Vagrant-Spark-Hadoop.txt

    r33457 r33467  
    221221  
    222222 
    223  
    224223vagrant@node1:~$ locate guava.jar 
    225224/usr/share/java/guava.jar 
     
    243242vagrant@node1:~/ia-hadoop-tools$ hdfs dfs -put /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/. 
    244243put: `/usr/local/hadoop/share/hadoop/common/.': No such file or directory 
    245 # hadoop classpath locations are not hdfs filesystem 
     244# hadoop classpath locations are not on the hdfs filesystem, but on the regular fs 
    246245 
    247246vagrant@node1:~/ia-hadoop-tools$ sudo cp /usr/share/java/guava.jar /usr/local/hadoop/share/hadoop/common/. 
  • gs3-extensions/maori-lang-detection/conf/config.properties

    r33412 r33467  
    99# for downloading a single file 
    1010wget.file.cmd=wget %%FILE_URL%% 
     11 
     12# Arbitrary cutoff values for WETProcessor.java 
     13WETprocessor.min.content.length=100 
     14WETprocessor.min.line.count=2 
     15WETprocessor.min.content.length.wrapped.line=500 
     16WETprocessor.min.spaces.per.wrapped.line=10 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33466 r33467  
    44import java.util.zip.GZIPInputStream; 
    55 
     6import org.apache.log4j.Logger; 
     7 
    68public class Utility { 
     9    private static Logger logger = Logger.getLogger(org.greenstone.atea.Utility.class.getName()); 
    710     
    811    // Run gunzip 
     
    2629        //out.close(); 
    2730         
    28         //log("Unzipped " + inZipFile + " to " + outFile); 
     31        logger.debug("Unzipped " + inZipFile + " to " + outFile); 
    2932         
    3033    } catch(IOException ex) { 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33466 r33467  
    55import java.util.Properties; 
    66import java.util.zip.GZIPInputStream; 
     7 
    78import org.apache.log4j.Logger; 
    89 
     
    2627*/ 
    2728public class WETProcessor { 
    28     //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 
    29     private Properties configProperties = null; 
    30  
     29    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());     
     30    private static Properties configProperties = new Properties(); 
     31 
     32    // In Java, can initialize static final variables inside a static block 
     33    // But the unavoidable try/catch in this static block prevents initialization of 
     34    // the static final int variables further below that therefore need to be declared 
     35    // and initialized thereafter. 
     36    static {     
     37     
     38    // load up the properties from the config file 
     39    try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 
     40        configProperties = new Properties(); 
     41        configProperties.load(infile);       
     42        //infile.close(); 
     43         
     44    } catch(Exception e) { 
     45        System.err.println("Exception attempting to read properties from config.properties."); 
     46        logger.error("Exception attempting to read properties from config.properties."); 
     47        e.printStackTrace(); 
     48    } 
     49    } 
    3150     
    32     // arbitrary cut-off values, TODO: put in .properties file 
    33     private static final int MIN_CONTENT_LENGTH = 100; 
    34     private static final int MIN_LINE_COUNT = 2; 
    35     private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500; 
    36     private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;     
    37  
    38     // keep a count of all the records that all WETProcessors instantiated 
    39     // by our main method have processed 
    40     private static int recordCount = 0; 
     51    // Providing fall-back cutoff values if config.properties doesn't load 
     52    // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 
     53    private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 
     54    private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 
     55    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 
     56    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 
     57     
     58    // Keep a count of all the records that all WETProcessors instantiated 
     59    // by our main method combined have processed 
     60    //private static int recordCount = 0; 
    4161 
    4262    private final File outputFolder; 
     
    4464 
    4565 
    46     public static int getRecordCount() { return recordCount; } 
    47  
    48     public WETProcessor(File inFile, File outFolder, Properties configProps) { 
     66    /** 
     67     * WET processor processes a single warc.wet file containing multiple WET records 
     68     * containing text identified as primary langcode=mri. Each individual WET record is written 
     69     * out to a uniquely named file in either the keep or discard folder depending on the WET 
     70     * record's content length and number of lines of actual content (excluding WARC headers). 
     71     */ 
     72    public WETProcessor(File inFile, File outFolder) { 
    4973    this.outputFolder = outFolder; 
    50     this.configProperties = configProps; 
    5174     
    5275    StringBuilder record = null; 
    5376    String line = null;  
    5477    boolean readingRecord = false; 
    55  
    56     //this.recordCount = 0; 
     78     
     79    int recordCount = 0; 
    5780     
    5881    int contentLength = -1; // of record 
     
    6285    // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet 
    6386    // the prefix will be everything after the last hyphen and without file extension, 
    64     // so "000000" in our example. 
     87    // so "000000" in our example. Then suffix the recordCount (keeping track of the current 
     88    // WET record) to get a unique filename to store each WET record into. 
    6589 
    6690    String fileID = inFile.getName(); 
     
    129153    } 
    130154 
    131      
     155    /** 
     156     * Determines if a WET record belongs in the keep or discard pile depending on if it 
     157     * contains enough text, based on contentLength and line count of the record body. 
     158     * Then writes out the WET record to a uniquely named file in the keep or discard folder. 
     159     */ 
    132160    private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 
    133161    { 
     
    171199    try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) { 
    172200        writer.write(record); 
    173         writer.close(); 
     201        // Try-with-resources examples don't call close() explicitly: 
     202        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 
     203        //writer.close(); 
    174204    } catch(IOException ioe) { 
    175205        ioe.printStackTrace(); 
     
    178208    } 
    179209     
     210 
     211    //public static int getRecordCount() { return recordCount; } 
    180212     
    181213    public static void printUsage() { 
     
    184216    } 
    185217 
    186     /** Filename filter that only lists warc.wet files or else warc.wet.gz files 
     218    /** Filename filter to only list warc.wet files or else warc.wet.gz files 
    187219     * for which unzipped warc.wet equivalents don't yet exist. 
    188220     */ 
     
    191223    public boolean accept(File dir, String name) { 
    192224        if(name.endsWith(".warc.wet")) { 
    193         System.err.println("Will include " + name + " for processing."); 
     225        logger.debug("Will include " + name + " for processing."); 
    194226        return true; 
    195227        } 
     
    199231        File unzippedVersion = new File(dir, nameWithoutGZext); 
    200232        if(unzippedVersion.exists()) { 
    201             System.err.println("--- Unzipped version " + unzippedVersion + " exists."); 
    202             System.err.println("Skipping " + name); 
     233            logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 
     234            logger.debug("Skipping " + name); 
    203235            return false; // don't count gzipped version if unzipped version exists. 
    204236        } 
    205237        else { 
    206             System.err.println("Only zipped version " + name + " exists."); 
     238            logger.debug("Only zipped version " + name + " exists."); 
    207239            return true; // No unzipped version, so have to work with gzipped version 
    208240        } 
    209241        } 
    210          
    211         System.err.println("Skipping " + name); 
    212242 
    213243        // we're not even interested in any other file extensions 
     244        logger.debug("Not a WET file. Skipping " + name); 
    214245        return false; 
    215246    } 
     
    225256    File WETFileDir = new File(args[0]); 
    226257    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 
    227         System.err.println("Error: " + args[0] + " does not exist or is not a directory"); 
     258        System.out.println("Error: " + args[0] + " does not exist or is not a directory"); 
    228259        return; 
    229260    } 
     
    231262    File outFolder = new File(args[1]); 
    232263    if(!outFolder.exists() || !outFolder.isDirectory()) { 
    233         System.err.println("Error: " + args[1] + " does not exist or is not a directory."); 
     264        System.out.println("Error: " + args[1] + " does not exist or is not a directory."); 
    234265        return; 
    235266    }    
     
    244275    } 
    245276 
    246     // load up the properties from the config file 
    247     Properties configProps; 
    248     try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) { 
    249         configProps = new Properties(); 
    250         configProps.load(infile); 
    251         infile.close(); 
    252          
    253     } catch(Exception e) { 
    254         System.err.println("Exception attempting to read properties from config.properties."); 
    255         e.printStackTrace(); 
    256         return; 
    257     } 
    258  
    259277    // Will list all the warc.wet files in the input directory or else their gzipped versions 
    260278    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 
     
    262280    for(int i = 0; i < WETFiles.length; i++) { 
    263281        File WETFile = WETFiles[i];      
    264         System.err.println("Processing WETfile: " + WETFile); 
     282        logger.debug("Processing WETfile: " + WETFile); 
    265283 
    266284        // Any .gz files listed means they haven't been unzipped yet. So unzip. 
     
    279297        if(!WETFile.exists() || !WETFile.isFile()) { 
    280298        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 
     299        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 
    281300        return; 
    282301        } 
    283302 
    284303        // Finally, we can process this WETFile's records into the keep and discard pile 
    285         System.err.println("Off to process " + WETFile); 
    286         WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps);      
     304        logger.debug("Off to process " + WETFile); 
     305        WETProcessor processor = new WETProcessor(WETFile, outFolder);       
    287306         
    288307    }