- Timestamp:
- 2019-09-13T17:44:41+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33466 r33467 5 5 import java.util.Properties; 6 6 import java.util.zip.GZIPInputStream; 7 7 8 import org.apache.log4j.Logger; 8 9 … … 26 27 */ 27 28 public class WETProcessor { 28 //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 29 private Properties configProperties = null; 30 29 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 30 private static Properties configProperties = new Properties(); 31 32 // In Java, can initialize static final variables inside a static block 33 // But the unavoidable try/catch in this static block prevents initialization of 34 // the static final int variables further below that therefore need to be declared 35 // and initialized thereafter. 36 static { 37 38 // load up the properties from the config file 39 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 40 configProperties = new Properties(); 41 configProperties.load(infile); 42 //infile.close(); 43 44 } catch(Exception e) { 45 System.err.println("Exception attempting to read properties from config.properties."); 46 logger.error("Exception attempting to read properties from config.properties."); 47 e.printStackTrace(); 48 } 49 } 31 50 32 // arbitrary cut-off values, TODO: put in .properties file 33 private static final int MIN_CONTENT_LENGTH = 100; 34 private static final int MIN_LINE_COUNT = 2; 35 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500; 36 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10; 37 38 // keep a count of all the records that all WETProcessors instantiated 39 // by our main method have processed 40 private static int recordCount = 0; 51 // Providing fall-back cuttoff values if config.properties doesn't load 52 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 53 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 54 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 55 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 56 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 57 58 // Keep a count of all the records that all WETProcessors instantiated 59 // by our main method combined have processed 60 //private static int recordCount = 0; 41 61 42 62 private final File outputFolder; … … 44 64 45 65 46 public static int getRecordCount() { return recordCount; } 47 48 public WETProcessor(File inFile, File outFolder, Properties configProps) { 66 /** 67 * WET processor processes a single warc.wet file containing multiple WET records 68 * containing text identified as primary langcode=mri. Each individual WET record is written 69 * out to a uniquely named file in either the keep or discard folder depending on the WET 70 * record's content length and number of lines of actual content (excluding WARC headers). 71 */ 72 public WETProcessor(File inFile, File outFolder) { 49 73 this.outputFolder = outFolder; 50 this.configProperties = configProps;51 74 52 75 StringBuilder record = null; 53 76 String line = null; 54 77 boolean readingRecord = false; 55 56 //this.recordCount = 0;78 79 int recordCount = 0; 57 80 58 81 int contentLength = -1; // of record … … 62 85 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet 63 86 // the prefix will be everything after the last hyphen and without file extension, 64 // so "000000" in our example. 87 // so "000000" in our example. Then suffix the recordCount (keeping track of the current 88 // WET record) to get a unique filename to store each WET record into. 65 89 66 90 String fileID = inFile.getName(); … … 129 153 } 130 154 131 155 /** 156 * Determines if a WET record belongs in the keep or discard pile depending on if it 157 * contains enough text, based on contentLength and line count of the record body. 158 * Then writes out the WET record to a uniquely named file in the keep or discard folder. 159 */ 132 160 private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 133 161 { … … 171 199 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) { 172 200 writer.write(record); 173 writer.close(); 201 // Try-with-resources examples don't call close() explicitly: 202 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 203 //writer.close(); 174 204 } catch(IOException ioe) { 175 205 ioe.printStackTrace(); … … 178 208 } 179 209 210 211 //public static int getRecordCount() { return recordCount; } 180 212 181 213 public static void printUsage() { … … 184 216 } 185 217 186 /** Filename filter t hat only listswarc.wet files or else warc.wet.gz files218 /** Filename filter to only list warc.wet files or else warc.wet.gz files 187 219 * for which unzipped warc.wet equivalents don't yet exist. 188 220 */ … … 191 223 public boolean accept(File dir, String name) { 192 224 if(name.endsWith(".warc.wet")) { 193 System.err.println("Will include " + name + " for processing.");225 logger.debug("Will include " + name + " for processing."); 194 226 return true; 195 227 } … … 199 231 File unzippedVersion = new File(dir, nameWithoutGZext); 200 232 if(unzippedVersion.exists()) { 201 System.err.println("--- Unzipped version " + unzippedVersion + " exists.");202 System.err.println("Skipping " + name);233 logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 234 logger.debug("Skipping " + name); 203 235 return false; // don't count gzipped version if unzipped version exists. 204 236 } 205 237 else { 206 System.err.println("Only zipped version " + name + " exists.");238 logger.debug("Only zipped version " + name + " exists."); 207 239 return true; // No unzipped version, so have to work with gzipped version 208 240 } 209 241 } 210 211 System.err.println("Skipping " + name);212 242 213 243 // we're not even interested in any other file extensions 244 logger.debug("Not a WET file. Skipping " + name); 214 245 return false; 215 246 } … … 225 256 File WETFileDir = new File(args[0]); 226 257 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 227 System. err.println("Error: " + args[0] + " does not exist or is not a directory");258 System.out.println("Error: " + args[0] + " does not exist or is not a directory"); 228 259 return; 229 260 } … … 231 262 File outFolder = new File(args[1]); 232 263 if(!outFolder.exists() || !outFolder.isDirectory()) { 233 System. err.println("Error: " + args[1] + " does not exist or is not a directory.");264 System.out.println("Error: " + args[1] + " does not exist or is not a directory."); 234 265 return; 235 266 } … … 244 275 } 245 276 246 // load up the properties from the config file247 Properties configProps;248 try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) {249 configProps = new Properties();250 configProps.load(infile);251 infile.close();252 253 } catch(Exception e) {254 System.err.println("Exception attempting to read properties from config.properties.");255 e.printStackTrace();256 return;257 }258 259 277 // Will list all the warc.wet files in the input directory or else their gzipped versions 260 278 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); … … 262 280 for(int i = 0; i < WETFiles.length; i++) { 263 281 File WETFile = WETFiles[i]; 264 System.err.println("Processing WETfile: " + WETFile);282 logger.debug("Processing WETfile: " + WETFile); 265 283 266 284 // Any .gz files listed means they haven't been unzipped yet. So unzip. … … 279 297 if(!WETFile.exists() || !WETFile.isFile()) { 280 298 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 299 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 281 300 return; 282 301 } 283 302 284 303 // Finally, we can process this WETFile's records into the keep and discard pile 285 System.err.println("Off to process " + WETFile);286 WETProcessor processor = new WETProcessor(WETFile, outFolder , configProps);304 logger.debug("Off to process " + WETFile); 305 WETProcessor processor = new WETProcessor(WETFile, outFolder); 287 306 288 307 }
Note:
See TracChangeset
for help on using the changeset viewer.