Changeset 33466 for gs3-extensions
- Timestamp: 2019-09-12T21:37:39+12:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files: 1 added, 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java
r33411 r33466 169 169 // don't have the WET file yet. Get it from the zip file, which we know we should have by now 170 170 171 boolean success = unzipFile(inZipFile, WETfile); 171 boolean success = Utility.unzipFile(inZipFile, WETfile); 172 log("Unzipped " + inZipFile + " to " + WETfile); 173 172 174 // whether we succeeded or not, get rid of the zipped file: 173 175 if(!inZipFile.delete()) { … … 185 187 } 186 188 189 /* 187 190 // Run gunzip 188 191 // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process … … 216 219 return true; 217 220 } 221 */ 218 222 219 223 // wget will be launched from the specified directory, SITES_DIR -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33465 r33466 1 2 3 1 package org.greenstone.atea; 4 2 5 3 6 4 import java.io.*; 5 import java.util.Properties; 6 import java.util.zip.GZIPInputStream; 7 import org.apache.log4j.Logger; 7 8 8 9 /** 9 * To compile: 10 * maori-lang-detection/src>javac -cp "." org/greenstone/atea/WETProcessor.java 11 * To run: 12 * java org.greenstone.atea.WETProcessor <wetfile>.wet.warc <outputFolder> 10 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through 11 * the WET records in each, putting each WET record into a file. Each file is put into a 12 * keep or discard folder, based on content-length and number of lines. 13 * A single instance of the WETProcessor class processes a single unzipped warc.wet file. 14 * 15 * To compile, including the jars in lib/ for compiling. 16 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java 17 * 18 * To run, passing the log4j and other properties files in conf/ folder: 19 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder> 20 * 13 21 * e.g. 
14 * - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 15 * - java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 22 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 23 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 24 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less 25 * 16 26 */ 17 27 public class WETProcessor { 28 //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 29 private Properties configProperties = null; 30 31 18 32 // arbitrary cut-off values, TODO: put in .properties file 19 33 private static final int MIN_CONTENT_LENGTH = 100; … … 21 35 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500; 22 36 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10; 23 37 38 // keep a count of all the records that all WETProcessors instantiated 39 // by our main method have processed 40 private static int recordCount = 0; 41 24 42 private final File outputFolder; 25 private final String WETFileID; 26 private int recordCount = -1; 27 28 /* 29 public WETProcessor(File inFile) { 30 43 private final String WETFileID; 44 45 46 public static int getRecordCount() { return recordCount; } 47 48 public WETProcessor(File inFile, File outFolder, Properties configProps) { 49 this.outputFolder = outFolder; 50 this.configProperties = configProps; 51 31 52 StringBuilder record = null; 32 53 String line = null; 33 54 boolean readingRecord = false; 34 35 // read from WETfile 36 try 
(BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 37 while((line = reader.readLine()) != null) { // readLine removes newline separator 38 39 if(line.startsWith("Content-Length:")) { 40 readingRecord = true; 41 record = new StringBuilder(); 42 continue; 43 } 44 45 if(readingRecord) { 46 if(line.startsWith("WARC/1.0")) { // finished previous WET record 47 readingRecord = false; 48 49 System.err.println("WET record:"); 50 System.err.println(record); 51 System.err.println("--------------------------"); 52 record = null; 53 54 processWETrecord(record); 55 } 56 else { 57 record.append(line + "\n"); // add back (unix style) line ending 58 } 59 } 60 } 61 } catch(IOException ioe) { 62 ioe.printStackTrace(); 63 } 64 } 65 66 public WETProcessor(File inFile, File outFolder) { 67 this.outputFolder = outFolder; 68 69 StringBuilder record = null; 70 String line = null; 71 boolean readingRecord = false; 72 73 int recordCount = 0; 74 75 // read from WETfile 76 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 77 78 while((line = reader.readLine()) != null) { // readLine removes newline separator 79 80 if(line.startsWith("WARC-Type: warcinfo")) { 81 readingRecord = false; 82 record = null; // drop this record, which is just an info record not actual web page's text 83 recordCount--; 84 continue; 85 } 86 87 if(line.startsWith("WARC/1.0")) { // finished previous WET record 88 // process any previous record 89 if(record != null) { 90 processWETrecord(record, recordCount); 91 record = null; 92 } 93 94 recordCount++; 95 // get ready to start a new record 96 readingRecord = true; 97 record = new StringBuilder(); 98 } 99 100 if(readingRecord) { // append current line to current record 101 record.append(line + "\n"); // add back (unix style) line ending 102 } 103 104 } 105 106 // flush the last record. 
If it was a warcinfo record, record would be null here 107 if(record != null) { 108 processWETrecord(record, recordCount); 109 record = null; 110 } 111 112 } catch(IOException ioe) { 113 ioe.printStackTrace(); 114 } 115 } 116 */ 117 118 public WETProcessor(File inFile, File outFolder) { 119 this.outputFolder = outFolder; 120 121 StringBuilder record = null; 122 String line = null; 123 boolean readingRecord = false; 124 125 this.recordCount = 0; 55 56 //this.recordCount = 0; 57 126 58 int contentLength = -1; // of record 127 59 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers) … … 197 129 } 198 130 199 /* 200 private void processWETrecord(StringBuilder str, int recordID) { 201 String record = str.toString(); 202 //System.err.println("WET record:"); 203 //System.err.println(record); 204 //System.err.println("--------------------------"); 205 206 String paddedFileName = String.format("%04d.txt", recordID); 207 File outFile = new File(this.outputFolder, paddedFileName); 208 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) { 209 writer.write(record); 210 writer.close(); 211 } catch(IOException ioe) { 212 ioe.printStackTrace(); 213 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile); 214 } 215 } 216 */ 217 218 public int getRecordCount() { return this.recordCount; } 219 220 private void processWETrecord(int recordID, int contentLength, int lineCount, String record) { 221 System.err.println("WET record - contentLength: " + contentLength + " - lineCount: " + lineCount); 131 132 private void processWETrecord(int recordID, int contentLength, int lineCount, String record) 133 { 134 System.err.println("WET #" + this.WETFileID + " record #" + recordID 135 + " - contentLength: " + contentLength 136 + " - lineCount: " + lineCount); 222 137 //System.err.println(record); 223 138 //System.err.println("--------------------------"); … … 266 181 public static void printUsage() { 267 182 
System.err.println("Run this program as:"); 268 System.err.println("\tWetProcessor <infile>.warc.wet <output folder path>"); 183 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>"); 184 } 185 186 /** Filename filter that only lists warc.wet files or else warc.wet.gz files 187 * for which unzipped warc.wet equivalents don't yet exist. 188 */ 189 private static class WETFilenameFilter implements FilenameFilter { 190 191 public boolean accept(File dir, String name) { 192 if(name.endsWith(".warc.wet")) { 193 System.err.println("Will include " + name + " for processing."); 194 return true; 195 } 196 197 if(name.endsWith(".warc.wet.gz")) { 198 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz")); 199 File unzippedVersion = new File(dir, nameWithoutGZext); 200 if(unzippedVersion.exists()) { 201 System.err.println("--- Unzipped version " + unzippedVersion + " exists."); 202 System.err.println("Skipping " + name); 203 return false; // don't count gzipped version if unzipped version exists. 
204 } 205 else { 206 System.err.println("Only zipped version " + name + " exists."); 207 return true; // No unzipped version, so have to work with gzipped version 208 } 209 } 210 211 System.err.println("Skipping " + name); 212 213 // we're not even interested in any other file extensions 214 return false; 215 } 269 216 } 270 217 … … 275 222 } 276 223 277 File WETfile = new File(args[0]); 278 if(!WETfile.exists() || !WETfile.isFile()) { 279 System.err.println("Error: " + args[0] + " does not exist or is not a file"); 224 225 File WETFileDir = new File(args[0]); 226 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 227 System.err.println("Error: " + args[0] + " does not exist or is not a directory"); 280 228 return; 281 229 } … … 285 233 System.err.println("Error: " + args[1] + " does not exist or is not a directory."); 286 234 return; 287 } 288 289 235 } 290 236 291 237 File discardFolder = new File(outFolder, "discard"); … … 296 242 if(!keepFolder.exists()) { 297 243 keepFolder.mkdir(); 298 } 299 300 WETProcessor processor = new WETProcessor(WETfile, outFolder); 301 return; 302 244 } 245 246 // load up the properties from the config file 247 Properties configProps; 248 try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) { 249 configProps = new Properties(); 250 configProps.load(infile); 251 infile.close(); 252 253 } catch(Exception e) { 254 System.err.println("Exception attempting to read properties from config.properties."); 255 e.printStackTrace(); 256 return; 257 } 258 259 // Will list all the warc.wet files in the input directory or else their gzipped versions 260 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 261 262 for(int i = 0; i < WETFiles.length; i++) { 263 File WETFile = WETFiles[i]; 264 System.err.println("Processing WETfile: " + WETFile); 265 266 // Any .gz files listed means they haven't been unzipped yet. So unzip. 
267 String WETFilename = WETFile.toString(); 268 if(WETFilename.endsWith(".gz")) { 269 File GZippedWETFile = WETFile; 270 String WETGZippedFilename = WETFilename; 271 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz")); 272 273 WETFile = new File(WETFilename); 274 Utility.unzipFile(GZippedWETFile, WETFile); 275 } 276 // hereafter all WETFiles should refer to the unzipped version 277 // Check the unzipped WETFile exists 278 279 if(!WETFile.exists() || !WETFile.isFile()) { 280 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 281 return; 282 } 283 284 // Finally, we can process this WETFile's records into the keep and discard pile 285 System.err.println("Off to process " + WETFile); 286 WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps); 287 288 } 289 290 return; 303 291 } 304 292 }
Note: See TracChangeset for help on using the changeset viewer.