source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33467

Last change on this file since 33467 was 33467, checked in by ak19, 5 years ago

Improved the code to use a static block to load the needed properties from config.properties and initialise some static final ints from there. Code now uses the logger for debugging. New properties in config.properties. Returned code to use a counter, recordCount, re-zeroed for each WETProcessor since the count was used for unique filenames, and filename prefixes are unique for each warc.wet file. So these prefixes, in combination with keeping track of the recordcount per warc.wet file, each WET record written out to a file is assigned a unique filename. (No longer need a running total of all WET records across warc.wet files processed ensuring uniqueness of filenames.) All appears to still work similarly to previous commit in creating discard and keep subfolders.

File size: 11.8 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7
8import org.apache.log4j.Logger;
9
10/**
11 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
12 * the WET records in each, putting each WET record into a file. Each file is put into a
13 * keep or discard folder, based on content-length and number of lines.
14 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
15 *
16 * To compile, including the jars in lib/ for compiling.
17 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
18 *
19 * To run, passing the log4j and other properties files in conf/ folder:
20 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
21 *
22 * e.g.
23 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
24 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
25 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
26 *
27*/
28public class WETProcessor {
29 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
30 private static Properties configProperties = new Properties();
31
32 // In Java, can initialize static final variables inside a static block
33 // But the unavoidable try/catch in this static block prevents initialization of
34 // the static final int variables further below that therefore need to be declared
35 // and initialized thereafter.
36 static {
37
38 // load up the properties from the config file
39 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
40 configProperties = new Properties();
41 configProperties.load(infile);
42 //infile.close();
43
44 } catch(Exception e) {
45 System.err.println("Exception attempting to read properties from config.properties.");
46 logger.error("Exception attempting to read properties from config.properties.");
47 e.printStackTrace();
48 }
49 }
50
51 // Providing fall-back cuttoff values if config.properties doesn't load
52 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
53 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
54 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
55 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
56 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
57
58 // Keep a count of all the records that all WETProcessors instantiated
59 // by our main method combined have processed
60 //private static int recordCount = 0;
61
62 private final File outputFolder;
63 private final String WETFileID;
64
65
66 /**
67 * WET processor processes a single warc.wet file containing multiple WET records
68 * containing text identified as primary langcode=mri. Each individual WET record is written
69 * out to a uniquely named file in either the keep or discard folder depending on the WET
70 * record's content length and number of lines of actual content (excluding WARC headers).
71 */
72 public WETProcessor(File inFile, File outFolder) {
73 this.outputFolder = outFolder;
74
75 StringBuilder record = null;
76 String line = null;
77 boolean readingRecord = false;
78
79 int recordCount = 0;
80
81 int contentLength = -1; // of record
82 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
83
84 // We just want a unique recordID prefix, which we get from the wet file name suffix:
85 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
86 // the prefix will be everything after the last hyphen and without file extension,
87 // so "000000" in our example. Then suffix the recordCount (keeping track of the current
88 // WET record) to get a unique filename to store each WET record into.
89
90 String fileID = inFile.getName();
91 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
92 fileID = fileID.substring(0, fileID.indexOf("."));
93 this.WETFileID = fileID;
94
95 // read from WETfile
96 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
97
98 while((line = reader.readLine()) != null) { // readLine removes newline separator
99
100 if(line.startsWith("WARC-Type: warcinfo")) {
101 readingRecord = false;
102 record = null; // drop this record, which is just an info record not actual web page's text
103 recordCount--;
104 continue;
105 }
106
107 if(line.startsWith("WARC/1.0")) { // finished previous WET record
108 // process any previous record
109 if(record != null) {
110 processWETrecord(recordCount, contentLength, lineCount, record.toString());
111 record = null;
112 contentLength = -1;
113 lineCount = -1;
114 }
115
116 recordCount++;
117 // get ready to start a new record
118 readingRecord = true;
119 record = new StringBuilder();
120 }
121
122 if(readingRecord) { // append current line to current record
123
124 record.append(line + "\n"); // add back (unix style) line ending
125
126 // if the line is non-empty
127 // AND if we've started counting lines, which happens only when the current
128 // line is past WARC/WET headers and we're into the actual body portion
129 // of the WET record,
130 // start incrementing the line counter.
131 if(lineCount >= 0 && !line.trim().equals("")) {
132 lineCount++;
133 }
134 else if(line.startsWith("Content-Length:")) {
135 String contentLengthStr = line.substring("Content-Length:".length()).trim();
136 contentLength = Integer.parseInt(contentLengthStr);
137 lineCount = 0;
138 }
139
140 }
141
142 }
143
144 // flush the last record. If it was a warcinfo record, record would be null here
145 if(record != null) {
146 processWETrecord(recordCount, contentLength, lineCount, record.toString());
147 record = null;
148 }
149
150 } catch(IOException ioe) {
151 ioe.printStackTrace();
152 }
153 }
154
155 /**
156 * Determines if a WET record belongs in the keep or discard pile depending on if it
157 * contains enough text, based on contentLength and line count of the record body.
158 * Then writes out the WET record to a uniquely named file in the keep or discard folder.
159 */
160 private void processWETrecord(int recordID, int contentLength, int lineCount, String record)
161 {
162 System.err.println("WET #" + this.WETFileID + " record #" + recordID
163 + " - contentLength: " + contentLength
164 + " - lineCount: " + lineCount);
165 //System.err.println(record);
166 //System.err.println("--------------------------");
167
168 String paddedFileName = String.format("%04d.txt", recordID);
169
170 File discardFolder = new File(this.outputFolder, "discard");
171 File keepFolder = new File(this.outputFolder, "keep");
172 File parentFolder = null;
173
174 if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
175 parentFolder = keepFolder;
176 System.err.println("@@@KEEPING");
177 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
178 int countSpaces = 0;
179 for(int i = 0; i < record.length(); i++) {
180 if(record.charAt(i) == ' ') countSpaces++;
181 }
182 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
183 // So we have at least 500 chars (possibly on a single wrapped line)
184 // containing at least 10 spaces. Such a record is also worth keeping.
185 parentFolder = keepFolder;
186 }
187 }
188
189 // if parentFolder still not set, set to discard pile folder
190 if(parentFolder == null) {
191 parentFolder = discardFolder;
192 System.err.println("@@@DISCARDING");
193 }
194
195 System.err.println("--------------------------");
196
197 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
198
199 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
200 writer.write(record);
201 // Try-with-resources examples don't call close() explicitly:
202 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
203 //writer.close();
204 } catch(IOException ioe) {
205 ioe.printStackTrace();
206 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
207 }
208 }
209
210
211 //public static int getRecordCount() { return recordCount; }
212
213 public static void printUsage() {
214 System.err.println("Run this program as:");
215 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
216 }
217
218 /** Filename filter to only list warc.wet files or else warc.wet.gz files
219 * for which unzipped warc.wet equivalents don't yet exist.
220 */
221 private static class WETFilenameFilter implements FilenameFilter {
222
223 public boolean accept(File dir, String name) {
224 if(name.endsWith(".warc.wet")) {
225 logger.debug("Will include " + name + " for processing.");
226 return true;
227 }
228
229 if(name.endsWith(".warc.wet.gz")) {
230 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
231 File unzippedVersion = new File(dir, nameWithoutGZext);
232 if(unzippedVersion.exists()) {
233 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
234 logger.debug("Skipping " + name);
235 return false; // don't count gzipped version if unzipped version exists.
236 }
237 else {
238 logger.debug("Only zipped version " + name + " exists.");
239 return true; // No unzipped version, so have to work with gzipped version
240 }
241 }
242
243 // we're not even interested in any other file extensions
244 logger.debug("Not a WET file. Skipping " + name);
245 return false;
246 }
247 }
248
249 public static void main(String[] args) {
250 if(args.length != 2) {
251 printUsage();
252 return;
253 }
254
255
256 File WETFileDir = new File(args[0]);
257 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
258 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
259 return;
260 }
261
262 File outFolder = new File(args[1]);
263 if(!outFolder.exists() || !outFolder.isDirectory()) {
264 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
265 return;
266 }
267
268 File discardFolder = new File(outFolder, "discard");
269 if(!discardFolder.exists()) {
270 discardFolder.mkdir();
271 }
272 File keepFolder = new File(outFolder, "keep");
273 if(!keepFolder.exists()) {
274 keepFolder.mkdir();
275 }
276
277 // Will list all the warc.wet files in the input directory or else their gzipped versions
278 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
279
280 for(int i = 0; i < WETFiles.length; i++) {
281 File WETFile = WETFiles[i];
282 logger.debug("Processing WETfile: " + WETFile);
283
284 // Any .gz files listed means they haven't been unzipped yet. So unzip.
285 String WETFilename = WETFile.toString();
286 if(WETFilename.endsWith(".gz")) {
287 File GZippedWETFile = WETFile;
288 String WETGZippedFilename = WETFilename;
289 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
290
291 WETFile = new File(WETFilename);
292 Utility.unzipFile(GZippedWETFile, WETFile);
293 }
294 // hereafter all WETFiles should refer to the unzipped version
295 // Check the unzipped WETFile exists
296
297 if(!WETFile.exists() || !WETFile.isFile()) {
298 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
299 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
300 return;
301 }
302
303 // Finally, we can process this WETFile's records into the keep and discard pile
304 logger.debug("Off to process " + WETFile);
305 WETProcessor processor = new WETProcessor(WETFile, outFolder);
306
307 }
308
309 return;
310 }
311}
Note: See TracBrowser for help on using the repository browser.