source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33471

Last change on this file since 33471 was 33471, checked in by ak19, 5 years ago

Very minor changes.

File size: 14.4 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7
8import org.apache.log4j.Logger;
9
10/**
11 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
12 * the WET records in each, putting each WET record into a file. Each file is put into a
13 * keep or discard folder, based on content-length and number of lines.
14 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
15 *
16 * To compile, including the jars in lib/ for compiling.
17 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
18 *
19 * To run, passing the log4j and other properties files in conf/ folder:
20 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
21 *
22 * e.g.
23 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
24 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
25 *
26*/
27public class WETProcessor {
28 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
29 private static Properties configProperties = new Properties();
30
31 // In Java, can initialize static final variables inside a static block
32 // But the unavoidable try/catch in this static block prevents initialization of
33 // the static final int variables further below that therefore need to be declared
34 // and initialized thereafter.
35 static {
36
37 // load up the properties from the config file
38 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
39 configProperties = new Properties();
40 configProperties.load(infile);
41 //infile.close();
42
43 } catch(Exception e) {
44 System.err.println("Exception attempting to read properties from config.properties.");
45 logger.error("Exception attempting to read properties from config.properties.");
46 e.printStackTrace();
47 }
48 }
49
50 // Providing fall-back cuttoff values if config.properties doesn't load
51 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
52 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
53 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
54 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
55 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
56
57 // File paths shared across WETProcessor instances
58 private static File discardFolder;
59 private static File keepFolder;
60 private static File keepURLsFile;
61 private static File discardURLsFile;
62
63 // WARC WET header lines and header line prefixes of interest
64 static final String WARC_RECORD_START = "WARC/1.0";
65 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
66 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
67 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
68
69 // Keep a count of all the records that all WETProcessors instantiated
70 // by our main method combined have processed
71 //private static int recordCount = 0;
72
73 private final File outputFolder;
74 private final String WETFileID;
75
76
77 /**
78 * WET processor processes a single warc.wet file containing multiple WET records
79 * containing text identified as primary langcode=mri. Each individual WET record is written
80 * out to a uniquely named file in either the keep or discard folder depending on the WET
81 * record's content length and number of lines of actual content (excluding WARC headers).
82 */
83 public WETProcessor(File inFile, File outFolder) {
84 this.outputFolder = outFolder;
85
86 StringBuilder record = null;
87 String line = null;
88 boolean readingRecord = false;
89
90 String WARCtargetURI = "";
91
92 int recordCount = 0;
93
94 int contentLength = -1; // of record
95 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
96
97 // We just want a unique recordID prefix, which we get from the wet file name suffix:
98 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
99 // the prefix will be everything after the last hyphen and without file extension,
100 // so "000000" in our example. Then suffix the recordCount (keeping track of the current
101 // WET record) to get a unique filename to store each WET record into.
102
103 String fileID = inFile.getName();
104 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
105 fileID = fileID.substring(0, fileID.indexOf("."));
106 this.WETFileID = fileID;
107
108
109 // read from WETfile
110 try (
111 BufferedReader reader = new BufferedReader(new FileReader(inFile));
112 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
113 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
114 ) {
115
116 while((line = reader.readLine()) != null) { // readLine removes newline separator
117
118 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
119 readingRecord = false;
120 record = null; // drop this record, which is just an info record not actual web page's text
121 recordCount--;
122 continue;
123 }
124
125 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
126 // process any previous record
127 if(record != null) {
128 processWETrecord(keepURLsWriter, discardURLsWriter,
129 recordCount, contentLength, lineCount,
130 WARCtargetURI, record.toString());
131 record = null;
132 contentLength = -1;
133 lineCount = -1;
134 }
135
136 recordCount++;
137 // get ready to start a new record
138 readingRecord = true;
139 record = new StringBuilder();
140 }
141
142 if(readingRecord) { // append current line to current record
143
144 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
145 // get and store the value
146 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
147 }
148
149 record.append(line + "\n"); // add back (unix style) line ending
150
151 // if the line is non-empty
152 // AND if we've started counting lines, which happens only when the current
153 // line is past WARC/WET headers and we're into the actual body portion
154 // of the WET record,
155 // start incrementing the line counter.
156 if(lineCount >= 0 && !line.trim().equals("")) {
157 lineCount++;
158 }
159 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
160 String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
161 contentLength = Integer.parseInt(contentLengthStr);
162 lineCount = 0;
163 }
164
165 }
166
167 }
168
169 // flush the last record. If it was a warcinfo record, record would be null here
170 if(record != null) {
171 processWETrecord(keepURLsWriter, discardURLsWriter,
172 recordCount, contentLength, lineCount,
173 WARCtargetURI, record.toString());
174 record = null;
175 }
176
177 } catch(IOException ioe) {
178 ioe.printStackTrace();
179 }
180 }
181
182 /**
183 * Determines if a WET record belongs in the keep or discard pile depending on if it
184 * contains enough text, based on contentLength and line count of the record body.
185 * Then writes out the WET record to a uniquely named file in the keep or discard folder,
186 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
187 */
188 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
189 int recordID, int contentLength, int lineCount,
190 String recordURI, String record)
191 {
192 System.err.println("WET #" + this.WETFileID + " record #" + recordID
193 + " - contentLength: " + contentLength
194 + " - lineCount: " + lineCount);
195 System.err.println("URI: " + recordURI);
196 //System.err.println(record);
197 //System.err.println("--------------------------");
198
199 String paddedFileName = String.format("%04d.txt", recordID);
200
201 File parentFolder = null;
202
203 // want to match "product(s)" but not "production"
204
205 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
206 if(recordURI.contains("product") && !recordURI.contains("production")) {
207
208 // don't want a "translated" product site/online store
209 // These curiously often tend to have "product(s)" in the URL
210 parentFolder = WETProcessor.discardFolder;
211 }
212 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
213 parentFolder = WETProcessor.keepFolder;
214 System.err.println("@@@KEEPING");
215 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
216 int countSpaces = 0;
217 for(int i = 0; i < record.length(); i++) {
218 if(record.charAt(i) == ' ') countSpaces++;
219 }
220 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
221 // So we have at least 500 chars (possibly on a single wrapped line)
222 // containing at least 10 spaces. Such a record is also worth keeping.
223 parentFolder = WETProcessor.keepFolder;
224 }
225 }
226
227 // if parentFolder still not set, set to discard pile folder
228 if(parentFolder == null) {
229 parentFolder = WETProcessor.discardFolder;
230 System.err.println("@@@DISCARDING");
231 }
232
233 try {
234 if (parentFolder == WETProcessor.keepFolder) {
235 keepURLsWriter.write(recordURI + "\n");
236 } else {
237 discardURLsWriter.write(recordURI + "\n");
238 }
239 } catch(Exception e) {
240 System.err.println("Unable to write URL");
241 e.printStackTrace();
242 }
243
244 System.err.println("--------------------------");
245
246 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
247
248 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
249 writer.write(record);
250 // Try-with-resources examples don't call close() explicitly:
251 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
252 //writer.close();
253 } catch(IOException ioe) {
254 ioe.printStackTrace();
255 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
256 }
257 }
258
259
260 //public static int getRecordCount() { return recordCount; }
261
262 public static void printUsage() {
263 System.err.println("Run this program as:");
264 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
265 }
266
267 /** Filename filter to only list warc.wet files or else warc.wet.gz files
268 * for which unzipped warc.wet equivalents don't yet exist.
269 */
270 private static class WETFilenameFilter implements FilenameFilter {
271
272 public boolean accept(File dir, String name) {
273 if(name.endsWith(".warc.wet")) {
274 logger.debug("Will include " + name + " for processing.");
275 return true;
276 }
277
278 if(name.endsWith(".warc.wet.gz")) {
279 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
280 File unzippedVersion = new File(dir, nameWithoutGZext);
281 if(unzippedVersion.exists()) {
282 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
283 logger.debug("Skipping " + name);
284 return false; // don't count gzipped version if unzipped version exists.
285 }
286 else {
287 logger.debug("Only zipped version " + name + " exists.");
288 return true; // No unzipped version, so have to work with gzipped version
289 }
290 }
291
292 // we're not even interested in any other file extensions
293 logger.debug("Not a WET file. Skipping " + name);
294 return false;
295 }
296 }
297
298 public static void main(String[] args) {
299 if(args.length != 2) {
300 printUsage();
301 return;
302 }
303
304
305 File WETFileDir = new File(args[0]);
306 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
307 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
308 return;
309 }
310
311 File outFolder = new File(args[1]);
312 if(!outFolder.exists() || !outFolder.isDirectory()) {
313 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
314 return;
315 }
316
317 // static folders and files to be shared across all WETProcessor instances
318 WETProcessor.discardFolder = new File(outFolder, "discard");
319 if(!WETProcessor.discardFolder.exists()) {
320 WETProcessor.discardFolder.mkdir();
321 }
322 WETProcessor.keepFolder = new File(outFolder, "keep");
323 if(!WETProcessor.keepFolder.exists()) {
324 WETProcessor.keepFolder.mkdir();
325 }
326
327 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
328 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
329 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
330 return;
331 }
332 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
333 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
334 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
335 return;
336 }
337
338 // Will list all the warc.wet files in the input directory or else their gzipped versions
339 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
340
341 for(int i = 0; i < WETFiles.length; i++) {
342 File WETFile = WETFiles[i];
343 logger.debug("Processing WETfile: " + WETFile);
344
345 // Any .gz files listed means they haven't been unzipped yet. So unzip.
346 String WETFilename = WETFile.toString();
347 if(WETFilename.endsWith(".gz")) {
348 File GZippedWETFile = WETFile;
349 String WETGZippedFilename = WETFilename;
350 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
351
352 WETFile = new File(WETFilename);
353 Utility.unzipFile(GZippedWETFile, WETFile);
354 }
355 // hereafter all WETFiles should refer to the unzipped version
356 // Check the unzipped WETFile exists
357
358 if(!WETFile.exists() || !WETFile.isFile()) {
359 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
360 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
361 return;
362 }
363
364 // Finally, we can process this WETFile's records into the keep and discard pile
365 logger.debug("Off to process " + WETFile);
366 WETProcessor processor = new WETProcessor(WETFile, outFolder);
367
368 }
369
370 return;
371 }
372}
Note: See TracBrowser for help on using the repository browser.