source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33466

Last change on this file since 33466 was 33466, checked in by ak19, 5 years ago
  1. WETProcessor.main() now processes a folder of *.warc.wet(.gz) files. Each file's WET records are written out into individual files, each of which is put into either the keep folder or the discard folder based on the amount of content (number of lines and/or content-length). 2. Moved unzipFile() from NZTLDProcessor.java into the new Utility.java class as a static method.
File size: 10.1 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import org.apache.log4j.Logger;
8
9/**
10 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
11 * the WET records in each, putting each WET record into a file. Each file is put into a
12 * keep or discard folder, based on content-length and number of lines.
13 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
14 *
15 * To compile, including the jars in lib/ for compiling.
16 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
17 *
18 * To run, passing the log4j and other properties files in conf/ folder:
19 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
20 *
21 * e.g.
22 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
23 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
24 * - java -cp ".:../conf" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
25 *
26*/
27public class WETProcessor {
28 //private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
29 private Properties configProperties = null;
30
31
32 // arbitrary cut-off values, TODO: put in .properties file
33 private static final int MIN_CONTENT_LENGTH = 100;
34 private static final int MIN_LINE_COUNT = 2;
35 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500;
36 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;
37
38 // keep a count of all the records that all WETProcessors instantiated
39 // by our main method have processed
40 private static int recordCount = 0;
41
42 private final File outputFolder;
43 private final String WETFileID;
44
45
46 public static int getRecordCount() { return recordCount; }
47
48 public WETProcessor(File inFile, File outFolder, Properties configProps) {
49 this.outputFolder = outFolder;
50 this.configProperties = configProps;
51
52 StringBuilder record = null;
53 String line = null;
54 boolean readingRecord = false;
55
56 //this.recordCount = 0;
57
58 int contentLength = -1; // of record
59 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
60
61 // We just want a unique recordID prefix, which we get from the wet file name suffix:
62 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
63 // the prefix will be everything after the last hyphen and without file extension,
64 // so "000000" in our example.
65
66 String fileID = inFile.getName();
67 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
68 fileID = fileID.substring(0, fileID.indexOf("."));
69 this.WETFileID = fileID;
70
71 // read from WETfile
72 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
73
74 while((line = reader.readLine()) != null) { // readLine removes newline separator
75
76 if(line.startsWith("WARC-Type: warcinfo")) {
77 readingRecord = false;
78 record = null; // drop this record, which is just an info record not actual web page's text
79 recordCount--;
80 continue;
81 }
82
83 if(line.startsWith("WARC/1.0")) { // finished previous WET record
84 // process any previous record
85 if(record != null) {
86 processWETrecord(recordCount, contentLength, lineCount, record.toString());
87 record = null;
88 contentLength = -1;
89 lineCount = -1;
90 }
91
92 recordCount++;
93 // get ready to start a new record
94 readingRecord = true;
95 record = new StringBuilder();
96 }
97
98 if(readingRecord) { // append current line to current record
99
100 record.append(line + "\n"); // add back (unix style) line ending
101
102 // if the line is non-empty
103 // AND if we've started counting lines, which happens only when the current
104 // line is past WARC/WET headers and we're into the actual body portion
105 // of the WET record,
106 // start incrementing the line counter.
107 if(lineCount >= 0 && !line.trim().equals("")) {
108 lineCount++;
109 }
110 else if(line.startsWith("Content-Length:")) {
111 String contentLengthStr = line.substring("Content-Length:".length()).trim();
112 contentLength = Integer.parseInt(contentLengthStr);
113 lineCount = 0;
114 }
115
116 }
117
118 }
119
120 // flush the last record. If it was a warcinfo record, record would be null here
121 if(record != null) {
122 processWETrecord(recordCount, contentLength, lineCount, record.toString());
123 record = null;
124 }
125
126 } catch(IOException ioe) {
127 ioe.printStackTrace();
128 }
129 }
130
131
132 private void processWETrecord(int recordID, int contentLength, int lineCount, String record)
133 {
134 System.err.println("WET #" + this.WETFileID + " record #" + recordID
135 + " - contentLength: " + contentLength
136 + " - lineCount: " + lineCount);
137 //System.err.println(record);
138 //System.err.println("--------------------------");
139
140 String paddedFileName = String.format("%04d.txt", recordID);
141
142 File discardFolder = new File(this.outputFolder, "discard");
143 File keepFolder = new File(this.outputFolder, "keep");
144 File parentFolder = null;
145
146 if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
147 parentFolder = keepFolder;
148 System.err.println("@@@KEEPING");
149 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
150 int countSpaces = 0;
151 for(int i = 0; i < record.length(); i++) {
152 if(record.charAt(i) == ' ') countSpaces++;
153 }
154 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
155 // So we have at least 500 chars (possibly on a single wrapped line)
156 // containing at least 10 spaces. Such a record is also worth keeping.
157 parentFolder = keepFolder;
158 }
159 }
160
161 // if parentFolder still not set, set to discard pile folder
162 if(parentFolder == null) {
163 parentFolder = discardFolder;
164 System.err.println("@@@DISCARDING");
165 }
166
167 System.err.println("--------------------------");
168
169 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
170
171 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
172 writer.write(record);
173 writer.close();
174 } catch(IOException ioe) {
175 ioe.printStackTrace();
176 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
177 }
178 }
179
180
181 public static void printUsage() {
182 System.err.println("Run this program as:");
183 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
184 }
185
186 /** Filename filter that only lists warc.wet files or else warc.wet.gz files
187 * for which unzipped warc.wet equivalents don't yet exist.
188 */
189 private static class WETFilenameFilter implements FilenameFilter {
190
191 public boolean accept(File dir, String name) {
192 if(name.endsWith(".warc.wet")) {
193 System.err.println("Will include " + name + " for processing.");
194 return true;
195 }
196
197 if(name.endsWith(".warc.wet.gz")) {
198 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
199 File unzippedVersion = new File(dir, nameWithoutGZext);
200 if(unzippedVersion.exists()) {
201 System.err.println("--- Unzipped version " + unzippedVersion + " exists.");
202 System.err.println("Skipping " + name);
203 return false; // don't count gzipped version if unzipped version exists.
204 }
205 else {
206 System.err.println("Only zipped version " + name + " exists.");
207 return true; // No unzipped version, so have to work with gzipped version
208 }
209 }
210
211 System.err.println("Skipping " + name);
212
213 // we're not even interested in any other file extensions
214 return false;
215 }
216 }
217
218 public static void main(String[] args) {
219 if(args.length != 2) {
220 printUsage();
221 return;
222 }
223
224
225 File WETFileDir = new File(args[0]);
226 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
227 System.err.println("Error: " + args[0] + " does not exist or is not a directory");
228 return;
229 }
230
231 File outFolder = new File(args[1]);
232 if(!outFolder.exists() || !outFolder.isDirectory()) {
233 System.err.println("Error: " + args[1] + " does not exist or is not a directory.");
234 return;
235 }
236
237 File discardFolder = new File(outFolder, "discard");
238 if(!discardFolder.exists()) {
239 discardFolder.mkdir();
240 }
241 File keepFolder = new File(outFolder, "keep");
242 if(!keepFolder.exists()) {
243 keepFolder.mkdir();
244 }
245
246 // load up the properties from the config file
247 Properties configProps;
248 try (InputStream infile = Class.forName("org.greenstone.atea.WETProcessor").getClassLoader().getResourceAsStream("config.properties")) {
249 configProps = new Properties();
250 configProps.load(infile);
251 infile.close();
252
253 } catch(Exception e) {
254 System.err.println("Exception attempting to read properties from config.properties.");
255 e.printStackTrace();
256 return;
257 }
258
259 // Will list all the warc.wet files in the input directory or else their gzipped versions
260 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
261
262 for(int i = 0; i < WETFiles.length; i++) {
263 File WETFile = WETFiles[i];
264 System.err.println("Processing WETfile: " + WETFile);
265
266 // Any .gz files listed means they haven't been unzipped yet. So unzip.
267 String WETFilename = WETFile.toString();
268 if(WETFilename.endsWith(".gz")) {
269 File GZippedWETFile = WETFile;
270 String WETGZippedFilename = WETFilename;
271 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
272
273 WETFile = new File(WETFilename);
274 Utility.unzipFile(GZippedWETFile, WETFile);
275 }
276 // hereafter all WETFiles should refer to the unzipped version
277 // Check the unzipped WETFile exists
278
279 if(!WETFile.exists() || !WETFile.isFile()) {
280 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
281 return;
282 }
283
284 // Finally, we can process this WETFile's records into the keep and discard pile
285 System.err.println("Off to process " + WETFile);
286 WETProcessor processor = new WETProcessor(WETFile, outFolder, configProps);
287
288 }
289
290 return;
291 }
292}
Note: See TracBrowser for help on using the repository browser.