source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33465

Last change on this file since 33465 was 33465, checked in by ak19, 5 years ago

Committing first version of the WETProcessor.java which takes a .warc.wet file and splits it into individual WET records, creating subfolders of keep and discard piles.

File size: 9.1 KB
Line 
1
2
3package org.greenstone.atea;
4
5
6import java.io.*;
7
/**
 * Splits a {@code .warc.wet} file into its individual WET records and writes each record
 * (WARC headers plus body text) to its own file in either the {@code keep} or the
 * {@code discard} subfolder of the output folder.
 *
 * <p>All the work is done by the constructor; {@link #getRecordCount()} reports how many
 * records were handled afterwards.
 *
 * <p>To compile:
 * <br>{@code maori-lang-detection/src>javac -cp "." org/greenstone/atea/WETProcessor.java}
 *
 * <p>To run:
 * <br>{@code java org.greenstone.atea.WETProcessor <wetfile>.warc.wet <outputFolder>}
 *
 * <p>e.g.
 * <br>{@code java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET}
 * <br>{@code java org.greenstone.atea.WETProcessor ../wetprocessor/MAORI-CC-2019-30-20190902100139-000000.warc.wet /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less}
 */
public class WETProcessor {
    // arbitrary cut-off values, TODO: put in .properties file
    private static final int MIN_CONTENT_LENGTH = 100;
    private static final int MIN_LINE_COUNT = 2;
    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = 500;
    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = 10;

    // names of the two output piles, created under the output folder
    private static final String KEEP_FOLDER_NAME = "keep";
    private static final String DISCARD_FOLDER_NAME = "discard";

    private final File outputFolder;
    private final String WETFileID;
    private int recordCount = -1;

    /**
     * Reads {@code inFile} line by line, splits it into WET records and writes every
     * non-warcinfo record to the keep or discard pile under {@code outFolder}.
     *
     * <p>A record starts at a line beginning with {@code WARC/1.0}. A record whose headers
     * contain a line beginning with {@code WARC-Type: warcinfo} is dropped and not counted.
     * Lines following the record's {@code Content-Length:} header are treated as body lines;
     * the non-empty ones are counted.
     *
     * <p>I/O errors while reading are reported on stderr and end processing early; they are
     * not propagated to the caller.
     *
     * @param inFile    the .warc.wet file to split; it is read as UTF-8
     * @param outFolder folder under which the "keep" and "discard" subfolders live
     */
    public WETProcessor(File inFile, File outFolder) {
        this.outputFolder = outFolder;
        this.WETFileID = deriveFileID(inFile.getName());
        this.recordCount = 0;

        StringBuilder record = null; // non-null only while inside a record we intend to keep track of
        int contentLength = -1; // of record, as declared by its Content-Length header
        int lineCount = -1; // number of non-empty body lines; -1 while still in the WARC/WET headers

        // Read explicitly as UTF-8 rather than relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(inFile), java.nio.charset.StandardCharsets.UTF_8))) {

            String line;
            while ((line = reader.readLine()) != null) { // readLine removes newline separator

                if (line.startsWith("WARC-Type: warcinfo")) {
                    // Drop this record: it is just an info record, not a web page's text.
                    // Only undo the count if a record was actually opened (and counted).
                    if (record != null) {
                        record = null;
                        recordCount--;
                    }
                    continue;
                }

                if (line.startsWith("WARC/1.0")) { // start of a new record, so the previous one is complete
                    if (record != null) {
                        processWETrecord(recordCount, contentLength, lineCount, record.toString());
                    }

                    // get ready to start a new record; always reset the per-record state so
                    // nothing leaks over from a dropped record
                    contentLength = -1;
                    lineCount = -1;
                    recordCount++;
                    record = new StringBuilder();
                }

                if (record == null) {
                    continue; // between records, or inside a dropped warcinfo record
                }

                record.append(line).append('\n'); // add back (unix style) line ending

                if (lineCount >= 0) {
                    // past the headers: count the non-empty body lines
                    if (!line.trim().isEmpty()) {
                        lineCount++;
                    }
                } else if (line.startsWith("Content-Length:")) {
                    contentLength = parseContentLength(line);
                    lineCount = 0; // body lines are counted from the next line on
                }
            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if (record != null) {
                processWETrecord(recordCount, contentLength, lineCount, record.toString());
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Derives a record-ID prefix from the WET file's name: everything after the last hyphen,
     * up to (not including) the first following dot. For example
     * {@code MAORI-CC-2019-30-20190902100139-000000.warc.wet} gives {@code 000000}.
     * Names without a hyphen and/or without a dot are handled by simply not trimming
     * at the missing separator.
     */
    private static String deriveFileID(String fileName) {
        String id = fileName.substring(fileName.lastIndexOf('-') + 1);
        int dot = id.indexOf('.');
        return (dot >= 0) ? id.substring(0, dot) : id;
    }

    /**
     * Parses the numeric value of a {@code Content-Length:} header line.
     *
     * @return the declared length, or -1 (with a warning on stderr) if the value is not a valid int
     */
    private static int parseContentLength(String headerLine) {
        String value = headerLine.substring("Content-Length:".length()).trim();
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException nfe) {
            System.err.println("@@@ Warning: unparsable Content-Length header value: " + value);
            return -1;
        }
    }

    /**
     * Counts the space characters in the body of a record, i.e. in the text after the first
     * blank line (which separates the WARC headers from the body). If the record has no blank
     * line, the whole record is scanned.
     */
    private static int countBodySpaces(String record) {
        int headerEnd = record.indexOf("\n\n");
        int start = (headerEnd >= 0) ? headerEnd + 2 : 0;
        int spaces = 0;
        for (int i = start; i < record.length(); i++) {
            if (record.charAt(i) == ' ') {
                spaces++;
            }
        }
        return spaces;
    }

    /**
     * @return the number of records handled so far, not counting dropped warcinfo records
     */
    public int getRecordCount() { return this.recordCount; }

    /**
     * Decides whether a record goes on the keep or the discard pile and writes it there as
     * {@code <WETFileID>-<zero-padded recordID>.txt}, encoded as UTF-8.
     *
     * <p>A record is kept if it has at least MIN_LINE_COUNT non-empty body lines and a declared
     * content length of at least MIN_CONTENT_LENGTH, or if its declared content length is at
     * least MIN_CONTENT_LENGTH_WRAPPED_LINE and its body contains at least
     * MIN_SPACES_IN_A_WRAPPED_LINE spaces (text possibly sitting on a single wrapped line).
     * Everything else is discarded.
     *
     * @param recordID      sequence number of the record within the WET file
     * @param contentLength value of the record's Content-Length header, or -1 if unknown
     * @param lineCount     number of non-empty body lines, or -1 if no body was seen
     * @param record        the full record text, headers included
     */
    private void processWETrecord(int recordID, int contentLength, int lineCount, String record) {
        System.err.println("WET record - contentLength: " + contentLength + " - lineCount: " + lineCount);

        boolean keep;
        if (lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
            keep = true;
        } else {
            // Spaces are counted in the body only: the WARC headers ("Name: value") carry
            // spaces of their own, which say nothing about the page's text.
            keep = contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE
                && countBodySpaces(record) >= MIN_SPACES_IN_A_WRAPPED_LINE;
        }

        System.err.println(keep ? "@@@KEEPING" : "@@@DISCARDING");
        System.err.println("--------------------------");

        File parentFolder = new File(this.outputFolder, keep ? KEEP_FOLDER_NAME : DISCARD_FOLDER_NAME);
        String paddedFileName = String.format("%04d.txt", recordID);
        File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);

        // The constructor is public, so do not rely on main() having created the pile folders.
        if (!ensureFolderExists(parentFolder)) {
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
            return;
        }

        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(outFile), java.nio.charset.StandardCharsets.UTF_8))) {
            writer.write(record);
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
        }
    }

    /**
     * Makes sure {@code folder} exists as a directory, creating it if necessary.
     *
     * @return true if the directory exists when this method returns; false (after printing
     *         an error on stderr) otherwise
     */
    private static boolean ensureFolderExists(File folder) {
        if (folder.isDirectory() || folder.mkdirs()) {
            return true;
        }
        System.err.println("Error: could not create folder " + folder);
        return false;
    }

    /** Prints a short description of the command line arguments on stderr. */
    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tWETProcessor <infile>.warc.wet <output folder path>");
    }

    /**
     * Command line entry point. Expects exactly two arguments: the path of an existing
     * .warc.wet file and the path of an existing output folder. Creates the "discard" and
     * "keep" subfolders if needed and then processes the WET file.
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            printUsage();
            return;
        }

        File WETfile = new File(args[0]);
        if (!WETfile.exists() || !WETfile.isFile()) {
            System.err.println("Error: " + args[0] + " does not exist or is not a file");
            return;
        }

        File outFolder = new File(args[1]);
        if (!outFolder.exists() || !outFolder.isDirectory()) {
            System.err.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        // Stop early if the output piles cannot be created, rather than failing per record.
        if (!ensureFolderExists(new File(outFolder, DISCARD_FOLDER_NAME))
            || !ensureFolderExists(new File(outFolder, KEEP_FOLDER_NAME))) {
            return;
        }

        new WETProcessor(WETfile, outFolder); // the constructor does all the processing
    }
}
Note: See TracBrowser for help on using the repository browser.