source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33480

Last change on this file since 33480 was 33480, checked in by ak19, 5 years ago

Much harder to remove pages where words are fused together as some are shorter than valid word-lengths of 15 chars, some are long, when the number of valid words still come to more than the required number of 20. The next solution was to ignore pages that had more than 2 instances of camelcase, but valid pages (actual Maori language pages) may end up with a few more camelcased words if navigation items get fused together. Not sure what to do.

File size: 16.7 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7
8import org.apache.log4j.Logger;
9
10/**
11 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
12 * the WET records in each, putting each WET record into a file. Each file is put into a
13 * keep or discard folder, based on the record's URL and on how many valid-length and camelCased words it contains.
14 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
15 *
16 * To compile, including the jars in lib/ for compiling.
17 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
18 *
19 * To run, passing the log4j and other properties files in conf/ folder:
20 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
21 *
22 * e.g.
23 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
24 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
25 *
26*/
27public class WETProcessor {
28 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
29 private static Properties configProperties = new Properties();
30
31 // In Java, can initialize static final variables inside a static block
32 // But the unavoidable try/catch in this static block prevents initialization of
33 // the static final int variables (seen further below) inside the block itself,
34 // that therefore need to be declared and initialized thereafter.
35 static {
36 // load up the properties from the config file
37 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
38 configProperties = new Properties();
39 configProperties.load(infile);
40 //infile.close();
41
42 } catch(Exception e) {
43 System.err.println("Exception attempting to read properties from config.properties.");
44 logger.error("Exception attempting to read properties from config.properties.");
45 e.printStackTrace();
46 }
47 }
48
49 // Providing fall-back cuttoff values if config.properties doesn't load
50 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
51 /*
52 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
53 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
54 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
55 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
56 */
57 private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
58 private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
59 private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
60
61 // File paths shared across WETProcessor instances
62 private static File discardFolder;
63 private static File keepFolder;
64 private static File keepURLsFile;
65 private static File discardURLsFile;
66
67 // WARC WET header lines and header line prefixes of interest
68 static final String WARC_RECORD_START = "WARC/1.0";
69 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
70 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
71 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
72
73 // Keep a count of all the records that all WETProcessors instantiated
74 // by our main method combined have processed
75 //private static int recordCount = 0;
76
77 private final File outputFolder;
78 private final String WETFileID;
79
80
81 /**
82 * WET processor processes a single warc.wet file containing multiple WET records
83 * containing text identified as primary langcode=mri. Each individual WET record is written
84 * out to a uniquely named file in either the keep or discard folder depending on the WET
85 * record's content length and number of lines of actual content (excluding WARC headers).
86 */
87 public WETProcessor(File inFile, File outFolder) {
88 this.outputFolder = outFolder;
89
90 StringBuilder record = null;
91 String line = null;
92 boolean readingRecord = false;
93
94 String WARCtargetURI = "";
95
96 int recordCount = 0;
97
98 int contentLength = -1; // of record
99 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
100
101 // We just want a unique recordID prefix, which we get from the wet file name suffix:
102 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
103 // the prefix will be everything after the last hyphen and without file extension,
104 // so "000000" in our example. Then suffix the recordCount (keeping track of the current
105 // WET record) to get a unique filename to store each WET record into.
106
107 String fileID = inFile.getName();
108 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
109 fileID = fileID.substring(0, fileID.indexOf("."));
110 this.WETFileID = fileID;
111
112
113 // read from WETfile
114 try (
115 BufferedReader reader = new BufferedReader(new FileReader(inFile));
116 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
117 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
118 ) {
119
120 while((line = reader.readLine()) != null) { // readLine removes newline separator
121
122 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
123 readingRecord = false;
124 record = null; // drop this record, which is just an info record not actual web page's text
125 recordCount--;
126 continue;
127 }
128
129 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
130 // process any previous record
131 if(record != null) {
132 processWETrecord(keepURLsWriter, discardURLsWriter,
133 recordCount, contentLength, lineCount,
134 WARCtargetURI, record.toString());
135 record = null;
136 contentLength = -1;
137 lineCount = -1;
138 }
139
140 recordCount++;
141 // get ready to start a new record
142 readingRecord = true;
143 record = new StringBuilder();
144 }
145
146 if(readingRecord) { // append current line to current record
147
148 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
149 // get and store the value
150 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
151 }
152
153 record.append(line + "\n"); // add back (unix style) line ending
154
155 // if the line is non-empty
156 // AND if we've started counting lines, which happens only when the current
157 // line is past WARC/WET headers and we're into the actual body portion
158 // of the WET record,
159 // start incrementing the line counter.
160 if(lineCount >= 0 && !line.trim().equals("")) {
161 lineCount++;
162 }
163 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
164 String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
165 contentLength = Integer.parseInt(contentLengthStr);
166 lineCount = 0;
167 }
168
169 }
170
171 }
172
173 // flush the last record. If it was a warcinfo record, record would be null here
174 if(record != null) {
175 processWETrecord(keepURLsWriter, discardURLsWriter,
176 recordCount, contentLength, lineCount,
177 WARCtargetURI, record.toString());
178 record = null;
179 }
180
181 } catch(IOException ioe) {
182 ioe.printStackTrace();
183 }
184 }
185
186 /**
187 * Determines if a WET record belongs in the keep or discard pile depending on if it
188 * contains enough text, based on contentLength and line count of the record body.
189 * Then writes out the WET record to a uniquely named file in the keep or discard folder,
190 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
191 */
192 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
193 int recordID, int contentLength, int lineCount,
194 String recordURI, String record)
195 {
196 System.err.println("WET #" + this.WETFileID + " record #" + recordID
197 + " - contentLength: " + contentLength
198 + " - lineCount: " + lineCount);
199 System.err.println("URI: " + recordURI);
200 //System.err.println(record);
201 //System.err.println("--------------------------");
202
203 String paddedFileName = String.format("%04d.txt", recordID);
204
205 File parentFolder = null;
206
207 // want to match "product(s)" but not "production"
208 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
209
210
211 if(recordURI.contains("product") && !recordURI.contains("production")) {
212
213 // don't want a "translated" product site/online store
214 // These curiously often tend to have "product(s)" in the URL
215 parentFolder = WETProcessor.discardFolder;
216 }
217 /*
218 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
219 parentFolder = WETProcessor.keepFolder;
220 System.err.println("@@@KEEPING");
221 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
222 int countSpaces = 0;
223 for(int i = 0; i < record.length(); i++) {
224 if(record.charAt(i) == ' ') countSpaces++;
225 }
226 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
227 // So we have at least 500 chars (possibly on a single wrapped line)
228 // containing at least 10 spaces. Such a record is also worth keeping.
229 parentFolder = WETProcessor.keepFolder;
230 }
231 }
232 */
233
234 else {
235 // If a web page's WET record contains a certain minimum number of words,
236 // we will think it's a meaningful web page and has sufficient content for text analysis
237 // to have been successful. Cut off values at present are:
238 // - a minimum of 20 words
239 // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
240 // to words having been glued together. This is used by irrelevant sites and moreover
241 // can't be analysed for language, so may not be actually MRI.
242
243 // Though StringTokenizer still in use, as seen in discussion at
244 // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
245 // String.split(regex) seems better for splitting on general whitespace
246 String[] allWords = record.split("\\s");
247 int validWordCount = 0;
248 int numCamelCaseWords = 0;
249 for(int i = 0; i < allWords.length; i++) {
250 String word = allWords[i];
251
252 // throw away if n words contain camelcase, which is another case of words glued together
253 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
254 numCamelCaseWords++;
255 }
256
257 // In Maori, word length of 1 is not uncommon
258 // but let's skip camelcased words when counting valid words
259 else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
260 }
261
262 // dump if too many camelcase words (ideally keep none of that kind?)
263 if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
264 parentFolder = WETProcessor.discardFolder;
265 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
266 }
267 else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
268 parentFolder = WETProcessor.keepFolder;
269 System.err.println("@@@KEEPING");
270 }
271 }
272 // if parentFolder still not set, set to discard pile folder
273 if(parentFolder == null) {
274 parentFolder = WETProcessor.discardFolder;
275 System.err.println("@@@DISCARDING");
276 }
277
278 try {
279 if (parentFolder == WETProcessor.keepFolder) {
280 keepURLsWriter.write(recordURI + "\n");
281 } else {
282 discardURLsWriter.write(recordURI + "\n");
283 }
284 } catch(Exception e) {
285 System.err.println("Unable to write URL");
286 e.printStackTrace();
287 }
288
289 System.err.println("--------------------------");
290
291 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
292
293 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
294 writer.write(record);
295 // Try-with-resources examples don't call close() explicitly:
296 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
297 //writer.close();
298 } catch(IOException ioe) {
299 ioe.printStackTrace();
300 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
301 }
302 }
303
304
305 //public static int getRecordCount() { return recordCount; }
306
307 public static void printUsage() {
308 System.err.println("Run this program as:");
309 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
310 }
311
312 /** Filename filter to only list warc.wet files or else warc.wet.gz files
313 * for which unzipped warc.wet equivalents don't yet exist.
314 */
315 private static class WETFilenameFilter implements FilenameFilter {
316
317 public boolean accept(File dir, String name) {
318 if(name.endsWith(".warc.wet")) {
319 logger.debug("Will include " + name + " for processing.");
320 return true;
321 }
322
323 if(name.endsWith(".warc.wet.gz")) {
324 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
325 File unzippedVersion = new File(dir, nameWithoutGZext);
326 if(unzippedVersion.exists()) {
327 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
328 logger.debug("Skipping " + name);
329 return false; // don't count gzipped version if unzipped version exists.
330 }
331 else {
332 logger.debug("Only zipped version " + name + " exists.");
333 return true; // No unzipped version, so have to work with gzipped version
334 }
335 }
336
337 // we're not even interested in any other file extensions
338 logger.debug("Not a WET file. Skipping " + name);
339 return false;
340 }
341 }
342
343 public static void main(String[] args) {
344 if(args.length != 2) {
345 printUsage();
346 return;
347 }
348
349
350 File WETFileDir = new File(args[0]);
351 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
352 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
353 return;
354 }
355
356 File outFolder = new File(args[1]);
357 if(!outFolder.exists() || !outFolder.isDirectory()) {
358 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
359 return;
360 }
361
362 // static folders and files to be shared across all WETProcessor instances
363 WETProcessor.discardFolder = new File(outFolder, "discard");
364 if(!WETProcessor.discardFolder.exists()) {
365 WETProcessor.discardFolder.mkdir();
366 }
367 WETProcessor.keepFolder = new File(outFolder, "keep");
368 if(!WETProcessor.keepFolder.exists()) {
369 WETProcessor.keepFolder.mkdir();
370 }
371
372 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
373 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
374 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
375 return;
376 }
377 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
378 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
379 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
380 return;
381 }
382
383 // Will list all the warc.wet files in the input directory or else their gzipped versions
384 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
385
386 for(int i = 0; i < WETFiles.length; i++) {
387 File WETFile = WETFiles[i];
388 logger.debug("Processing WETfile: " + WETFile);
389
390 // Any .gz files listed means they haven't been unzipped yet. So unzip.
391 String WETFilename = WETFile.toString();
392 if(WETFilename.endsWith(".gz")) {
393 File GZippedWETFile = WETFile;
394 String WETGZippedFilename = WETFilename;
395 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
396
397 WETFile = new File(WETFilename);
398 Utility.unzipFile(GZippedWETFile, WETFile);
399 }
400 // hereafter all WETFiles should refer to the unzipped version
401 // Check the unzipped WETFile exists
402
403 if(!WETFile.exists() || !WETFile.isFile()) {
404 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
405 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
406 return;
407 }
408
409 // Finally, we can process this WETFile's records into the keep and discard pile
410 logger.debug("Off to process " + WETFile);
411 WETProcessor processor = new WETProcessor(WETFile, outFolder);
412
413 }
414
415 return;
416 }
417}
Note: See TracBrowser for help on using the repository browser.