source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33497

Last change on this file since 33497 was 33497, checked in by ak19, 5 years ago

First version of discard url filter file. Inefficient implementation. Better to read the file once, adjust the filters as required and keep in memory to do the comparisons with each URL.

File size: 21.8 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.Set;
9import java.util.TreeSet;
10
11import org.apache.log4j.Logger;
12
13/**
14 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
15 * the WET records in each, putting each WET record into a file. Each file is put into a
16 * keep or discard folder, based on content-length and number of lines.
17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
18 *
19 * To compile, including the jars in lib/ for compiling.
20 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
21 *
22 * To run, passing the log4j and other properties files in conf/ folder:
23 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
24 *
25 * e.g.
26 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
27 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
28 *
29*/
30public class WETProcessor {
31 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
32 private static Properties configProperties = new Properties();
33
34 // In Java, can initialize static final variables inside a static block
35 // But the unavoidable try/catch in this static block prevents initialization of
36 // the static final int variables (seen further below) inside the block itself,
37 // that therefore need to be declared and initialized thereafter.
38 static {
39 // load up the properties from the config file
40 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
41 configProperties = new Properties();
42 configProperties.load(infile);
43 //infile.close();
44
45 } catch(Exception e) {
46 System.err.println("Exception attempting to read properties from config.properties.");
47 logger.error("Exception attempting to read properties from config.properties.");
48 e.printStackTrace();
49 }
50 }
51
52 // Providing fall-back cuttoff values if config.properties doesn't load
53 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
54 /*
55 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
56 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
57 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
58 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
59 */
60 private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
61 private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
62 private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
63
64 // File paths shared across WETProcessor instances
65 private static File discardFolder;
66 private static File keepFolder;
67 private static File keepURLsFile;
68 private static File discardURLsFile;
69
70 // WARC WET header lines and header line prefixes of interest
71 static final String WARC_RECORD_START = "WARC/1.0";
72 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
73 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
74 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
75
76 // Keep a count of all the records that all WETProcessors instantiated
77 // by our main method combined have processed
78 //private static int recordCount = 0;
79
80 private final File outputFolder;
81 private final String WETFileID;
82
83
84 /**
85 * WET processor processes a single warc.wet file containing multiple WET records
86 * containing text identified as primary langcode=mri. Each individual WET record is written
87 * out to a uniquely named file in either the keep or discard folder depending on the WET
88 * record's content length and number of lines of actual content (excluding WARC headers).
89 */
90 public WETProcessor(File inFile, File outFolder) {
91 this.outputFolder = outFolder;
92
93 StringBuilder record = null;
94 String line = null;
95 boolean readingRecord = false;
96
97 String WARCtargetURI = "";
98
99 int recordCount = 0;
100
101 int contentLength = -1; // of record
102 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
103
104 // We just want a unique recordID prefix, which we get from the wet file name suffix:
105 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
106 // the prefix will be everything after the last hyphen and without file extension,
107 // so "000000" in our example. Then suffix the recordCount (keeping track of the current
108 // WET record) to get a unique filename to store each WET record into.
109
110 String fileID = inFile.getName();
111 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
112 fileID = fileID.substring(0, fileID.indexOf("."));
113 this.WETFileID = fileID;
114
115
116 // read from WETfile
117 try (
118 BufferedReader reader = new BufferedReader(new FileReader(inFile));
119 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
120 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
121 ) {
122
123 while((line = reader.readLine()) != null) { // readLine removes newline separator
124
125 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
126 readingRecord = false;
127 record = null; // drop this record, which is just an info record not actual web page's text
128 recordCount--;
129 continue;
130 }
131
132 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
133 // process any previous record
134 if(record != null) {
135 processWETrecord(keepURLsWriter, discardURLsWriter,
136 recordCount, contentLength, lineCount,
137 WARCtargetURI, record.toString());
138 record = null;
139 contentLength = -1;
140 lineCount = -1;
141 }
142
143 recordCount++;
144 // get ready to start a new record
145 readingRecord = true;
146 record = new StringBuilder();
147 }
148
149 if(readingRecord) { // append current line to current record
150
151 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
152 // get and store the value
153 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
154 }
155
156 record.append(line + "\n"); // add back (unix style) line ending
157
158 // if the line is non-empty
159 // AND if we've started counting lines, which happens only when the current
160 // line is past WARC/WET headers and we're into the actual body portion
161 // of the WET record,
162 // start incrementing the line counter.
163 if(lineCount >= 0 && !line.trim().equals("")) {
164 lineCount++;
165 }
166 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
167 String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
168 contentLength = Integer.parseInt(contentLengthStr);
169 lineCount = 0;
170 }
171
172 }
173
174 }
175
176 // flush the last record. If it was a warcinfo record, record would be null here
177 if(record != null) {
178 processWETrecord(keepURLsWriter, discardURLsWriter,
179 recordCount, contentLength, lineCount,
180 WARCtargetURI, record.toString());
181 record = null;
182 }
183
184 } catch(IOException ioe) {
185 ioe.printStackTrace();
186 }
187 }
188
189 /**
190 * Determines if a WET record belongs in the keep or discard pile depending on if it
191 * contains enough text, based on contentLength and line count of the record body.
192 * Then writes out the WET record to a uniquely named file in the keep or discard folder,
193 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
194 */
195 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
196 int recordID, int contentLength, int lineCount,
197 String recordURI, String record)
198 {
199 System.err.println("WET #" + this.WETFileID + " record #" + recordID
200 + " - contentLength: " + contentLength
201 + " - lineCount: " + lineCount);
202 System.err.println("URI: " + recordURI);
203 //System.err.println(record);
204 //System.err.println("--------------------------");
205
206 String paddedFileName = String.format("%04d.txt", recordID);
207
208 File parentFolder = null;
209
210 // want to match "product(s)" but not "production"
211 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
212
213
214 /*
215 if(recordURI.contains("product") && !recordURI.contains("production")) {
216
217 // don't want a "translated" product site/online store
218 // These curiously often tend to have "product(s)" in the URL
219 parentFolder = WETProcessor.discardFolder;
220 }
221
222 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
223 parentFolder = WETProcessor.keepFolder;
224 System.err.println("@@@KEEPING");
225 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
226 int countSpaces = 0;
227 for(int i = 0; i < record.length(); i++) {
228 if(record.charAt(i) == ' ') countSpaces++;
229 }
230 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
231 // So we have at least 500 chars (possibly on a single wrapped line)
232 // containing at least 10 spaces. Such a record is also worth keeping.
233 parentFolder = WETProcessor.keepFolder;
234 }
235 }
236 */
237
238 if(isInDiscardFilter(recordURI)) {
239 parentFolder = WETProcessor.discardFolder;
240 }
241 else if(isInCheckFilter(recordURI)) { // products sites
242 parentFolder = WETProcessor.discardFolder; // TODO: checkfolder
243 } else {
244 // If a web page's WET record contains a certain minimum number of words,
245 // we will think it's a meaningful web page and has sufficient content for text analysis
246 // to have been successful. Cut off values at present are:
247 // - a minimum of 20 words
248 // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
249 // to words having been glued together. This is used by irrelevant sites and moreover
250 // can't be analysed for language, so may not be actually MRI.
251
252 // Though StringTokenizer still in use, as seen in discussion at
253 // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
254 // String.split(regex) seems better for splitting on general whitespace
255 String[] allWords = record.split("\\s");
256 int validWordCount = 0;
257 int numCamelCaseWords = 0;
258 for(int i = 0; i < allWords.length; i++) {
259 String word = allWords[i];
260
261 // throw away if n words contain camelcase, which is another case of words glued together
262 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
263 numCamelCaseWords++;
264 }
265
266 // In Maori, word length of 1 is not uncommon
267 // but let's skip camelcased words when counting valid words
268 else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
269 }
270
271 // dump if too many camelcase words (ideally keep none of that kind?)
272 if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
273 parentFolder = WETProcessor.discardFolder;
274 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
275 }
276 else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
277 parentFolder = WETProcessor.keepFolder;
278 System.err.println("@@@KEEPING");
279 }
280 }
281 // if parentFolder still not set, set to discard pile folder
282 if(parentFolder == null) {
283 parentFolder = WETProcessor.discardFolder;
284 System.err.println("@@@DISCARDING");
285 }
286
287 try {
288 if (parentFolder == WETProcessor.keepFolder) {
289 keepURLsWriter.write(recordURI + "\n");
290 } else {
291 discardURLsWriter.write(recordURI + "\n");
292 }
293 } catch(Exception e) {
294 System.err.println("Unable to write URL");
295 e.printStackTrace();
296 }
297
298 System.err.println("--------------------------");
299
300 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
301
302 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
303 writer.write(record);
304 // Try-with-resources examples don't call close() explicitly:
305 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
306 //writer.close();
307 } catch(IOException ioe) {
308 ioe.printStackTrace();
309 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
310 }
311 }
312
313
314 /**
315 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
316 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
317 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
318 */
319 public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
320 // Maintain Sets of unique domains and urls
321 // TreeSet: by default, "the elements are ordered using their natural ordering"
322 // (or by a Comparator provided at set creation time).
323 // Whereas HashSet doesn't guarantee ordering.
324 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
325
326 Set<String> domainsSet = new TreeSet<String>();
327 Set<String> urlsSet = new TreeSet<String>();
328
329 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
330
331 try (
332 BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
333 ) {
334
335 // read a URL at a time from urlsFile
336 String url = null;
337 String domain = null;
338 while((url = reader.readLine()) != null) { // readLine removes newline separator
339
340 // work out domain. This retains any www. or subdomain prefix:
341 int startIndex = url.indexOf("//"); // http:// or https:// prefix
342 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
343 domain = url.substring(startIndex);
344 int endIndex = domain.indexOf("/");
345 if(endIndex == -1) endIndex = domain.length();
346 domain = domain.substring(0, endIndex);
347
348 //if(!domainsMap.containsKey(domain)) {
349 urlsSet.add(url);
350 domainsSet.add(domain);
351 //}
352 }
353 } catch (IOException ioe) {
354 ioe.printStackTrace();
355 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
356 }
357
358 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
359 Iterator<String> i = urlsSet.iterator();
360 while(i.hasNext()) {
361 String url = i.next();
362 seedURLsWriter.write(url + "\n");
363 }
364
365 } catch (IOException ioe) {
366 ioe.printStackTrace();
367 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
368 }
369
370 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
371 Iterator<String> i = domainsSet.iterator();
372 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
373 while(i.hasNext()) {
374 String domain = i.next();
375 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
376 urlFilterWriter.write(domain + "\n");
377 }
378
379 } catch (IOException ioe) {
380 ioe.printStackTrace();
381 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
382 }
383 }
384
385 /**
386 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
387 * whether it is in the discard list.
388 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
389 * By not having this method deal with actual regex for filters, this has the advantage that
390 * we don't have to remember to escape or double escape each filter to turn it into a regex.
391 */
392 public boolean isInDiscardFilter(String url) {
393
394 String discardFilterFile = "url-discard-filter.txt"; // in conf folder
395
396 try (
397 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
398 ) {
399 String filter = null;
400 while((filter = reader.readLine()) != null) {
401 if(filter.trim().equals("")) {
402 continue;
403 }
404 //System.err.println("Got filter: " + filter);
405 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
406 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
407 }
408 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
409 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
410 return true;
411 }
412 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
413 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
414 return true;
415 }
416 else if(url.contains(filter)) {
417 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
418 return true;
419 }
420
421 }
422
423 } catch (IOException ioe) {
424 ioe.printStackTrace();
425 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
426 }
427
428 return false;
429 }
430
    /**
     * Stub: intended to decide whether a URL needs a closer look (e.g. product/online-store
     * sites, per the caller's comment) rather than an outright keep/discard decision.
     * Currently always returns false, so nothing is routed through this filter yet.
     * TODO: implement, presumably against a check-filter file mirroring isInDiscardFilter().
     */
    public boolean isInCheckFilter(String url) {
        //System.err.println("isInCheckFilter(url) is not yet implemented");
        return false;
    }
436
437 //public static int getRecordCount() { return recordCount; }
438
439 public static void printUsage() {
440 System.err.println("Run this program as:");
441 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
442 }
443
444 /** Filename filter to only list warc.wet files or else warc.wet.gz files
445 * for which unzipped warc.wet equivalents don't yet exist.
446 */
447 private static class WETFilenameFilter implements FilenameFilter {
448
449 public boolean accept(File dir, String name) {
450 if(name.endsWith(".warc.wet")) {
451 logger.debug("Will include " + name + " for processing.");
452 return true;
453 }
454
455 if(name.endsWith(".warc.wet.gz")) {
456 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
457 File unzippedVersion = new File(dir, nameWithoutGZext);
458 if(unzippedVersion.exists()) {
459 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
460 logger.debug("Skipping " + name);
461 return false; // don't count gzipped version if unzipped version exists.
462 }
463 else {
464 logger.debug("Only zipped version " + name + " exists.");
465 return true; // No unzipped version, so have to work with gzipped version
466 }
467 }
468
469 // we're not even interested in any other file extensions
470 logger.debug("Not a WET file. Skipping " + name);
471 return false;
472 }
473 }
474
475 public static void main(String[] args) {
476 if(args.length != 2) {
477 printUsage();
478 return;
479 }
480
481
482 File WETFileDir = new File(args[0]);
483 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
484 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
485 return;
486 }
487
488 File outFolder = new File(args[1]);
489 if(!outFolder.exists() || !outFolder.isDirectory()) {
490 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
491 return;
492 }
493
494 // static folders and files to be shared across all WETProcessor instances
495 WETProcessor.discardFolder = new File(outFolder, "discard");
496 if(!WETProcessor.discardFolder.exists()) {
497 WETProcessor.discardFolder.mkdir();
498 }
499 WETProcessor.keepFolder = new File(outFolder, "keep");
500 if(!WETProcessor.keepFolder.exists()) {
501 WETProcessor.keepFolder.mkdir();
502 }
503
504 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
505 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
506 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
507 return;
508 }
509 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
510 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
511 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
512 return;
513 }
514
515 // Will list all the warc.wet files in the input directory or else their gzipped versions
516 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
517
518 for(int i = 0; i < WETFiles.length; i++) {
519 File WETFile = WETFiles[i];
520 logger.debug("Processing WETfile: " + WETFile);
521
522 // Any .gz files listed means they haven't been unzipped yet. So unzip.
523 String WETFilename = WETFile.toString();
524 if(WETFilename.endsWith(".gz")) {
525 File GZippedWETFile = WETFile;
526 String WETGZippedFilename = WETFilename;
527 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
528
529 WETFile = new File(WETFilename);
530 Utility.unzipFile(GZippedWETFile, WETFile);
531 }
532 // hereafter all WETFiles should refer to the unzipped version
533 // Check the unzipped WETFile exists
534
535 if(!WETFile.exists() || !WETFile.isFile()) {
536 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
537 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
538 return;
539 }
540
541 // Finally, we can process this WETFile's records into the keep and discard pile
542 logger.debug("Off to process " + WETFile);
543 WETProcessor processor = new WETProcessor(WETFile, outFolder);
544
545 }
546
547 File seedURLsFile = new File(outFolder, "seedURLs.txt");
548 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
549 WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
550
551 return;
552 }
553}
Note: See TracBrowser for help on using the repository browser.