source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33503

Last change on this file since 33503 was 33503, checked in by ak19, 5 years ago

More efficient blacklisting/greylisting/whitelisting now by reading in the lists only once and then comparing each URL to each list. Explicit whitelisting has precedence over greylisting, which in turn takes precedence over blacklisting. Then any remaining urls are checked for having sufficient content. The code that checks for sufficient content still needs some more adjusting.

File size: 19.1 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeSet;
12
13import org.apache.log4j.Logger;
14
15/**
16 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
17 * the WET records in each, putting each WET record into a file. Each file is put into a
18 * keep or discard or greyListed folder, and its url written into a keep, discard
19 * or greylisted text file, based on
20 *
21 * 1. whether it's whitelisted, else greylisted else blacklisted
22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
23 * enough content. Formerly, content-length and number of lines were used to determine if
24 * the content was sufficient. Now it's just word count and number of MAX characters
25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
26 * in conf/config.properties.
27 *
28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
29 * into the conf folder to control any url patterns that are explicitly included or excluded or
30 * set aside for inspecting later. These filter text files don't use regexes, instead their
31 * format is:
32 * - precede URL by ^ to blacklist urls that match the given prefix
33 * - succeed URL by $ to blacklist urls that match the given suffix
34 * - ^url$ will blacklist urls that match the given url completely
35 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
36 *
37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
39 * over greylisting, which takes precedence over blacklisting in turn. However, even
39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
41 * also for nutch.
42 *
43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
44 * in the given input folder. Then use a single instance of the WETProcessor class to process
45 * each single unzipped warc.wet file.
46 *
47 * To compile, including the jars in lib/ for compiling.
48 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
49 *
50 * To run, passing the log4j and other properties files in conf/ folder:
51 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
52 *
53 * e.g.
54 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
56 *
57*/
58
59public class CCWETProcessor {
60 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
61
62 // Properties shared across WETProcessor instances
63 public final int MAX_WORD_LENGTH;
64 public final int MIN_NUM_WORDS;
65 public final int MAX_WORDS_CAMELCASE;
66
67 private Properties configProperties = new Properties();
68
69 // File paths shared across WETProcessor instances
70 public final File WETFilesDir;
71 public final File outputFolder;
72 public final File discardFolder;
73 public final File keepFolder;
74 public final File greyListedFolder;
75 public final File keepURLsFile;
76 public final File discardURLsFile;
77 public final File greyListedFile;
78
79 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
80 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
81 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
82 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
83
84 private HashMap<String, Integer> blackList;
85 private HashMap<String, Integer> greyList;
86 private HashMap<String, Integer> whiteList;
87
88 // Keep a count of all the records that all WETProcessors instantiated
89 // by our main method combined have processed
90 private int totalRecordCount = 0;
91
92 private int wetFileCount = 0;
93
94 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
95 this.WETFilesDir = inFolder;
96 this.outputFolder = outFolder;
97
98 // load up the properties from the config file
99 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
100 configProperties = new Properties();
101 configProperties.load(infile);
102 //infile.close(); // not explicitly called in examples of try-with-resources
103
104 } catch(Exception e) {
105 System.err.println("Exception attempting to read properties from config.properties.");
106 logger.error("Exception attempting to read properties from config.properties.");
107 e.printStackTrace();
108 }
109
110 if(configProperties.size() == 0) {
111 System.err.println("*** Warning: no values read into config properties. Using defaults.");
112 }
113
114 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
115 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
116 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
117
118
119 this.discardFolder = new File(outFolder, "discard");
120 if(!discardFolder.exists()) {
121 discardFolder.mkdir();
122 }
123 this.keepFolder = new File(outFolder, "keep");
124 if(!keepFolder.exists()) {
125 keepFolder.mkdir();
126 }
127
128 this.greyListedFolder = new File(outFolder, "greylisted");
129 if(!greyListedFolder.exists()) {
130 greyListedFolder.mkdir();
131 }
132
133 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
134 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
135 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
136 }
137 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
138 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
139 throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
140 }
141 this.greyListedFile = new File(outFolder, "greyListed.txt");
142 if(greyListedFile.exists() && !greyListedFile.delete()) {
143 throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
144 }
145
146 System.err.println("Loading blacklist.");
147 blackList = new HashMap<String, Integer>();
148 initURLFilterList(blackList, "url-blacklist-filter.txt");
149 System.err.println("Loading greylist.");
150 greyList = new HashMap<String, Integer>();
151 initURLFilterList(greyList, "url-greylist-filter.txt");
152 System.err.println("Loading whitelist.");
153 whiteList = new HashMap<String, Integer>();
154 initURLFilterList(whiteList, "url-whitelist-filter.txt");
155
156 //System.err.println("Prematurely terminating for testing purposes.");
157 //System.exit(-1);
158 }
159
160 /**
161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
163 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
164 */
165 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
166 // Maintain Sets of unique domains and urls
167 // TreeSet: by default, "the elements are ordered using their natural ordering"
168 // (or by a Comparator provided at set creation time).
169 // Whereas HashSet doesn't guarantee ordering.
170 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
171
172 Set<String> domainsSet = new TreeSet<String>();
173 Set<String> urlsSet = new TreeSet<String>();
174
175 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
176
177 try (
178 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
179 ) {
180
181 // read a URL at a time from urlsFile
182 String url = null;
183 String domain = null;
184 while((url = reader.readLine()) != null) { // readLine removes newline separator
185
186 // work out domain. This retains any www. or subdomain prefix:
187 int startIndex = url.indexOf("//"); // http:// or https:// prefix
188 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
189 domain = url.substring(startIndex);
190 int endIndex = domain.indexOf("/");
191 if(endIndex == -1) endIndex = domain.length();
192 domain = domain.substring(0, endIndex);
193
194 //if(!domainsMap.containsKey(domain)) {
195 urlsSet.add(url);
196 domainsSet.add(domain);
197 //}
198 }
199 } catch (IOException ioe) {
200 ioe.printStackTrace();
201 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
202 }
203
204 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
205 Iterator<String> i = urlsSet.iterator();
206 while(i.hasNext()) {
207 String url = i.next();
208 seedURLsWriter.write(url + "\n");
209 }
210
211 } catch (IOException ioe) {
212 ioe.printStackTrace();
213 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
214 }
215
216 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
217 Iterator<String> i = domainsSet.iterator();
218 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
219 while(i.hasNext()) {
220 String domain = i.next();
221 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
222 urlFilterWriter.write(domain + "\n");
223 }
224
225 } catch (IOException ioe) {
226 ioe.printStackTrace();
227 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
228 }
229 }
230
231 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
232 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
233 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
234 while(i.hasNext()) {
235 Map.Entry<String, Integer> entry = i.next();
236 String urlPattern = entry.getKey();
237 Integer matchRule = entry.getValue();
238
239 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
240 return true;
241 }
242 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
243 return true;
244 }
245 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
246 return true;
247 }
248 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
249 return true;
250 }
251 // else check the rest of the filter list against this url
252 // before returning false to be certain it's not been listed in the filter list
253 }
254
255 return false;
256 }
257
258 /**
259 * Returns true if the url or pattern is found in the blacklist file.
260 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
261 * it won't get blacklisted after all. But that's not implemented here.
262 */
263 public boolean isBlacklisted(String url) {
264 return isListedInFilterList(blackList, url);
265 }
266
267 /**
268 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
269 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
270 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
271 */
272 public boolean isGreylisted(String url) {
273 // TODO: alexa top sites and auto-translated product sites
274 return isListedInFilterList(greyList, url);
275 }
276
277 /**
278 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
279 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
280 */
281 public boolean isWhitelisted(String url) {
282 return isListedInFilterList(whiteList, url);
283 }
284
285 /**
286 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
287 * whether it is in the discard list.
288 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
289 * By not having this method deal with actual regex for filters, this has the advantage that
290 * we don't have to remember to escape or double escape each filter to turn it into a regex.
291 */
292 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
293
294 // if filterListFilename does not exist in the conf folder, just return
295 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
296 System.err.println(filterListFilename + " does not exist");
297 return;
298 }
299
300 try (
301 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
302 ) {
303 String filter = null;
304 while((filter = reader.readLine()) != null) {
305 // skip comments and empty lines
306 filter = filter.trim();
307 if(filter.equals("") || filter.startsWith("#")) {
308 continue;
309 }
310
311 if(filter.startsWith("^") && filter.endsWith("$")) {
312 filter = filter.substring(1, filter.length()-1);
313 list.put(filter, LIST_ENTRY_MATCHES);
314 }
315 else if(filter.startsWith("^")) {
316 filter = filter.substring(1);
317 list.put(filter, LIST_ENTRY_STARTSWITH);
318 System.err.println("Match filter startswith: " + filter);
319 }
320 else if(filter.endsWith("$")) {
321 filter = filter.substring(0, filter.length()-1);
322 list.put(filter, LIST_ENTRY_ENDSWITH);
323 }
324 else {
325 list.put(filter, LIST_ENTRY_CONTAINS);
326 }
327 //System.err.println("Got filter: " + filter);
328 }
329
330 } catch (IOException ioe) {
331 ioe.printStackTrace();
332 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
333 }
334
335 }
336 /*
337 public boolean isInDiscardFilter(String url) {
338 String discardFilterFile = "url-discard-filter.txt"; // in conf folder
339
340 try (
341 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
342 ) {
343 String filter = null;
344 while((filter = reader.readLine()) != null) {
345 if(filter.trim().equals("")) {
346 continue;
347 }
348 //System.err.println("Got filter: " + filter);
349 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
350 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
351 }
352 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
353 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
354 return true;
355 }
356 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
357 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
358 return true;
359 }
360 else if(url.contains(filter)) {
361 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
362 return true;
363 }
364
365 }
366
367 } catch (IOException ioe) {
368 ioe.printStackTrace();
369 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
370 }
371
372 return false;
373 }*/
374
375 /** Maintain a count of all WET files processed. */
376 public void setWETFileCount(int count) { this.wetFileCount = count; }
377
378 /** Maintain a count of all WET records processed. */
379 //public int getRecordCount() { return this.totalRecordCount; }
380 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
381 public void setRecordCount(int count) { this.totalRecordCount = count; }
382
383 public static void printUsage() {
384 System.err.println("Run this program as:");
385 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
386 }
387
388 /** Filename filter to only list warc.wet files or else warc.wet.gz files
389 * for which unzipped warc.wet equivalents don't yet exist.
390 */
391 private static class WETFilenameFilter implements FilenameFilter {
392
393 public boolean accept(File dir, String name) {
394 if(name.endsWith(".warc.wet")) {
395 logger.debug("Will include " + name + " for processing.");
396 return true;
397 }
398
399 if(name.endsWith(".warc.wet.gz")) {
400 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
401 File unzippedVersion = new File(dir, nameWithoutGZext);
402 if(unzippedVersion.exists()) {
403 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
404 logger.debug("Skipping " + name);
405 return false; // don't count gzipped version if unzipped version exists.
406 }
407 else {
408 logger.debug("Only zipped version " + name + " exists.");
409 return true; // No unzipped version, so have to work with gzipped version
410 }
411 }
412
413 // we're not even interested in any other file extensions
414 logger.debug("Not a WET file. Skipping " + name);
415 return false;
416 }
417 }
418
419
420 public static void main(String[] args) {
421 if(args.length != 2) {
422 printUsage();
423 return;
424 }
425
426
427 File WETFileDir = new File(args[0]);
428 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
429 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
430 return;
431 }
432
433 File outFolder = new File(args[1]);
434 if(!outFolder.exists() || !outFolder.isDirectory()) {
435 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
436 return;
437 }
438
439 try {
440 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
441
442 //ccWETFilesProcessor.processAllWETFiles();
443
444 // Will list all the warc.wet files in the input directory or else their gzipped versions
445 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
446
447 int wetRecordCount = 0;
448 int wetFileCount = 0;
449
450 for(int i = 0; i < WETFiles.length; i++) {
451 File WETFile = WETFiles[i];
452 logger.debug("Processing WETfile: " + WETFile);
453
454 // Any .gz files listed means they haven't been unzipped yet. So unzip.
455 String WETFilename = WETFile.toString();
456 if(WETFilename.endsWith(".gz")) {
457 File GZippedWETFile = WETFile;
458 String WETGZippedFilename = WETFilename;
459 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
460
461 WETFile = new File(WETFilename);
462 Utility.unzipFile(GZippedWETFile, WETFile);
463 }
464 // hereafter all WETFiles should refer to the unzipped version
465 // Check the unzipped WETFile exists
466
467 if(!WETFile.exists() || !WETFile.isFile()) {
468 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
469 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
470 return;
471 }
472
473 // Finally, we can process this WETFile's records into the keep and discard pile
474 wetFileCount++;
475 logger.debug("Off to process " + WETFile);
476 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
477 wetFileProcessor.processWETFile();
478 wetRecordCount += wetFileProcessor.getRecordCount();
479 }
480
481 // for information purposes
482 ccWETFilesProcessor.setWETFileCount(wetFileCount);
483 ccWETFilesProcessor.setRecordCount(wetRecordCount);
484
485 File seedURLsFile = new File(outFolder, "seedURLs.txt");
486 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
487 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
488 } catch(Exception e) {
489 // can get an exception when instantiating CCWETProcessor instance
490 e.printStackTrace();
491 System.err.println(e.getMessage());
492 }
493
494 return;
495
496 }
497}
Note: See TracBrowser for help on using the repository browser.