source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33519

Last change on this file since 33519 was 33519, checked in by ak19, 5 years ago

Code still writes out the global seedURLs.txt and regex-urlfilter.txt (in case this remains meaningful), but now also creates individual site directories containing their individual seedURLs.txt and regex-urlfilter.txt

File size: 22.1 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greylisted folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted;
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count and the MAXIMUM number of characters
 * (not the minimum) that determine whether a string counts as a word. These settings can be
 * adjusted in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included, excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is:
 * - precede a URL with ^ to match urls with the given prefix
 * - follow a URL with $ to match urls with the given suffix
 * - ^url$ will match the given url exactly
 * - without either ^ or $ symbol, any url containing the given url will match
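 *
 * For example, a url-blacklist-filter.txt might contain entries like these (made-up urls,
 * shown purely to illustrate the format above):
 *   ^http://ads.example.com
 *   .pdf$
 *   ^http://example.org/spam$
 *   doubleclick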
 *
 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also for nutch.
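 *
 * As a rough sketch of that precedence (the actual per-record decision is made in
 * WETProcessor.java, not here; this only illustrates the rules just described):
 *   if (isWhitelisted(url))        keep, provided the record also has sufficient content
 *   else if (isGreylisted(url))    set aside in the greylisted folder/file for inspection
 *   else if (isBlacklisted(url))   discard
 *   else                           keep or discard based on content sufficiency alone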
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. Then use a single instance of the WETProcessor class to process
 * each single unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File WETFilesDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);
    /**
     * Store url patterns as keys, with values indicating whether a url should
     * match the pattern exactly, start or end with it, or contain it.
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of domains we keep, each mapped to the full urls we're keeping that are of that domain.
     * A TreeMap preserves the natural (alphabetical) ordering of its keys, whereas a HashMap has
     * no notion of ordering; since we just need to store urls grouped by their domains, whether
     * the domains or the urls per domain are sorted may be irrelevant. (Does it really not matter?
     * What about urls sharing a prefix, e.g. pinky.com/toto/index.html and
     * pinky.com/toto/nono/file.html - is there any benefit to nutch when crawling if these
     * seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records processed across all the WETProcessor instances
    // instantiated by our main method
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.WETFilesDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
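     *
     * As a rough illustration of the layout produced under the output folder (the site numbers
     * and url counts below are made-up examples, not actual output):
     *   seedURLs.txt                    - all kept urls, one per line
     *   regex-urlfilter.txt             - one +https?://([a-z0-9-]+\.)*<escaped-domain>/ line per domain
     *   sites/00001/seedURLs.txt        - the first domain, followed by its urls, each tab-indented
     *   sites/00001/regex-urlfilter.txt - the regex line for that one domain
     *   sites/00002/...                 - and so on, one zero-padded folder per domain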
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
        // Maintain Sets of unique domains and urls
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.

        //Set<String> domainsSet = new TreeSet<String>();
        //Set<String> urlsSet = new TreeSet<String>();
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*

        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
        ) {

            // read a URL at a time from urlsFile
            String url = null;
            String domain = null;
            while((url = reader.readLine()) != null) { // readLine removes newline separator

                // work out domain. This retains any www. or subdomain prefix:
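                // e.g. a (made-up) url https://www.example.org/path/page.html
                // would give the domain www.example.org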
                int startIndex = url.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if(endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                //urlsSet.add(url);
                //domainsSet.add(domain);
                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domain)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domain, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domain);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        /*
        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while(i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }
        */

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";
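        // (zero-padded site folder names, e.g. a domainCount of 1 gives siteID "00001")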

        // write out each domain followed in sequence by all urls we found in that domain
        // (urls with tab up front)
        try (
            BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
            BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
        ) {
            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            while(domainIterator.hasNext()) {
                domainCount++;
                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                String domain = domainIterator.next();
                //seedURLsWriter.write(domain + "\n");
                // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(regexed_domain + "\n");

                // for every domain, we need sites/0000x/ folder containing its own
                // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                    BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                    BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                ) {
                    // only write urls, and no domain, into the single global seedURLs file,
                    // but write the domain and tabbed urls into the individual
                    // sites/0000x/seedURLs.txt files, and the regexed domain into each site's
                    // regex-urlfilter.txt too
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");

                    // next write out the urls for the domain with a tab prefixed to each
                    // into the sites/0000x/seedURLs.txt file - also write them into the global seeds file
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write("\t" + url + "\n");
                    }
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
                }
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
        }

        // write out domains as regular expressions into "regex-urlfilter.txt" file
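        // (note: this re-opens the same global urlFilterFile and overwrites the per-domain
        // regex lines already written above with identical content)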
        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> i = domainsSet.iterator();
            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while(i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        while(i.hasNext()) {
            Map.Entry<String, Integer> entry = i.next();
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is also found in the greylist or whitelist,
     * it ultimately won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Greylisting takes precedence if the same URL pattern is also mentioned in the blacklist,
     * but is itself overridden if the pattern is also mentioned in the whitelist.
     */
    public boolean isGreylisted(String url) {
        // TODO: alexa top sites and auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Loads each line ("filter") of the given filter list file from the conf folder
     * (e.g. url-blacklist-filter.txt) into the given map, for later checking of urls against that list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
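     *
     * To illustrate how filter lines map onto the match rules below (made-up entries):
     *   ^http://ads.example.com      becomes a LIST_ENTRY_STARTSWITH entry
     *   .pdf$                        becomes a LIST_ENTRY_ENDSWITH entry
     *   ^http://example.org/spam$    becomes a LIST_ENTRY_MATCHES entry
     *   doubleclick                  becomes a LIST_ENTRY_CONTAINS entry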
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
            BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
        ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files, or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
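     * For instance, given hypothetical files a.warc.wet, a.warc.wet.gz and b.warc.wet.gz in the
     * input folder, only a.warc.wet and b.warc.wet.gz would be accepted.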
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count gzipped version if unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }


        File WETFileDir = new File(args[0]);
        if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);

            //ccWETFilesProcessor.processAllWETFiles();

            // Will list all the warc.wet files in the input directory, or else their gzipped versions
            File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());

            int wetRecordCount = 0;
            int wetFileCount = 0;

            for(int i = 0; i < WETFiles.length; i++) {
                File WETFile = WETFiles[i];
                logger.debug("Processing WETfile: " + WETFile);

                // Any .gz file listed means it hasn't been unzipped yet, so unzip it.
                String WETFilename = WETFile.toString();
                if(WETFilename.endsWith(".gz")) {
                    File GZippedWETFile = WETFile;
                    String WETGZippedFilename = WETFilename;
                    WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                    WETFile = new File(WETFilename);
                    Utility.unzipFile(GZippedWETFile, WETFile);
                }
                // hereafter WETFile should refer to the unzipped version.
                // Check that the unzipped WETFile exists.

                if(!WETFile.exists() || !WETFile.isFile()) {
                    System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                    logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                    return;
                }

                // Finally, we can process this WETFile's records into the keep and discard piles
                wetFileCount++;
                logger.debug("Off to process " + WETFile);
                WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
                wetFileProcessor.processWETFile();
                wetRecordCount += wetFileProcessor.getRecordCount();
            }

            // for information purposes
            ccWETFilesProcessor.setWETFileCount(wetFileCount);
            ccWETFilesProcessor.setRecordCount(wetRecordCount);

            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

        } catch(Exception e) {
            // can get an exception when instantiating the CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}