source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33518

Last change on this file since 33518 was 33518, checked in by ak19, 5 years ago

Intermediate commit: got the seed urls file temporarily written out as each domain followed by commoncrawl's urls within that domain. For the next commit, I will try splitting these into individual files per domain, each with its own regex-url txt file list restricted to just that site/domain, while returning the seed urls file's output to all urls, sorted.

File size: 20.2 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeMap;
12import java.util.TreeSet;
13
14import org.apache.log4j.Logger;
15
16/**
17 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
18 * the WET records in each, putting each WET record into a file. Each file is put into a
19 * keep or discard or greyListed folder, and its url written into a keep, discard
20 * or greylisted text file, based on
21 *
22 * 1. whether it's whitelisted, else greylisted, else blacklisted
23 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
24 * enough content. Formerly, content-length and number of lines were used to determine if
25 * the content was sufficient. Now it's just the word count, with a MAXIMUM number of characters
26 * (not a minimum) determining whether a string counts as a word. These settings can be adjusted
27 * in conf/config.properties.
28 *
29 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
30 * into the conf folder to control any url patterns that are explicitly included or excluded or
31 * set aside for inspecting later. These filter text files don't use regexes; instead their
32 * format is:
33 * - precede a URL with ^ to blacklist urls that match the given prefix
34 * - follow a URL with $ to blacklist urls that match the given suffix
35 * - ^url$ will blacklist urls that match the given url exactly
36 * - without either the ^ or $ symbol, urls containing the given string will get blacklisted
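 * For example (illustrative entries only, not taken from the actual conf files), a
 * url-blacklist-filter.txt could contain the following four lines, one per match rule above:
 *   ^http://example.com/ads/
 *   .pdf$
 *   ^http://example.com/home/index.html$
 *   doubleclick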
37 *
38 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
39 * over greylisting, which in turn takes precedence over blacklisting. However, even
40 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
41 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
42 * also for nutch.
43 *
44 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
45 * in the given input folder. Then use a single instance of the WETProcessor class to process
46 * each single unzipped warc.wet file.
47 *
48 * To compile, include the jars in lib/ on the classpath:
49 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
50 *
51 * To run, passing the log4j and other properties files in conf/ folder:
52 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
53 *
54 * e.g.
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
56 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
57 *
58*/
59
60public class CCWETProcessor {
61 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
62
63 // Properties shared across WETProcessor instances
64 public final int MAX_WORD_LENGTH;
65 public final int MIN_NUM_WORDS;
66 public final int MAX_WORDS_CAMELCASE;
67
68 private Properties configProperties = new Properties();
69
70 // File paths shared across WETProcessor instances
71 public final File WETFilesDir;
72 public final File outputFolder;
73 public final File discardFolder;
74 public final File keepFolder;
75 public final File greyListedFolder;
76 public final File keepURLsFile;
77 public final File discardURLsFile;
78 public final File greyListedFile;
79
80 /** Possible values stored in the blackList/whiteList/greyList Maps */
81    private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
82    private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
83    private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
84    private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
85
86 /**
87 * Store url patterns as keys, with values indicating whether a url should
88 * match it exactly, start/end with it, or contain it
89 */
90 private HashMap<String, Integer> blackList;
91 private HashMap<String, Integer> greyList;
92 private HashMap<String, Integer> whiteList;
93
94 /** Map of domains we keep and the full urls we're keeping that are of that domain.
95 * There's no strict need for a TreeMap, which preserves the natural (alphabetical) ordering
96 * of its keys, over a HashMap, which has no notion of ordering, because we just need to store
97 * urls with their domains. Whether the domains are sorted or the urls per domain are sorted
98 * becomes irrelevant. (Does it really? What if we have urls followed vs preceded by urls with
99 * the same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html?
100 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
101 */
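    // e.g. (illustrative): "pinky.com" -> { "http://pinky.com/toto/index.html", "http://pinky.com/toto/nono/file.html" }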
102 private Map<String, Set<String>> domainsToURLsMap;
103
104    // Keep a count of all the records processed, combined across all the WETProcessor
105    // instances instantiated by our main method
106 private int totalRecordCount = 0;
107
108 private int wetFileCount = 0;
109
110 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
111 this.WETFilesDir = inFolder;
112 this.outputFolder = outFolder;
113
114 // load up the properties from the config file
115 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
116 configProperties = new Properties();
117 configProperties.load(infile);
118 //infile.close(); // not explicitly called in examples of try-with-resources
119
120 } catch(Exception e) {
121 System.err.println("Exception attempting to read properties from config.properties.");
122 logger.error("Exception attempting to read properties from config.properties.");
123 e.printStackTrace();
124 }
125
126 if(configProperties.size() == 0) {
127 System.err.println("*** Warning: no values read into config properties. Using defaults.");
128 }
129
130 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
131 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
132 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
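	// For reference, a conf/config.properties relying on the defaults above would contain:
	//   WETprocessor.max.word.length=15
	//   WETprocessor.min.num.words=20
	//   WETprocessor.max.words.camelcase=10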
133
134
135 this.discardFolder = new File(outFolder, "discard");
136 if(!discardFolder.exists()) {
137 discardFolder.mkdir();
138 }
139 this.keepFolder = new File(outFolder, "keep");
140 if(!keepFolder.exists()) {
141 keepFolder.mkdir();
142 }
143
144 this.greyListedFolder = new File(outFolder, "greylisted");
145 if(!greyListedFolder.exists()) {
146 greyListedFolder.mkdir();
147 }
148
149 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
150 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
151 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
152 }
153 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
154 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
155	    throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
156 }
157 this.greyListedFile = new File(outFolder, "greyListed.txt");
158 if(greyListedFile.exists() && !greyListedFile.delete()) {
159	    throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
160 }
161
162 // prepare our blacklist, greylist (for inspection) and whitelist
163 System.err.println("Loading blacklist.");
164 blackList = new HashMap<String, Integer>();
165 initURLFilterList(blackList, "url-blacklist-filter.txt");
166
167 System.err.println("Loading greylist.");
168 greyList = new HashMap<String, Integer>();
169 initURLFilterList(greyList, "url-greylist-filter.txt");
170
171 System.err.println("Loading whitelist.");
172 whiteList = new HashMap<String, Integer>();
173 initURLFilterList(whiteList, "url-whitelist-filter.txt");
174
175 //System.err.println("Prematurely terminating for testing purposes.");
176 //System.exit(-1);
177 }
178
179 /**
180 * Using the keepURLs.txt file generated by running WETProcessor instances, produces
181 * as output the URL seed list and regex-urlfilter text files required by nutch, see
182 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
183 */
184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
185 // Maintain Sets of unique domains and urls
186 // TreeSet: by default, "the elements are ordered using their natural ordering"
187 // (or by a Comparator provided at set creation time).
188 // Whereas HashSet doesn't guarantee ordering.
189 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
190
191 //Set<String> domainsSet = new TreeSet<String>();
192 //Set<String> urlsSet = new TreeSet<String>();
193 domainsToURLsMap = new TreeMap<String, Set<String>>();
194
195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
196
197 try (
198 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
199 ) {
200
201 // read a URL at a time from urlsFile
202 String url = null;
203 String domain = null;
204 while((url = reader.readLine()) != null) { // readLine removes newline separator
205
206 // work out domain. This retains any www. or subdomain prefix:
207 int startIndex = url.indexOf("//"); // http:// or https:// prefix
208 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
209 domain = url.substring(startIndex);
210 int endIndex = domain.indexOf("/");
211 if(endIndex == -1) endIndex = domain.length();
212 domain = domain.substring(0, endIndex);
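		// e.g. (illustrative): "https://www.example.com/path/page.html" gives the domain "www.example.com"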
213
214 //urlsSet.add(url);
215 //domainsSet.add(domain);
216 Set<String> urlsSet;
217 if(!domainsToURLsMap.containsKey(domain)) {
218 urlsSet = new TreeSet<String>();
219 urlsSet.add(url);
220 domainsToURLsMap.put(domain, urlsSet);
221 } else {
222 urlsSet = domainsToURLsMap.get(domain);
223 urlsSet.add(url);
224 }
225
226 }
227 } catch (IOException ioe) {
228 ioe.printStackTrace();
229 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
230 }
231
232 /*
233 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
234 Iterator<String> i = urlsSet.iterator();
235 while(i.hasNext()) {
236 String url = i.next();
237 seedURLsWriter.write(url + "\n");
238 }
239
240 } catch (IOException ioe) {
241 ioe.printStackTrace();
242 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
243 }
244 */
245
246 // write out each domain followed in sequence by all urls we found in that domain
247 // (urls with tab up front)
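	// e.g. (illustrative, reusing the pinky.com example above):
	//   pinky.com
	//   \thttp://pinky.com/toto/index.html
	//   \thttp://pinky.com/toto/nono/file.html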
248 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
249 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
250 Set<String> domainsSet = domainsToURLsMap.keySet();
251 Iterator<String> domainIterator = domainsSet.iterator();
252
253 while(domainIterator.hasNext()) {
254 // write out the domain
255 String domain = domainIterator.next();
256 seedURLsWriter.write(domain + "\n");
257
258 // next write out the urls for the domain with a tab prefixed to each
259 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
260 Iterator<String> urlIterator = urlsForDomainSet.iterator();
261 while(urlIterator.hasNext()) {
262 String url = urlIterator.next();
263 seedURLsWriter.write("\t" + url + "\n");
264 }
265 }
266
267 } catch (IOException ioe) {
268 ioe.printStackTrace();
269	    System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
270 }
271
272 // write out domains as regular expressions into "regex-urlfilter.txt" file
273 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
274 Set<String> domainsSet = domainsToURLsMap.keySet();
275 Iterator<String> i = domainsSet.iterator();
276 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
277 while(i.hasNext()) {
278 String domain = i.next();
279 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
280 urlFilterWriter.write(domain + "\n");
281 }
282
283 } catch (IOException ioe) {
284 ioe.printStackTrace();
285 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
286 }
287 }
288
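    // For example (illustrative): if a filter map contains the pattern "example.com" stored
    // with rule LIST_ENTRY_CONTAINS, then a url such as "http://www.example.com/page.html"
    // is considered listed by the method below.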
289 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
290 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
291 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
292 while(i.hasNext()) {
293 Map.Entry<String, Integer> entry = i.next();
294 String urlPattern = entry.getKey();
295 Integer matchRule = entry.getValue();
296
297	    if(matchRule.equals(LIST_ENTRY_CONTAINS) && url.contains(urlPattern)) {
298		return true;
299	    }
300	    else if(matchRule.equals(LIST_ENTRY_STARTSWITH) && url.startsWith(urlPattern)) {
301		return true;
302	    }
303	    else if(matchRule.equals(LIST_ENTRY_ENDSWITH) && url.endsWith(urlPattern)) {
304		return true;
305	    }
306	    else if(matchRule.equals(LIST_ENTRY_MATCHES) && url.equals(urlPattern)) {
307		return true;
308	    }
309 // else check the rest of the filter list against this url
310 // before returning false to be certain it's not been listed in the filter list
311 }
312
313 return false;
314 }
315
316 /**
317 * Returns true if the url or pattern is found in the blacklist file.
318 * Note that if the same url pattern is eventually found in the greylist or whitelist too,
319 * it won't get blacklisted after all. But that precedence is not implemented here.
320 */
321 public boolean isBlacklisted(String url) {
322 return isListedInFilterList(blackList, url);
323 }
324
325 /**
326 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
327 * Will eventually take precedence over a mention of the same URL pattern in the blacklist.
328 * Will eventually be overridden by the whitelist if the pattern is also mentioned there.
329 */
330 public boolean isGreylisted(String url) {
331 // TODO: alexa top sites and auto-translated product sites
332 return isListedInFilterList(greyList, url);
333 }
334
335 /**
336 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
337 * Its mention in the whitelist overrides any mention in the blacklist and greylist.
338 */
339 public boolean isWhitelisted(String url) {
340 return isListedInFilterList(whiteList, url);
341 }
342
343 /**
344 * Reads each line ("filter") of the given filter file in conf/ (e.g. url-blacklist-filter.txt)
345 * and adds it to the given map together with its match rule.
346 * Filters aren't actual regexes; they just use ^ and $ as start and end terminators.
347 * By not having this method deal with actual regexes for filters, we have the advantage that
348 * we don't have to remember to escape or double escape each filter to turn it into a regex.
349 */
350 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
351
352 // if filterListFilename does not exist in the conf folder, just return
353 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
354 System.err.println(filterListFilename + " does not exist");
355 return;
356 }
357
358 try (
359 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
360 ) {
361 String filter = null;
362 while((filter = reader.readLine()) != null) {
363 // skip comments and empty lines
364 filter = filter.trim();
365 if(filter.equals("") || filter.startsWith("#")) {
366 continue;
367 }
368
369 if(filter.startsWith("^") && filter.endsWith("$")) {
370 filter = filter.substring(1, filter.length()-1);
371 list.put(filter, LIST_ENTRY_MATCHES);
372 }
373 else if(filter.startsWith("^")) {
374 filter = filter.substring(1);
375 list.put(filter, LIST_ENTRY_STARTSWITH);
376 System.err.println("Match filter startswith: " + filter);
377 }
378 else if(filter.endsWith("$")) {
379 filter = filter.substring(0, filter.length()-1);
380 list.put(filter, LIST_ENTRY_ENDSWITH);
381 }
382 else {
383 list.put(filter, LIST_ENTRY_CONTAINS);
384 }
385 //System.err.println("Got filter: " + filter);
386 }
387
388 } catch (IOException ioe) {
389 ioe.printStackTrace();
390 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
391 }
392
393 }
394
395 /** Maintain a count of all WET files processed. */
396 public void setWETFileCount(int count) { this.wetFileCount = count; }
397
398 /** Maintain a count of all WET records processed. */
399 //public int getRecordCount() { return this.totalRecordCount; }
400 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
401 public void setRecordCount(int count) { this.totalRecordCount = count; }
402
403 public static void printUsage() {
404 System.err.println("Run this program as:");
405	System.err.println("\tCCWETProcessor <folder containing warc.wet(.gz) files> <output folder path>");
406 }
407
408 /** Filename filter to only list warc.wet files or else warc.wet.gz files
409 * for which unzipped warc.wet equivalents don't yet exist.
410 */
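    // e.g. (hypothetical filenames): if both "x.warc.wet" and "x.warc.wet.gz" are present,
    // only "x.warc.wet" is accepted; if only "x.warc.wet.gz" is present, it is accepted and
    // gets unzipped later in main().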
411 private static class WETFilenameFilter implements FilenameFilter {
412
413 public boolean accept(File dir, String name) {
414 if(name.endsWith(".warc.wet")) {
415 logger.debug("Will include " + name + " for processing.");
416 return true;
417 }
418
419 if(name.endsWith(".warc.wet.gz")) {
420 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
421 File unzippedVersion = new File(dir, nameWithoutGZext);
422 if(unzippedVersion.exists()) {
423 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
424 logger.debug("Skipping " + name);
425 return false; // don't count gzipped version if unzipped version exists.
426 }
427 else {
428 logger.debug("Only zipped version " + name + " exists.");
429 return true; // No unzipped version, so have to work with gzipped version
430 }
431 }
432
433 // we're not even interested in any other file extensions
434 logger.debug("Not a WET file. Skipping " + name);
435 return false;
436 }
437 }
438
439
440 public static void main(String[] args) {
441 if(args.length != 2) {
442 printUsage();
443 return;
444 }
445
446
447 File WETFileDir = new File(args[0]);
448 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
449 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
450 return;
451 }
452
453 File outFolder = new File(args[1]);
454 if(!outFolder.exists() || !outFolder.isDirectory()) {
455 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
456 return;
457 }
458
459 try {
460 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
461
462 //ccWETFilesProcessor.processAllWETFiles();
463
464 // Will list all the warc.wet files in the input directory or else their gzipped versions
465 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
466
467 int wetRecordCount = 0;
468 int wetFileCount = 0;
469
470 for(int i = 0; i < WETFiles.length; i++) {
471 File WETFile = WETFiles[i];
472 logger.debug("Processing WETfile: " + WETFile);
473
474		// Any .gz file listed means it hasn't been unzipped yet, so unzip it.
475 String WETFilename = WETFile.toString();
476 if(WETFilename.endsWith(".gz")) {
477 File GZippedWETFile = WETFile;
478 String WETGZippedFilename = WETFilename;
479 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
480
481 WETFile = new File(WETFilename);
482 Utility.unzipFile(GZippedWETFile, WETFile);
483 }
484 // hereafter all WETFiles should refer to the unzipped version
485 // Check the unzipped WETFile exists
486
487 if(!WETFile.exists() || !WETFile.isFile()) {
488 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
489 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
490 return;
491 }
492
493 // Finally, we can process this WETFile's records into the keep and discard pile
494 wetFileCount++;
495 logger.debug("Off to process " + WETFile);
496 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
497 wetFileProcessor.processWETFile();
498 wetRecordCount += wetFileProcessor.getRecordCount();
499 }
500
501 // for information purposes
502 ccWETFilesProcessor.setWETFileCount(wetFileCount);
503 ccWETFilesProcessor.setRecordCount(wetRecordCount);
504
505 File seedURLsFile = new File(outFolder, "seedURLs.txt");
506 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
507 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
508
509 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
510
511 } catch(Exception e) {
512 // can get an exception when instantiating CCWETProcessor instance
513 e.printStackTrace();
514 System.err.println(e.getMessage());
515 }
516
517 return;
518
519 }
520}
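
/* A minimal sketch (an assumption for illustration, not the actual implementation of the
   separate Utility class) of what a gzip-unzipping helper like Utility.unzipFile(File, File),
   as called from main() above, might look like, using the GZIPInputStream imported at the top:

   public static void unzipFile(File inGZipFile, File outFile) throws IOException {
       try (GZIPInputStream gzIn = new GZIPInputStream(new FileInputStream(inGZipFile));
            FileOutputStream out = new FileOutputStream(outFile)) {
           byte[] buffer = new byte[4096];
           int len;
           // copy the decompressed bytes across into the output file
           while((len = gzIn.read(buffer)) != -1) {
               out.write(buffer, 0, len);
           }
       }
   }
*/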