source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33552

Last change on this file since 33552 was 33552, checked in by ak19, 5 years ago
  1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. Added a global file containing all domains we're going to crawl. 3. WET records we're keeping, which are stored in individual files, now have better filenames.
File size: 24.1 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.log4j.Logger;
/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greyListed folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted;
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, together with the MAX number of
 * characters (not a MINIMUM) that determines whether a string counts as a word. These settings
 * can be adjusted in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included, excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is (see the example below):
 * - precede a URL with ^ to list urls that match the given prefix
 * - follow a URL with $ to list urls that match the given suffix
 * - ^url$ will list urls that match the given url completely
 * - without either the ^ or $ symbol, urls containing the given url will get listed
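 *
 * For example, a url-blacklist-filter.txt might contain entries like the following
 * (illustrative patterns only, not taken from an actual filter file):
 *   ^http://ads.          - discard any url starting with http://ads.
 *   .pdf$                 - discard any url ending in .pdf
 *   ^http://example.com/$ - discard exactly that url
 *   /translated/          - discard any url containing /translated/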
 *
 * In WETProcessor.java's current implementation, explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also used by nutch.
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. A single instance of the WETProcessor class is then used to
 * process each individual unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, pass the log4j and other properties files in the conf/ folder on the classpath:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File commoncrawlDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);

    /**
     * Store url patterns as keys; the values indicate whether a url should
     * match the pattern exactly, start or end with it, or contain it.
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     * There is no need to use a TreeMap, which preserves the natural (alphabetical) ordering
     * of keys, over a HashMap, which has no notion of ordering, because we just need to store
     * urls with their domains. Whether the domains or the urls per domain are sorted becomes
     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html?
     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records processed by all the WETProcessor instances
    // instantiated by our main method, combined
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

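        // The property keys read below live in conf/config.properties; for illustration,
        // that file might contain entries such as (the values shown are just the defaults used here):
        //   WETprocessor.max.word.length=15
        //   WETprocessor.min.num.words=20
        //   WETprocessor.max.words.camelcase=10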
        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /**
     * Using the keepURLs.txt file generated by running the WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch; see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
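     *
     * As an illustration (with a made-up domain), a kept url like https://www.example.org/page.html
     * would contribute the domain www.example.org to domainURLsFile, the url itself to seedURLsFile,
     * the line "+https?://([a-z0-9-]+\.)*www\.example\.org/" to urlFilterFile, and a per-site
     * sites/0000N/seedURLs.txt and sites/0000N/regex-urlfilter.txt pair under the output folder.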
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
        // Maintain Sets of unique domains and urls
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.

        //Set<String> domainsSet = new TreeSet<String>();
        //Set<String> urlsSet = new TreeSet<String>();
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        try (
             BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
             ) {

            // read a URL at a time from keepURLsFile
            String url = null;
            String domain = null;
            while((url = reader.readLine()) != null) { // readLine removes the newline separator

                // work out the domain. This retains any www. or subdomain prefix:
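                // e.g. a (hypothetical) url "https://www.example.org/mi/page.html" yields the domain "www.example.org"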
                int startIndex = url.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if(endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                //urlsSet.add(url);
                //domainsSet.add(domain);
                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domain)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domain, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domain);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain

        /*
        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while(i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }
        */

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";
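        // e.g. String.format(FORMATSTR, 1) gives "00001", which becomes the sites/00001 subfolder name below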

        // write out each domain followed in sequence by all urls we found in that domain
        // (each url prefixed with a tab)
        try (
             // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
             BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
             BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
             ) {
            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            while(domainIterator.hasNext()) {
                domainCount++;
                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                String domain = domainIterator.next();
                //seedURLsWriter.write(domain + "\n");
                // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(regexed_domain + "\n");

                // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {

                    // write all sorted unique domains into the global domains file
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into the single global seedurls file,
                    // but write the domain and tabbed urls into the individual sites/0000#/seedURLs.txt
                    // files (and write the regexed domain into each sites/0000#/regex-urlfilter.txt).
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");

                    // next write out the urls for the domain with a tab prefixed to each
                    // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write("\t" + url + "\n");
                    }
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
                }
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
        }

        // write out domains as regular expressions into the "regex-urlfilter.txt" file
        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> i = domainsSet.iterator();
            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while(i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

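    /**
     * Returns true if the given url matches any pattern stored in the given filter list map,
     * applying each entry's match rule (contains, starts with, ends with, or exact match).
     */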
    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        while(i.hasNext()) {
            Map.Entry<String, Integer> entry = i.next();
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is eventually found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Such a mention eventually takes precedence over a mention of the same URL pattern in the blacklist,
     * and is in turn pre-empted if the same pattern is also mentioned in the whitelist.
     */
    public boolean isGreylisted(String url) {
        // TODO: alexa top sites and auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Reads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the
     * given map, recording how urls are later to be checked against it (exact match, prefix, suffix
     * or substring) so we can decide whether a url is in the corresponding black|grey|white list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
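     *
     * For example, a (hypothetical) filter line "^https://example.com$" would be stored as the entry
     * ("https://example.com", LIST_ENTRY_MATCHES), whereas ".pdf$" would be stored as
     * (".pdf", LIST_ENTRY_ENDSWITH).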
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
             ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {

        // Will list all the warc.wet files in the input directory, or else their gzipped versions
        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());

        int wetRecordCount = 0;
        int wetFileCount = 0;

        for(int i = 0; i < WETFiles.length; i++) {
            File WETFile = WETFiles[i];
            logger.debug("Processing WETfile: " + WETFile);

            // Any .gz file listed means it hasn't been unzipped yet. So unzip.
            String WETFilename = WETFile.toString();
            if(WETFilename.endsWith(".gz")) {
                File GZippedWETFile = WETFile;
                String WETGZippedFilename = WETFilename;
                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                WETFile = new File(WETFilename);
                Utility.unzipFile(GZippedWETFile, WETFile);
            }
            // hereafter WETFile should refer to the unzipped version.
            // Check the unzipped WETFile exists

            if(!WETFile.exists() || !WETFile.isFile()) {
                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                return;
            }

            // Finally, we can process this WETFile's records into the keep and discard piles
            wetFileCount++;
            logger.debug("Off to process " + WETFile);
            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
            wetFileProcessor.processWETFile();
            wetRecordCount += wetFileProcessor.getRecordCount();
        }

        // for information purposes
        this.setWETFileCount(wetFileCount);
        this.setRecordCount(wetRecordCount);
    }

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files, or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count the gzipped version if the unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with the gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


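    /** Filename filter that accepts only the individual common crawl subfolders
     * (named CC-MAIN-YYYY-##-wet-files) of the input folder, so that each crawl's
     * WET files can be processed in turn.
     */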
    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            File f = new File(dir, name);
            if(f.isDirectory()) {
                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
                    return true;
                }
            }
            else {
                System.err.println("File " + f + " is not a directory");
            }
            return false;
        }
    }

    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }

        File commoncrawlDir = new File(args[0]);
        if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);

            File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

            for(int i = 0; i < ccrawlFolders.length; i++) {
                File ccrawlFolder = ccrawlFolders[i];
                System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
                ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
            }

            // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls.
            // The domains file is the only unique one; seedURLs and regex-urlfilters are
            // repeated on a per site/domain basis too, stored in the sites folder
            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

        } catch(Exception e) {
            // can get an exception when instantiating the CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}