source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33552

Last change on this file since 33552 was 33552, checked in by ak19, 5 years ago
  1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. Added a global file containing all domains we're going to crawl. 3. WET records we're keeping, which are stored in individual files, now have better filenames.
File size: 24.1 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.log4j.Logger;
/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greyListed folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted;
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, together with the MAX number of
 * characters (not a MINIMUM) that determines whether a string counts as a word. These settings
 * can be adjusted in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included, excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is (see the example below):
 * - precede a URL with ^ to list urls that match the given prefix
 * - follow a URL with $ to list urls that match the given suffix
 * - ^url$ will list urls that match the given url completely
 * - without either the ^ or $ symbol, urls containing the given url will get listed
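 *
 * For example, a url-blacklist-filter.txt might contain entries like the following
 * (illustrative patterns only, not taken from an actual filter file):
 *   ^http://ads.          - discard any url starting with http://ads.
 *   .pdf$                 - discard any url ending in .pdf
 *   ^http://example.com/$ - discard exactly that url
 *   /translated/          - discard any url containing /translated/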
 *
 * In WETProcessor.java's current implementation, explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also used by nutch.
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. A single instance of the WETProcessor class is then used to
 * process each individual unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, pass the log4j and other properties files in the conf/ folder on the classpath:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File commoncrawlDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);

    /**
     * Store url patterns as keys; the values indicate whether a url should
     * match the pattern exactly, start or end with it, or contain it.
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     * There is no need to use a TreeMap, which preserves the natural (alphabetical) ordering
     * of keys, over a HashMap, which has no notion of ordering, because we just need to store
     * urls with their domains. Whether the domains or the urls per domain are sorted becomes
     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html?
     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records processed by all the WETProcessor instances
    // instantiated by our main method, combined
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

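        // The property keys read below live in conf/config.properties; for illustration,
        // that file might contain entries such as (the values shown are just the defaults used here):
        //   WETprocessor.max.word.length=15
        //   WETprocessor.min.num.words=20
        //   WETprocessor.max.words.camelcase=10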
        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /**
     * Using the keepURLs.txt file generated by running the WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch; see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
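     *
     * As an illustration (with a made-up domain), a kept url like https://www.example.org/page.html
     * would contribute the domain www.example.org to domainURLsFile, the url itself to seedURLsFile,
     * the line "+https?://([a-z0-9-]+\.)*www\.example\.org/" to urlFilterFile, and a per-site
     * sites/0000N/seedURLs.txt and sites/0000N/regex-urlfilter.txt pair under the output folder.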
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
        // Maintain Sets of unique domains and urls
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.

        //Set<String> domainsSet = new TreeSet<String>();
        //Set<String> urlsSet = new TreeSet<String>();
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        try (
             BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
             ) {

            // read a URL at a time from keepURLsFile
            String url = null;
            String domain = null;
            while((url = reader.readLine()) != null) { // readLine removes the newline separator

                // work out the domain. This retains any www. or subdomain prefix:
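                // e.g. a (hypothetical) url "https://www.example.org/mi/page.html" yields the domain "www.example.org"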
                int startIndex = url.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if(endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                //urlsSet.add(url);
                //domainsSet.add(domain);
                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domain)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domain, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domain);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain

        /*
        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while(i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }
        */

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";
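        // e.g. String.format(FORMATSTR, 1) gives "00001", which becomes the sites/00001 subfolder name below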

        // write out each domain followed in sequence by all urls we found in that domain
        // (each url prefixed with a tab)
        try (
             // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
             BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
             BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
             ) {
            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            while(domainIterator.hasNext()) {
                domainCount++;
                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                String domain = domainIterator.next();
                //seedURLsWriter.write(domain + "\n");
                // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(regexed_domain + "\n");

                // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {

                    // write all sorted unique domains into the global domains file
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into the single global seedurls file,
                    // but write the domain and tabbed urls into the individual sites/0000#/seedURLs.txt
                    // files (and write the regexed domain into each sites/0000#/regex-urlfilter.txt).
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");

                    // next write out the urls for the domain with a tab prefixed to each
                    // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write("\t" + url + "\n");
                    }
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
                }
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
        }

        // write out domains as regular expressions into the "regex-urlfilter.txt" file
        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> i = domainsSet.iterator();
            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while(i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

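    /**
     * Returns true if the given url matches any pattern stored in the given filter list map,
     * applying each entry's match rule (contains, starts with, ends with, or exact match).
     */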
    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        while(i.hasNext()) {
            Map.Entry<String, Integer> entry = i.next();
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is eventually found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Such a mention eventually takes precedence over a mention of the same URL pattern in the blacklist,
     * and is in turn pre-empted if the same pattern is also mentioned in the whitelist.
     */
    public boolean isGreylisted(String url) {
        // TODO: alexa top sites and auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Reads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the
     * given map, recording how urls are later to be checked against it (exact match, prefix, suffix
     * or substring) so we can decide whether a url is in the corresponding black|grey|white list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
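     *
     * For example, a (hypothetical) filter line "^https://example.com$" would be stored as the entry
     * ("https://example.com", LIST_ENTRY_MATCHES), whereas ".pdf$" would be stored as
     * (".pdf", LIST_ENTRY_ENDSWITH).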
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
             ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {

        // Will list all the warc.wet files in the input directory, or else their gzipped versions
        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());

        int wetRecordCount = 0;
        int wetFileCount = 0;

        for(int i = 0; i < WETFiles.length; i++) {
            File WETFile = WETFiles[i];
            logger.debug("Processing WETfile: " + WETFile);

            // Any .gz file listed means it hasn't been unzipped yet. So unzip.
            String WETFilename = WETFile.toString();
            if(WETFilename.endsWith(".gz")) {
                File GZippedWETFile = WETFile;
                String WETGZippedFilename = WETFilename;
                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                WETFile = new File(WETFilename);
                Utility.unzipFile(GZippedWETFile, WETFile);
            }
            // hereafter WETFile should refer to the unzipped version.
            // Check the unzipped WETFile exists

            if(!WETFile.exists() || !WETFile.isFile()) {
                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                return;
            }

            // Finally, we can process this WETFile's records into the keep and discard piles
            wetFileCount++;
            logger.debug("Off to process " + WETFile);
            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
            wetFileProcessor.processWETFile();
            wetRecordCount += wetFileProcessor.getRecordCount();
        }

        // for information purposes
        this.setWETFileCount(wetFileCount);
        this.setRecordCount(wetRecordCount);
    }

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files, or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count the gzipped version if the unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with the gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


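    /** Filename filter that accepts only the individual common crawl subfolders
     * (named CC-MAIN-YYYY-##-wet-files) of the input folder, so that each crawl's
     * WET files can be processed in turn.
     */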
    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            File f = new File(dir, name);
            if(f.isDirectory()) {
                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
                    return true;
                }
            }
            else {
                System.err.println("File " + f + " is not a directory");
            }
            return false;
        }
    }

    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }

        File commoncrawlDir = new File(args[0]);
        if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);

            File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

            for(int i = 0; i < ccrawlFolders.length; i++) {
                File ccrawlFolder = ccrawlFolders[i];
                System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
                ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
            }

            // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls.
            // The domains file is the only unique one; seedURLs and regex-urlfilters are
            // repeated on a per site/domain basis too, stored in the sites folder
            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

        } catch(Exception e) {
            // can get an exception when instantiating the CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}