source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33568

Last change on this file since 33568 was 33568, checked in by ak19, 5 years ago
  1. More sites greylisted and blacklisted, discovered as I attempted to crawl them and afterwards learnt to investigate sites first. Should all .ru and .pl domains be on the greylist? 2. Adjusted instruction comments in CCWETProcessor for compiling and running
File size: 36.7 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 keep, discard or greyListed folder, and its url is written into a keep, discard
23 or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted else blacklisted
26 * 2. and, whether explicitly whitelisted or else neither greylisted nor blacklisted, if there's
27 enough content. Formerly, content-length and number of lines were used to determine if
28 the content was sufficient. Now it's just the word count, where a MAXIMUM number of characters
29 (not a minimum) determines whether a string counts as a word. These settings can be adjusted
30 in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - precede a URL with ^ to match urls that start with the given prefix
37 * - follow a URL with $ to match urls that end with the given suffix
38 * - ^url$ will match the given url exactly
39 * - without either the ^ or $ symbol, urls containing the given string will match
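 *
 * For example, a url-blacklist-filter.txt might contain hypothetical entries like the following
 * (urls illustrative only):
 * - ^http://example.com/ads/ matches urls starting with that prefix
 * - .pdf$ matches urls ending in .pdf
 * - ^http://example.com/index.html$ matches exactly that url
 * - /translated/ matches any url containing that substring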
40 *
41 * In the current implementation, explicit whitelisting takes precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. A WETProcessor instance is then used to process
49 * each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, passing the log4j and other properties files in conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
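 // These values appear in the second column of sites-too-big-to-exhaustively-crawl.txt in place of
 // an allowed-url-pattern, and are handled in createSeedURLsFiles() below:
 // - SUBDOMAIN-COPY: crawl only the seedURL's own (sub)domain, e.g. pinky.blogspot.com rather than all of blogspot.com
 // - SINGLEPAGE: crawl only the individual seed page(s), with no domain-wide url filter
 // - FOLLOW-LINKS-WITHIN-TOPSITE: don't seed the domain itself, but let the url filter allow links within it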
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
101 private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
102 private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
103 private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
104
105 /**
106 * Store url patterns as keys and values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 System.err.println("Exception attempting to read properties from config.properties.");
153 logger.error("Exception attempting to read properties from config.properties.");
154 e.printStackTrace();
155 }
156
157 if(configProperties.size() == 0) {
158 System.err.println("*** Warning: no values read into config properties. Using defaults.");
159 }
160
161 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
162 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
163 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
164
165
166 this.discardFolder = new File(outFolder, "discard");
167 if(!discardFolder.exists()) {
168 discardFolder.mkdir();
169 }
170 this.keepFolder = new File(outFolder, "keep");
171 if(!keepFolder.exists()) {
172 keepFolder.mkdir();
173 }
174
175 this.greyListedFolder = new File(outFolder, "greylisted");
176 if(!greyListedFolder.exists()) {
177 greyListedFolder.mkdir();
178 }
179
180 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
181 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
182 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
183 }
184 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
185 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
186 throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
187 }
188 this.greyListedFile = new File(outFolder, "greyListed.txt");
189 if(greyListedFile.exists() && !greyListedFile.delete()) {
190 throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
191 }
192
193 // prepare our blacklist, greylist (for inspection) and whitelist
194 System.err.println("Loading blacklist.");
195 blackList = new HashMap<String, Integer>();
196 initURLFilterList(blackList, "url-blacklist-filter.txt");
197
198 System.err.println("Loading greylist.");
199 greyList = new HashMap<String, Integer>();
200 initURLFilterList(greyList, "url-greylist-filter.txt");
201
202 System.err.println("Loading whitelist.");
203 whiteList = new HashMap<String, Integer>();
204 initURLFilterList(whiteList, "url-whitelist-filter.txt");
205
206 // Create the map of topSites
207 System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
208 topSitesMap = new HashMap<String, String>();
209
210 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
211 // and put these in our topSitesMap
212 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
213 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
214 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
215 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
216 .withCommentMarker('#')
217 .withSkipHeaderRecord()
218 .withIgnoreSurroundingSpaces();
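 // Illustrative (hypothetical) rows, assuming the comma-delimited CSVFormat.DEFAULT configured above:
 //   wikipedia.org,mi.wikipedia.org   -> topSitesMap maps "wikipedia.org" to "mi.wikipedia.org"
 //   blogspot.com,SUBDOMAIN-COPY      -> topSitesMap maps "blogspot.com" to "SUBDOMAIN-COPY"
 //   example-topsite.com              -> topSitesMap maps "example-topsite.com" to ""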
219
220 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
221 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
222 try (
223 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
224 ) {
225 for (CSVRecord csvRecord : parser) {
226 String topsite = csvRecord.get(0);
227 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
228 topSitesMap.put(topsite, allowed_url_pattern);
229
230 //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
231
232 }
233 } catch(Exception e) {
234 e.printStackTrace();
235 System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
236 }
237
238
239
240 //System.err.println("Prematurely terminating for testing purposes.");
241 //System.exit(-1);
242 }
243
244 /** Work out the 'domain' for a given url.
245 * This retains any www. or subdomain prefix.
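 * e.g. (illustrative) https://mi.wikipedia.org/wiki/SomePage gives mi.wikipedia.org,
 * or https://mi.wikipedia.org when withProtocol is true.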
246 */
247 private String getDomainForURL(String url, boolean withProtocol) {
248 int startIndex = url.indexOf("//"); // for http:// or https:// prefix
249 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
250 // keep the protocol portion around in case param withProtocol=true
251 String protocol = url.substring(0, startIndex); // empty string when there was no protocol
252
253 String domain = url.substring(startIndex);
254 int endIndex = domain.indexOf("/");
255 if(endIndex == -1) endIndex = domain.length();
256 domain = domain.substring(0, endIndex);
257
258 if(withProtocol) {
259 // now that we have the domain (everything to the first / when there is no protocol)
260 // can glue the protocol back on
261 domain = protocol + domain;
262 }
263
264 return domain;
265 }
266
267 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
268 private String escapeStringForRegex(String str) {
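 // e.g. (illustrative) "nutch.apache.org" becomes "nutch\.apache\.org", so the domain can be
 // embedded literally in the regex lines written to regex-urlfilter.txt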
269 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
270 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
271 str = str.replace(Character.toString(c), "\\"+c);
272 }
273 return str;
274 }
275
276 /**
277 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
278 * as output the URL seed list and regex-urlfilter text files required by nutch, see
279 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
280 */
281 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
282 File domainURLsFile, File topSiteMatchesFile) {
283 // Maintain a Map of unique domains mapped to seed urls at that domain
284 // TreeSet: by default, "the elements are ordered using their natural ordering"
285 // (or by a Comparator provided at set creation time).
286 // Whereas HashSet doesn't guarantee ordering.
287 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
288 // Would be a similar distinction for Maps.
289 domainsToURLsMap = new TreeMap<String, Set<String>>();
290
291 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
292 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
293
294 try (
295 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
296 ) {
297
298 // read a URL at a time from urlsFile
299 String url = null;
300 String domainWithProtocol = null;
301 while((url = reader.readLine()) != null) { // readLine removes newline separator
302
303 // work out domain. This retains any www. or subdomain prefix
304 // passing true to further also retain the http(s) protocol
305 domainWithProtocol = getDomainForURL(url, true);
306
307 Set<String> urlsSet;
308 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
309 urlsSet = new TreeSet<String>();
310 urlsSet.add(url);
311 domainsToURLsMap.put(domainWithProtocol, urlsSet);
312 } else {
313 urlsSet = domainsToURLsMap.get(domainWithProtocol);
314 urlsSet.add(url);
315 }
316
317 }
318 } catch (IOException ioe) {
319 ioe.printStackTrace();
320 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
321 }
322
323 // We'd have pruned out duplicates by now and have a sorted list of domains,
324 // each of which maps to seed URLs in the commoncrawl for that domain
325
326 int domainCount = 0;
327 File sitesFolder = new File(outputFolder, "sites");
328 if(!sitesFolder.exists()) {
329 sitesFolder.mkdir();
330 }
331 final String FORMATSTR = "%05d";
332
333 // write out each domain followed in sequence by all urls we found in that domain
334 // (urls with tab up front)
335 try (
336 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
337 // Also a global file listing any urls that matched top sites that didn't specify
338 // allowed regex patterns
339 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
340 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
341 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
342 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
343 ) {
344
345 // initialise topSiteMatchesFile with some instructional text.
346 topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n");
347 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
348 topSiteMatchesWriter.write("Specify one for each such domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
349
350 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
351 Set<String> domainsSet = domainsToURLsMap.keySet();
352 Iterator<String> domainIterator = domainsSet.iterator();
353
354 /*
355 // DEBUG
356 String value = topSitesMap.get("wikipedia.org");
357 if(value == null) {
358 System.err.println("### wikipedia.org had null value");
359 } else {
360 System.err.println("### wikipedia.org had value: " + value);
361 } // DEBUG
362 */
363
364 while(domainIterator.hasNext()) {
365 String domainWithProtocol = domainIterator.next();
366 // Also get domain without protocol prefix
367 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
368 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
369 String domain = domainWithProtocol.substring(startIndex);
370
371 /*if(domain.contains("docs.google.com")) {
372 System.err.println("domain with protocol: " + domainWithProtocol);
373 System.err.println("domain: " + domain);
374 }*/
375
376 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
377 // If the domain is of a topsite for which no allowed URL pattern has been provided
378 // in sites-too-big-to-exhaustively-crawl.txt,
379 // then we don't know how to crawl the site. Warn the user by writing the affected
380 // domain and seedURLs to the topSiteMatchesFile.
381 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
382
383 // topsite, but we don't (yet) know what portion can be crawled
384 // Append the top site and url to a global/toplevel file that
385 // the user needs to check later and we're done with this domain as it
386 // won't go into any other file hereafter
387
388 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
389 for(String url : urlsForDomainSet) {
390 topSiteMatchesWriter.write("\t" + url + "\n");
391 }
392
393 continue; // done with this domain
394 }
395
396 // start counting the domains we're actually going to process
397 domainCount++;
398
399 String siteID = String.format(FORMATSTR, domainCount);
400 File domainFolder = new File(sitesFolder, siteID);
401 domainFolder.mkdir();
402
403 // write out the domain
404 //seedURLsWriter.write(domainWithProtocol + "\n");
405
406
407 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
408 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
409 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
410 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
411 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
412 try (
413 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
414 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
415 ) {
416
417 // write all sorted unique domains into global domains file
418 // Using the domain without protocol since the global domains file is for
419 // informational purposes
420 domainURLsWriter.write(domain + "\n");
421
422 // Only write urls and no domain into single global seedurls file
423 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
424 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
425 // If we ever run nutch on a single seedURLs listing containing
426 // all seed pages to crawl sites from, the above two files will work for that.
427
428 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
429 // also write into the global seeds file (with a tab prefixed to each?)
430 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
431 for(String url : urlsForDomainSet) {
432 seedURLsWriter.write(url + "\n"); // global seedURLs file
433 siteURLsWriter.write(url + "\n");
434 }
435
436
437 if(allowedURLPatternRegex == null) { // entire site can be crawled
438 siteURLsWriter.write(domainWithProtocol + "\n");
439
440 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
441 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
442 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
443 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
444 urlFilterWriter.write(regexed_domain + "\n"); //global file
445 siteRegexWriter.write(regexed_domain + "\n"); // site file
446 }
447 else { // domain belongs to a top site where only portion of site can be crawled
448
449 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
450 siteURLsWriter.write(domainWithProtocol + "\n");
451 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
452 // and not for all of blogspot.com
453
454 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
455 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
456 urlFilterWriter.write(regexed_domain + "\n");
457 siteRegexWriter.write(regexed_domain + "\n");
458
459 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
460 // don't write out domain. We want individual pages
461 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
462
463 // don't write out domain as a regex expression url filter either,
464 // write out the individual seed urls for the domain instead
465 // since we will only be downloading the single page
466
467 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
468 for(String urlInDomain : urlsForDomainSet) {
469 // don't append slash to end this time
470 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
471 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
472 urlFilterWriter.write(regexed_url + "\n");
473 siteRegexWriter.write(regexed_url + "\n");
474 }
475 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
476
477 // DON'T write out domain into siteURLs file,
478 // BUT DO write it into urlFilter file
479 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
480
481 urlFilterWriter.write(regexed_domain + "\n");
482 siteRegexWriter.write(regexed_domain + "\n");
483 } else { // allowedURLPatternRegex is a url-form - convert to regex
484 if(!allowedURLPatternRegex.endsWith("/")) {
485 allowedURLPatternRegex += "/";
486 }
487 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
488 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
489 siteURLsWriter.write(domainWithProtocol + "\n");
490 urlFilterWriter.write(regexed_pattern + "\n");
491 siteRegexWriter.write(regexed_pattern + "\n");
492
493 }
494 }
495
496 } catch (IOException ioe) {
497 ioe.printStackTrace();
498 System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
499 }
500
501 }
502
503 } catch (IOException ioe) {
504 ioe.printStackTrace();
505 System.err.println("\n@@@@@@@@@ Error writing to one of: ");
506 System.err.println("\t" + seedURLsFile);
507 System.err.println("\t" + urlFilterFile);
508 System.err.println("\t" + domainURLsFile);
509 System.err.println("\t" + topSiteMatchesFile);
510 }
511
512 /*
513 // BEGIN DEBUG
514 System.err.println("@@@@ TopSitesMap contains: ");
515 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
516 String topSite = entry.getKey();
517 String urlPattern = entry.getValue();
518 System.err.println(topSite + " - " + urlPattern);
519 } // END DEBUG
520 */
521 }
522
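 /** Strips the leading (sub)domain segment off a domain string,
 * e.g. (illustrative) pinky.blogspot.com becomes blogspot.com, and blogspot.com becomes com.
 * Returns the string unchanged once there is no further "." in it.
 */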
523 private String stripSubDomain(String url) {
524 int index = url.indexOf(".");
525 if(index != -1) {
526 url = url.substring(index+1);
527 }
528 return url;
529 }
530
531
532 /**
533 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
534 * with or without the www. prefix. This method tests for such a case, as it would be dangerous
535 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
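 * e.g. (illustrative) www.blogspot.com vs the topsite blogspot.com counts as an exact match,
 * whereas pinky.blogspot.com vs blogspot.com does not.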
536 */
537 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
538 // check for an exact match as-is
539 if(seedURLDomain.equals(domain)) {
540 return true;
541 }
542
543 // else check if with or without a www. prefix we have an exact match with domain
544 if(seedURLDomain.startsWith("www.")) {
545 if(seedURLDomain.substring(4).equals(domain)) {
546 return true;
547 }
548 } else {
549 if(domain.equals("www."+seedURLDomain)) {
550 return true;
551 }
552 }
553
554 return false;
555 }
556
557
558 /**
559 * Check if the domain of the seedurl, either in its entirety or when stripped of
560 * www/subdomains, is in the list of top sites.
561 * If it is, and the given url matches the regex for that topsite, then add the url to the
562 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
563 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
564 * @return one of the following values:
565 * - This function returns null if the seedURL's domain does not match any of the topsites.
566 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
567 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
568 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
569 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
570 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
571 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
572 * URL pattern.)
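 * e.g. (illustrative) a seed domain pinky.blogspot.com matches no topsite as-is; once its subdomain
 * is stripped it matches blogspot.com, and if that entry's value is SUBDOMAIN-COPY that value is
 * returned, unless the seed domain had exactly matched blogspot.com (with or without www.), in
 * which case "" is returned and the site goes into the unprocessed-topsite-matches file instead.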
573 */
574 private String isURLinTopSitesMap(String fullSeedDomain) {
575 boolean keepLooping = true;
576
577 String domain = fullSeedDomain;
578
579 // the domain parameter will have retained www. or subdomains, but is stripped of the protocol
580
581 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
582 // if it does, return the value for that topsite domain in the topSitesMap
583 // If no match at all, return null.
584 do {
585
586 String allowed_url_pattern = topSitesMap.get(domain);
587 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
588 // there's an entry for the URL in the topSitesMap
589 System.err.println("##### A top site matches URL domain " + domain);
590
591 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
592 // www prefix, should not exactly match the topSitesMap domain
593 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
594 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
595
596 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
597 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
598 }
599 return allowed_url_pattern;
600 }
601 // else, no entry for the URL in the topSitesMap
602 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
603
604 String newDomain = stripSubDomain(domain);
605 if(domain.equals(newDomain)) {
606 keepLooping = false;
607 } else {
608 domain = newDomain;
609 }
610 } while(keepLooping);
611
612 // url in entirety or stripped of subdomains did not match any of the topsites
613 return null;
614 }
615
616 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
617 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
618 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
619 //while(i.hasNext()) {
620 // Map.Entry<String, Integer> entry = i.next();
621 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
622 String urlPattern = entry.getKey();
623 Integer matchRule = entry.getValue();
624
625 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
626 return true;
627 }
628 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
629 return true;
630 }
631 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
632 return true;
633 }
634 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
635 return true;
636 }
637 // else check the rest of the filter list against this url
638 // before returning false to be certain it's not been listed in the filter list
639 }
640
641 return false;
642 }
643
644 /**
645 * Returns true if the url or pattern is found in the blacklist file.
646 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
647 * it won't get blacklisted after all. But that's not implemented here.
648 */
649 public boolean isBlacklisted(String url) {
650 return isListedInFilterList(blackList, url);
651 }
652
653 /**
654 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
655 * A greylist mention takes precedence over a mention of the same URL pattern in the blacklist,
656 * but is itself overridden if the pattern is also mentioned in the whitelist.
657 */
658 public boolean isGreylisted(String url) {
659 // auto-translated product sites
660 return isListedInFilterList(greyList, url);
661 }
662
663 /**
664 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
665 * A whitelist mention overrides any mention in the blacklist or greylist.
666 */
667 public boolean isWhitelisted(String url) {
668 return isListedInFilterList(whiteList, url);
669 }
670
671 /**
672 * Loads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the given map,
673 * recording whether a url must equal the filter, start with it, end with it, or merely contain it.
674 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
675 * By not having this method deal with actual regex for filters, this has the advantage that
676 * we don't have to remember to escape or double escape each filter to turn it into a regex.
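 * For example, hypothetical filter lines (urls illustrative) would be stored as:
 *   ^http://example.com/index.html$  -> LIST_ENTRY_MATCHES (url must equal the filter)
 *   ^http://example.com/             -> LIST_ENTRY_STARTSWITH
 *   .pdf$                            -> LIST_ENTRY_ENDSWITH
 *   /translated/                     -> LIST_ENTRY_CONTAINS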
677 */
678 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
679
680 // if filterListFilename does not exist in the conf folder, just return
681 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
682 System.err.println(filterListFilename + " does not exist");
683 return;
684 }
685
686 try (
687 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
688 ) {
689 String filter = null;
690 while((filter = reader.readLine()) != null) {
691 // skip comments and empty lines
692 filter = filter.trim();
693 if(filter.equals("") || filter.startsWith("#")) {
694 continue;
695 }
696
697 if(filter.startsWith("^") && filter.endsWith("$")) {
698 filter = filter.substring(1, filter.length()-1);
699 list.put(filter, LIST_ENTRY_MATCHES);
700 }
701 else if(filter.startsWith("^")) {
702 filter = filter.substring(1);
703 list.put(filter, LIST_ENTRY_STARTSWITH);
704 System.err.println("Match filter startswith: " + filter);
705 }
706 else if(filter.endsWith("$")) {
707 filter = filter.substring(0, filter.length()-1);
708 list.put(filter, LIST_ENTRY_ENDSWITH);
709 }
710 else {
711 list.put(filter, LIST_ENTRY_CONTAINS);
712 }
713 //System.err.println("Got filter: " + filter);
714 }
715
716 } catch (IOException ioe) {
717 ioe.printStackTrace();
718 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
719 }
720
721 }
722
723 /** Maintain a count of all WET files processed. */
724 public void setWETFileCount(int count) { this.wetFileCount = count; }
725
726 /** Maintain a count of all WET records processed. */
727 //public int getRecordCount() { return this.totalRecordCount; }
728 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
729 public void setRecordCount(int count) { this.totalRecordCount = count; }
730
731 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
732
733 // Will list all the warc.wet files in the input directory or else their gzipped versions
734 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
735
736 int wetRecordCount = 0;
737 int wetFileCount = 0;
738
739 for(int i = 0; i < WETFiles.length; i++) {
740 File WETFile = WETFiles[i];
741 logger.debug("Processing WETfile: " + WETFile);
742
743 // Any .gz files listed means they haven't been unzipped yet. So unzip.
744 String WETFilename = WETFile.toString();
745 if(WETFilename.endsWith(".gz")) {
746 File GZippedWETFile = WETFile;
747 String WETGZippedFilename = WETFilename;
748 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
749
750 WETFile = new File(WETFilename);
751 Utility.unzipFile(GZippedWETFile, WETFile);
752 }
753 // hereafter all WETFiles should refer to the unzipped version
754 // Check the unzipped WETFile exists
755
756 if(!WETFile.exists() || !WETFile.isFile()) {
757 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
758 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
759 return;
760 }
761
762 // Finally, we can process this WETFile's records into the keep and discard pile
763 wetFileCount++;
764 logger.debug("Off to process " + WETFile);
765 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
766 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
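 // e.g. (illustrative) a folder named CC-MAIN-2019-26-wet-files gives crawlID 2019-26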
767 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
768 wetFileProcessor.processWETFile();
769 wetRecordCount += wetFileProcessor.getRecordCount();
770 }
771
772 // for information purposes
773 this.setWETFileCount(wetFileCount);
774 this.setRecordCount(wetRecordCount);
775 }
776
777
778 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
779 public static void printUsage() {
780 System.err.println("Run this program as:");
781 System.err.println("\tCCWETProcessor <folder containing commoncrawl subfolders of warc.wet(.gz) files> <output folder path>");
782 }
783
784 /** Filename filter to only list warc.wet files or else warc.wet.gz files
785 * for which unzipped warc.wet equivalents don't yet exist.
786 */
787 private static class WETFilenameFilter implements FilenameFilter {
788
789 public boolean accept(File dir, String name) {
790 if(name.endsWith(".warc.wet")) {
791 logger.debug("Will include " + name + " for processing.");
792 return true;
793 }
794
795 if(name.endsWith(".warc.wet.gz")) {
796 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
797 File unzippedVersion = new File(dir, nameWithoutGZext);
798 if(unzippedVersion.exists()) {
799 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
800 logger.debug("Skipping " + name);
801 return false; // don't count gzipped version if unzipped version exists.
802 }
803 else {
804 logger.debug("Only zipped version " + name + " exists.");
805 return true; // No unzipped version, so have to work with gzipped version
806 }
807 }
808
809 // we're not even interested in any other file extensions
810 logger.debug("Not a WET file. Skipping " + name);
811 return false;
812 }
813 }
814
815
816 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
817
818 public boolean accept(File dir, String name) {
819 File f = new File (dir, name);
820 if(f.isDirectory()) {
821 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
822 return true;
823 }
824 }
825 else {
826 System.err.println("File " + f + " is not a directory");
827 }
828 return false;
829 }
830 }
831
832 public static void main(String[] args) {
833 if(args.length != 2) {
834 printUsage();
835 return;
836 }
837
838 File commoncrawlDir = new File(args[0]);
839 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
840 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
841 return;
842 }
843
844 File outFolder = new File(args[1]);
845 if(!outFolder.exists() || !outFolder.isDirectory()) {
846 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
847 return;
848 }
849
850 try {
851 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
852
853 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
854
855 for(int i = 0; i < ccrawlFolders.length; i++) {
856 File ccrawlFolder = ccrawlFolders[i];
857 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
858 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
859 }
860
861 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
862 // The former is the only unique one. seedURLs and regex-urlfilters are
863 // repeated on a per site/domain basis too, stored in the sites folder
864 File seedURLsFile = new File(outFolder, "seedURLs.txt");
865 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
866 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
867 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
868
869 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
870
871 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
872
873 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns was specified in sites-too-big-to-exhaustively-crawl.txt.\n");
874
875
876 } catch(Exception e) {
877 // can get an exception when instantiating CCWETProcessor instance
878 e.printStackTrace();
879 System.err.println(e.getMessage());
880 }
881
882 return;
883
884 }
885}