source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33565

Last change on this file since 33565 was 33565, checked in by ak19, 5 years ago

CCWETProcessor: 1. domain url now goes in as a seedURL after the individual seedURLs, after Dr Bainbridge explained why the original ordering didn't make sense. 2. conf: we inspected the first site to be crawled. It was a non-top site, but we still wanted to control the crawling of it in the same way we control topsites. 3. Documented use of the nutch command for testing which urls pass and fail the existing regex-urlfilter checks.

File size: 36.6 KB
package org.greenstone.atea;


import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
import org.apache.log4j.Logger;


/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greylisted folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, where a MAX (not MIN) number of
 * characters determines whether a string counts as a word. These settings can be adjusted
 * in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included or excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is (see the illustrative example below):
 * - precede a URL pattern with ^ to blacklist urls that start with that pattern
 * - follow a URL pattern with $ to blacklist urls that end with that pattern
 * - ^url$ will blacklist urls that match the given url exactly
 * - without either ^ or $, urls containing the given pattern will get blacklisted
 *
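 * For example, a url-blacklist-filter.txt could contain entries like the following
 * (purely illustrative URLs, not taken from the actual conf folder):
 *   ^http://ads.example.com           - blacklist any url starting with this prefix
 *   .jpg$                             - blacklist any url ending with this suffix
 *   ^http://example.com/landing.html$ - blacklist exactly this url
 *   autotranslate                     - blacklist any url containing this substring
 *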
 * In WETProcessor.java's current implementation, explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also for nutch.
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. It then uses a single instance of the WETProcessor class to
 * process each unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
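    // How each value is acted upon when building seed and filter files (summarised from
    // createSeedURLsFiles() below):
    //   SUBDOMAIN-COPY: use the seedURL's own (sub)domain as the crawl filter, rather than
    //                   opening up the entire topsite (e.g. pinky.blogspot.com, not blogspot.com)
    //   SINGLEPAGE: crawl only the individual seed page(s); no domain-wide filter is written
    //   FOLLOW-LINKS-WITHIN-TOPSITE: allow following links anywhere within the topsite domain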
    public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
    public final String SINGLEPAGE = "SINGLEPAGE";
    public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";

    /**
     * Characters that need escaping if used as a string literal in a regex
     * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
     * https://www.regular-expressions.info/refcharacters.html
     * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
     * as would happen for any chars appearing earlier in this list than \\
     */
    public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
    //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File commoncrawlDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);

    /**
     * Store url patterns as keys and values indicating whether a url should
     * match it exactly, start/end with it, or contain it
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of topsites with allowable regexes: sites too big to exhaustively crawl,
     * each with an optional regex defining allowed exceptions, like subdomains or url suffixes
     * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
     * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
     * would be relevant.
     * The map stores the top site domain suffix and an optional regex string for allowable
     * url patterns.
     */
    private HashMap<String, String> topSitesMap;

    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
     * while a HashMap has no notion of ordering, because we just need to store urls with
     * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
     * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
     * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
     * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records that all WETProcessors instantiated
    // by our main method combined have processed
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.commoncrawlDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        // Create the map of topSites
        System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
        topSitesMap = new HashMap<String, String>();

        // Read in our csv file of topsites and what to do when one hits a match with a seedURL
        // and put these in our topSitesMap
        // https://commons.apache.org/proper/commons-csv/apidocs/index.html
        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
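        // Each record pairs a topsite domain with an optional second field: either an allowed
        // url pattern or one of the fixed values defined above. Illustrative records (not taken
        // from the actual conf file) could be ("wikipedia.org", "mi.wikipedia.org"),
        // ("blogspot.com", SUBDOMAIN-COPY) or ("docs.google.com", SINGLEPAGE).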
        CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
            .withCommentMarker('#')
            .withSkipHeaderRecord()
            .withIgnoreSurroundingSpaces();

        File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
        // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
        try (
             CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
             ) {
            for (CSVRecord csvRecord : parser) {
                String topsite = csvRecord.get(0);
                String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
                topSitesMap.put(topsite, allowed_url_pattern);

                //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);

            }
        } catch(Exception e) {
            e.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
        }



        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /** Work out the 'domain' for a given url.
     * This retains any www. or subdomain prefix.
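     * For example (illustrative input, not from the codebase):
     *   getDomainForURL("https://mi.example.org/rangi/page.html", true) returns "https://mi.example.org"
     *   getDomainForURL("https://mi.example.org/rangi/page.html", false) returns "mi.example.org"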
     */
    private String getDomainForURL(String url, boolean withProtocol) {
        int startIndex = url.indexOf("//"); // for the http:// or https:// prefix
        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
        // then keep the protocol portion around in case param withProtocol=true
        String protocol = url.substring(0, startIndex); // empty string when there was no protocol

        String domain = url.substring(startIndex);
        int endIndex = domain.indexOf("/");
        if(endIndex == -1) endIndex = domain.length();
        domain = domain.substring(0, endIndex);

        if(withProtocol) {
            // now that we have the domain (everything up to the first / after any protocol)
            // we can glue the protocol back on
            domain = protocol + domain;
        }

        return domain;
    }

    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
    private String escapeStringForRegex(String str) {
        for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
            char c = ESCAPE_CHARS_FOR_RE.charAt(i);
            str = str.replace(Character.toString(c), "\\"+c);
        }
        return str;
    }
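    // Illustrative behaviour of escapeStringForRegex (hypothetical inputs, shown only for clarity):
    //   escapeStringForRegex("nutch.apache.org") returns "nutch\.apache\.org"
    //   escapeStringForRegex("example.com/page?id=1") returns "example\.com/page\?id=1"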

    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
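     *
     * For example (illustrative url), a keepURLs.txt line of https://mi.example.org/page.html
     * contributes that url to seedURLs.txt, its domain mi.example.org to the domains file, and
     * a filter line of the form +^https?://([a-z0-9-]+\.)*mi\.example\.org/ to regex-urlfilter.txt.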
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
                                    File domainURLsFile, File topSiteMatchesFile) {
        // Maintain a Map of unique domains mapped to seed urls at that domain
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
        // Would be a similar distinction for Maps.
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String PROTOCOL_REGEX_PREFIX = "+^https?://";
        final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

        try (
             BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
             ) {

            // read a URL at a time from urlsFile
            String url = null;
            String domainWithProtocol = null;
            while((url = reader.readLine()) != null) { // readLine removes newline separator

                // work out domain. This retains any www. or subdomain prefix
                // passing true to further also retain the http(s) protocol
                domainWithProtocol = getDomainForURL(url, true);

                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domainWithProtocol, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domainWithProtocol);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        // We'd have pruned out duplicates by now and have a sorted list of domains,
        // each of which maps to seed URLs in the commoncrawl for that domain

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";

        // write out each domain followed in sequence by all urls we found in that domain
        // (urls with tab up front)
        try (
             // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
             // Also a global file listing any urls that matched top sites that didn't specify
             // allowed regex patterns
             BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
             BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
             BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
             ) {

            // initialise topSiteMatchesFile with some instructional text.
            topSiteMatchesWriter.write("The following domains with seedURLs are on major/top 500 sites\n");
            topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
            topSiteMatchesWriter.write("Specify one for each domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");

            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            /*
            // DEBUG
            String value = topSitesMap.get("wikipedia.org");
            if(value == null) {
                System.err.println("### wikipedia.org had null value");
            } else {
                System.err.println("### wikipedia.org had value: " + value);
            } // DEBUG
            */

            while(domainIterator.hasNext()) {
                String domainWithProtocol = domainIterator.next();
                // Also get domain without protocol prefix
                int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                String domain = domainWithProtocol.substring(startIndex);

                /*if(domain.contains("docs.google.com")) {
                    System.err.println("domain with protocol: " + domainWithProtocol);
                    System.err.println("domain: " + domain);
                }*/

                String allowedURLPatternRegex = isURLinTopSitesMap(domain);
                // If the domain is of a topsite for which no allowed URL pattern has been provided
                // in sites-too-big-to-exhaustively-crawl.txt,
                // then we don't know how to crawl the site. Warn the user by writing the affected
                // domain and seedURLs to the topSiteMatchesFile.
                if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {

                    // topsite, but we don't (yet) know what portion can be crawled
                    // Append the top site and url to a global/toplevel file that
                    // the user needs to check later and we're done with this domain as it
                    // won't go into any other file hereafter

                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    for(String url : urlsForDomainSet) {
                        topSiteMatchesWriter.write("\t" + url + "\n");
                    }

                    continue; // done with this domain
                }

                // start counting the domains we're actually going to process
                domainCount++;

                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                //seedURLsWriter.write(domainWithProtocol + "\n");


                // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
                // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {

                    // write all sorted unique domains into the global domains file
                    // Using the domain without protocol since the global domains file is for
                    // informational purposes
                    domainURLsWriter.write(domain + "\n");

                    // Only write urls and no domain into single global seedurls file
                    // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
                    // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
                    // If we ever run nutch on a single seedURLs listing containing
                    // all seed pages to crawl sites from, the above two files will work for that.

                    // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
                    // also write into the global seeds file (with a tab prefixed to each?)
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                    for(String url : urlsForDomainSet) {
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write(url + "\n");
                    }


                    if(allowedURLPatternRegex == null) { // entire site can be crawled
                        siteURLsWriter.write(domainWithProtocol + "\n");

                        // Write out filter in the following form for a site, e.g. for nutch.apache.org:
                        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                        String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
                        //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                        urlFilterWriter.write(regexed_domain + "\n"); //global file
                        siteRegexWriter.write(regexed_domain + "\n"); // site file
                    }
                    else { // domain belongs to a top site where only portion of site can be crawled

                        if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
                            // and not for all of blogspot.com

                            String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
                            //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");

                        } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
                            // don't write out domain. We want individual pages
                            //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

                            // don't write out domain as a regex expression url filter either,
                            // write out the individual seed urls for the domain instead
                            // since we will only be downloading the single page

                            urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
                            for(String urlInDomain : urlsForDomainSet) {
                                // don't append slash to end this time
                                String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
                                //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
                                urlFilterWriter.write(regexed_url + "\n");
                                siteRegexWriter.write(regexed_url + "\n");
                            }
                        } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {

                            // DON'T write out domain into siteURLs file,
                            // BUT DO write it into urlFilter file
                            String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";

                            urlFilterWriter.write(regexed_domain + "\n");
                            siteRegexWriter.write(regexed_domain + "\n");
                        } else { // allowedURLPatternRegex is a url-form - convert to regex
                            if(!allowedURLPatternRegex.endsWith("/")) {
                                allowedURLPatternRegex += "/";
                            }
                            String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
                            //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
                            siteURLsWriter.write(domainWithProtocol + "\n");
                            urlFilterWriter.write(regexed_pattern + "\n");
                            siteRegexWriter.write(regexed_pattern + "\n");

                        }
                    }

                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to one of: " + siteSeedsFile + " or " + siteRegexFile);
                }

            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to one of: ");
            System.err.println("\t" + seedURLsFile);
            System.err.println("\t" + urlFilterFile);
            System.err.println("\t" + domainURLsFile);
            System.err.println("\t" + topSiteMatchesFile);
        }

        /*
        // BEGIN DEBUG
        System.err.println("@@@@ TopSitesMap contains: ");
        for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
            String topSite = entry.getKey();
            String urlPattern = entry.getValue();
            System.err.println(topSite + " - " + urlPattern);
        } // END DEBUG
        */
    }

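    // Strips the leading label off a dotted hostname,
    // e.g. (illustrative) stripSubDomain("mi.wikipedia.org") returns "wikipedia.org".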
    private String stripSubDomain(String url) {
        int index = url.indexOf(".");
        if(index != -1) {
            url = url.substring(index+1);
        }
        return url;
    }


    /**
     * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
     * with or without the www. prefix. This method tests for such a case, as it would be dangerous
     * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
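     * For example (illustrative): isExactDomainMatch("www.blogspot.com", "blogspot.com") returns
     * true, whereas isExactDomainMatch("pinky.blogspot.com", "blogspot.com") returns false.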
     */
    private boolean isExactDomainMatch(String seedURLDomain, String domain) {
        // check for an exact match as-is
        if(seedURLDomain.equals(domain)) {
            return true;
        }

        // else check if with or without a www. prefix we have an exact match with domain
        if(seedURLDomain.startsWith("www.")) {
            if(seedURLDomain.substring(4).equals(domain)) {
                return true;
            }
        } else {
            if(domain.equals("www."+seedURLDomain)) {
                return true;
            }
        }

        return false;
    }


    /**
     * Check if the domain of the seedurl, either in its entirety or when stripped of
     * www/subdomains, is in the list of top sites.
     * The caller uses the returned value to decide how (or whether) the matched topsite
     * should be crawled and which entries to write to the url regex filter file.
     * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
     * @return one of the following values:
     * - This function returns null if the seedURL's domain does not match any of the topsites.
     * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
     * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
     * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
     * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
     * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
     * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE",
     * "FOLLOW-LINKS-WITHIN-TOPSITE" or an allowed URL pattern.)
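     *
     * For example (hypothetical map entry): if topSitesMap maps "wikipedia.org" to
     * "mi.wikipedia.org", then a fullSeedDomain of "mi.wikipedia.org" is looked up as-is first,
     * then stripped to "wikipedia.org", where it matches and the value "mi.wikipedia.org" is returned.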
     */
    private String isURLinTopSitesMap(String fullSeedDomain) {
        boolean keepLooping = true;

        String domain = fullSeedDomain;

        // the domain parameter will have retained www or subdomains, but is stripped of protocol

        // keep looping, stripping subdomains from url and checking if it matches a topsite domain
        // if it does, return the value for that topsite domain in the topSitesMap
        // If no match at all, return null.
        do {

            String allowed_url_pattern = topSitesMap.get(domain);
            if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
                // there's an entry for the URL in the topSitesMap
                System.err.println("##### A top site matches URL domain " + domain);

                // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
                // www prefix, should not exactly match the topSitesMap domain
                // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
                // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.

                if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
                    return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
                }
                return allowed_url_pattern;
            }
            // else, no entry for the URL in the topSitesMap
            // We're not done yet: strip subDomain from URL and check it against topSitesMap again

            String newDomain = stripSubDomain(domain);
            if(domain.equals(newDomain)) {
                keepLooping = false;
            } else {
                domain = newDomain;
            }
        } while(keepLooping);

        // url in entirety or stripped of subdomains did not match any of the topsites
        return null;
    }

    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        //while(i.hasNext()) {
        //  Map.Entry<String, Integer> entry = i.next();
        for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is eventually found in the greylist or whitelist too,
     * it won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * A greylist mention eventually takes precedence over a mention of the same URL pattern in the blacklist.
     * It is in turn pre-empted by the whitelist if the pattern is mentioned there as well.
     */
    public boolean isGreylisted(String url) {
        // auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Checks the URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
     * whether it is in the mentioned black|grey|white list.
     * Filters aren't actual regexes; only ^ and $ are treated specially, as start and end anchors.
     * By not having this method deal with actual regexes for filters, we have the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
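     *
     * For example (illustrative filter lines): "^https://example.org" is stored with
     * LIST_ENTRY_STARTSWITH, "translate.example.org$" with LIST_ENTRY_ENDSWITH,
     * "^https://example.org/page.html$" with LIST_ENTRY_MATCHES, and a bare "translate"
     * with LIST_ENTRY_CONTAINS.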
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
             BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
             ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {

        // Will list all the warc.wet files in the input directory or else their gzipped versions
        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());

        int wetRecordCount = 0;
        int wetFileCount = 0;

        for(int i = 0; i < WETFiles.length; i++) {
            File WETFile = WETFiles[i];
            logger.debug("Processing WETfile: " + WETFile);

            // Any .gz file listed means it hasn't been unzipped yet. So unzip.
            String WETFilename = WETFile.toString();
            if(WETFilename.endsWith(".gz")) {
                File GZippedWETFile = WETFile;
                String WETGZippedFilename = WETFilename;
                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                WETFile = new File(WETFilename);
                Utility.unzipFile(GZippedWETFile, WETFile);
            }
            // hereafter all WETFiles should refer to the unzipped version
            // Check the unzipped WETFile exists

            if(!WETFile.exists() || !WETFile.isFile()) {
                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                return;
            }

            // Finally, we can process this WETFile's records into the keep and discard pile
            wetFileCount++;
            logger.debug("Off to process " + WETFile);
            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
            wetFileProcessor.processWETFile();
            wetRecordCount += wetFileProcessor.getRecordCount();
        }

        // for information purposes
        this.setWETFileCount(wetFileCount);
        this.setRecordCount(wetRecordCount);
    }


    // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count gzipped version if unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            File f = new File (dir, name);
            if(f.isDirectory()) {
                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
                    return true;
                }
            }
            else {
                System.err.println("File " + f + " is not a directory");
            }
            return false;
        }
    }

    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }

        File commoncrawlDir = new File(args[0]);
        if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);

            File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());

            for(int i = 0; i < ccrawlFolders.length; i++) {
                File ccrawlFolder = ccrawlFolders[i];
                System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
                ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
            }

            // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
            // The former is the only unique one. seedURLs and regex-urlfilters are
            // repeated on a per site/domain basis too, stored in the sites folder
            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
            File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");

            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

            System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");


        } catch(Exception e) {
            // can get an exception when instantiating CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}