source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33604

Last change on this file since 33604 was 33604, checked in by ak19, 5 years ago
  1. Better output into possible-product-sites.txt including the overseas country code prefix to help decide whether the site is worth keeping or not. 2. Updated whitelisting and top-sites filters to grab the /mi/ subsections of sites that don't appear to be autotranslated. This is done in preparation for blocking out product sites hereafter
File size: 39.1 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 * keep, discard or greylisted folder, and its url is written into a keep, discard
23 * or greylisted text file, based on:
24 *
25 * 1. whether it's whitelisted, else greylisted, else blacklisted
26 * 2. and whether it's explicitly whitelisted, or else not greylisted or blacklisted, and
27 * there's enough content. Formerly, content-length and number of lines were used to determine
28 * if the content was sufficient. Now it's just the word count, with a MAXIMUM number of
29 * characters (not a minimum) determining whether a string counts as a word. These settings
30 * can be adjusted in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - prefix a URL with ^ to blacklist urls that start with the given prefix
37 * - suffix a URL with $ to blacklist urls that end with the given suffix
38 * - ^url$ will blacklist urls that match the given url exactly
39 * - without either the ^ or $ symbol, urls containing the given string will get blacklisted
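 * For example, hypothetical entries (illustrative only, not copied from the real conf files)
 * in a url-blacklist-filter.txt could be the following, where the parenthesised notes are
 * explanatory and not part of the file:
 *   ^https://example.com/shop          (urls starting with this prefix)
 *   .pdf$                              (urls ending in .pdf)
 *   ^https://example.com/ads.html$     (exactly this url)
 *   aliexpress                         (any url containing this string)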
40 *
41 * The current implementation (see WETProcessor.java) is that explicit whitelisting takes
42 * precedence over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * which is also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. It then uses a separate instance of the WETProcessor class to
49 * process each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, passing the log4j and other properties files in conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Keep the \\ (escape char) at the start of the list so that backslashes inserted when escaping
82 * later characters don't themselves get escaped again (as would happen if \\ came after them)
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
101 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
102 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
103 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
104
105 /**
106 * Store url patterns as keys, and values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records processed by all the WETProcessors
134 // instantiated by our main method, combined
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 error("Exception attempting to read properties from config.properties.", e);
153 }
154
155 if(configProperties.size() == 0) {
156 warn("*** Warning: no values read into config properties. Using defaults.");
157 }
158
159 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
160 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
161 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
162
163
164 this.discardFolder = new File(outFolder, "discard");
165 if(!discardFolder.exists()) {
166 discardFolder.mkdir();
167 }
168 this.keepFolder = new File(outFolder, "keep");
169 if(!keepFolder.exists()) {
170 keepFolder.mkdir();
171 }
172
173 this.greyListedFolder = new File(outFolder, "greylisted");
174 if(!greyListedFolder.exists()) {
175 greyListedFolder.mkdir();
176 }
177
178 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
179 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
180 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
181 }
182 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
183 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
184 throw new Exception ("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
185 }
186 this.greyListedFile = new File(outFolder, "greyListed.txt");
187 if(greyListedFile.exists() && !greyListedFile.delete()) {
188 throw new Exception ("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
189 }
190
191 // prepare our blacklist, greylist (for inspection) and whitelist
192 info("Loading blacklist.");
193 blackList = new HashMap<String, Integer>();
194 initURLFilterList(blackList, "url-blacklist-filter.txt");
195
196 info("Loading greylist.");
197 greyList = new HashMap<String, Integer>();
198 initURLFilterList(greyList, "url-greylist-filter.txt");
199
200 info("Loading whitelist.");
201 whiteList = new HashMap<String, Integer>();
202 initURLFilterList(whiteList, "url-whitelist-filter.txt");
203
204 // Create the map of topSites
205 info("Loading map of topsites with regex of allowable url patterns for each topsite.");
206 topSitesMap = new HashMap<String, String>();
207
208 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
209 // and put these in our topSitesMap
210 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
211 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
212 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
213 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
214 .withCommentMarker('#')
215 .withSkipHeaderRecord()
216 .withIgnoreSurroundingSpaces();
217
218 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
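 // The rows this parser expects would look something like (hypothetical examples, not copied
 // from the real conf file): "blogspot.com,SUBDOMAIN-COPY" or "docs.google.com,SINGLEPAGE" or
 // "wikipedia.org,mi.wikipedia.org" - a topsite domain, optionally followed by one of the fixed
 // values above or an allowed url pattern, in whatever delimited form CSVFormat.DEFAULT parses
 // (comma-separated by default).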
219 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
220 try (
221 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
222 ) {
223 for (CSVRecord csvRecord : parser) {
224 String topsite = csvRecord.get(0);
225 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
226 topSitesMap.put(topsite, allowed_url_pattern);
227
228 //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
229
230 }
231 } catch(Exception e) {
232 error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
233 }
234
235 //debug("Prematurely terminating for testing purposes.");
236 //System.exit(-1);
237 }
238
239 /** Work out the 'domain' for a given url.
240 * This retains any www. or subdomain prefix.
241 */
242 public static String getDomainForURL(String url, boolean withProtocol) {
243 int startIndex = url.indexOf("//"); // for http:// or https:// prefix
244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
245 // keep the protocol portion around in case param withProtocol=true
246 String protocol = url.substring(0, startIndex); // empty string when there was no protocol
247
248 String domain = url.substring(startIndex);
249 int endIndex = domain.indexOf("/");
250 if(endIndex == -1) endIndex = domain.length();
251 domain = domain.substring(0, endIndex);
252
253 if(withProtocol) {
254 // now that we have the domain (everything to the first / when there is no protocol)
255 // can glue the protocol back on
256 domain = protocol + domain;
257 }
258
259 return domain;
260 }
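 // For example (hypothetical url): getDomainForURL("https://www.example.com/mi/page.html", true)
 // returns "https://www.example.com", while withProtocol=false returns just "www.example.com".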
261
262 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
263 private String escapeStringForRegex(String str) {
264 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
265 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
266 str = str.replace(Character.toString(c), "\\"+c);
267 }
268 return str;
269 }
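 // For example (hypothetical input): escapeStringForRegex("example.com/page?id=1") returns
 // "example\.com/page\?id=1", since '.' and '?' are in ESCAPE_CHARS_FOR_RE while '/' and '=' are not.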
270
271 /**
272 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
273 * as output the URL seed list and regex-urlfilter text files required by nutch, see
274 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
275 */
276 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
277 File domainURLsFile, File topSiteMatchesFile,
278 File possibleProductSitesFile) {
279 // Maintain a Map of unique domains mapped to seed urls at that domain
280 // TreeSet: by default, "the elements are ordered using their natural ordering"
281 // (or by a Comparator provided at set creation time).
282 // Whereas HashSet doesn't guarantee ordering.
283 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
284 // Would be a similar distinction for Maps.
285 domainsToURLsMap = new TreeMap<String, Set<String>>();
286
287 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
288 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
289
290 // keep an eye out for URLs we need to inspect later
291 Set<String> possibleProductDomains = new TreeSet<String>();
292 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
293
294 try (
295 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
296 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
297 ) {
298
299 // read a URL at a time from urlsFile
300 String url = null;
301 String domainWithProtocol = null;
302 while((url = reader.readLine()) != null) { // readLine removes newline separator
303
304 // work out domain. This retains any www. or subdomain prefix
305 // passing true to further also retain the http(s) protocol
306 domainWithProtocol = getDomainForURL(url, true);
307
308 Set<String> urlsSet;
309 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
310 urlsSet = new TreeSet<String>();
311 urlsSet.add(url);
312 domainsToURLsMap.put(domainWithProtocol, urlsSet);
313 } else {
314 urlsSet = domainsToURLsMap.get(domainWithProtocol);
315 urlsSet.add(url);
316 }
317
318 // Dr Nichols said that a url that was located outside the country and
319 // which had /mi/ URLs was more likely to be an autotranslated (product) site.
320 // Following Dr Nichols' idea, let's keep a look out for more product sites:
321 // if any URL contains /mi AND the tld of its domain is outside of New Zealand
322 // then add that domain (if not already added) and that url into a file
323 // for later manual inspection
324 if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
325 if(!possibleProductDomains.contains(domainWithProtocol)) {
326
327 String countryCode = "";
328 try {
329 // more expensive test, so do this only if above conditions are true:
330 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
331 System.err.println("@@@@ Got country code: " + countryCode);
332 } catch(Exception exceptObj) {
333 countryCode = ""; // forces domain to be included for inspection
334
335 error("Could not check if domain " + domainWithProtocol
336 + " was in country: " + countryCode,
337 exceptObj);
338 }
339
340 boolean isInNZ = countryCode.toLowerCase().equals("nz");
341
342
343 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
344 if(!isInNZ) {
345 possibleProductDomains.add(domainWithProtocol);
346 // write both domain and a sample URL on that site out to file
347 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
348 possibleProductSitesWriter.write("\t" + url + "\n");
349 }
350 } /*else {
351 // already wrote out domain to file at some point, write just the URL out to file
352 possibleProductSitesWriter.write("\t" + url + "\n");
353 }*/
354 }
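 // Each entry written above has the form "<countryCode> : <domainWithProtocol>", followed on
 // the next line by a tab-indented sample url from that domain, e.g. (hypothetical values):
 //   au : https://www.example-shop.com
 //       https://www.example-shop.com/mi/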
355 }
356 } catch (IOException ioe) {
357 error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
358 }
359
360 // We'd have pruned out duplicates by now and have a sorted list of domains,
361 // each of which maps to seed URLs in the commoncrawl for that domain
362
363 int domainCount = 0;
364 File sitesFolder = new File(outputFolder, "sites");
365 if(!sitesFolder.exists()) {
366 sitesFolder.mkdir();
367 }
368 final String FORMATSTR = "%05d";
369
370 // write out each domain followed in sequence by all urls we found in that domain
371 // (urls with tab up front)
372 try (
373 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
374 // Also a global file listing any urls that matched top sites that didn't specify
375 // allowed regex patterns
376 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
377 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
378 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
379 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
380 ) {
381
382 // initialise topSiteMatchesFile with some instructional text.
383 topSiteMatchesWriter.write("The following domains with seedURLs are on major/top-500 sites\n");
384 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
385 topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
386
387 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
388 Set<String> domainsSet = domainsToURLsMap.keySet();
389 Iterator<String> domainIterator = domainsSet.iterator();
390
391 /*
392 // DEBUG
393 String value = topSitesMap.get("wikipedia.org");
394 if(value == null) {
395 debug("### wikipedia.org had null value");
396 } else {
397 debug("### wikipedia.org had value: " + value);
398 } // DEBUG
399 */
400
401 while(domainIterator.hasNext()) {
402 String domainWithProtocol = domainIterator.next();
403 // Also get domain without protocol prefix
404 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
405 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
406 String domain = domainWithProtocol.substring(startIndex);
407
408 /*if(domain.contains("docs.google.com")) {
409 debug("domain with protocol: " + domainWithProtocol);
410 debug("domain: " + domain);
411 }*/
412
413 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
414 // If the domain is of a topsite for which no allowed URL pattern has been provided
415 // in sites-too-big-to-exhaustively-crawl.txt,
416 // then we don't know how to crawl the site. Warn the user by writing the affected
417 // domain and seedURLs to the topSiteMatchesFile.
418 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
419
420 // topsite, but we don't (yet) know what portion can be crawled
421 // Append the top site and url to a global/toplevel file that
422 // the user needs to check later and we're done with this domain as it
423 // won't go into any other file hereafter
424
425 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
426 for(String url : urlsForDomainSet) {
427 topSiteMatchesWriter.write("\t" + url + "\n");
428 }
429
430 continue; // done with this domain
431 }
432
433 // start counting the domains we're actually going to process
434 domainCount++;
435
436 String siteID = String.format(FORMATSTR, domainCount);
437 File domainFolder = new File(sitesFolder, siteID);
438 domainFolder.mkdir();
439
440 // write out the domain
441 //seedURLsWriter.write(domainWithProtocol + "\n");
442
443
444 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
445 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
446 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
447 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
448 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
449 try (
450 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
451 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
452 ) {
453
454 // write all sorted unique domains into global domains file
455 // Using the domain without protocol since the global domains file is for
456 // informational purposes
457 domainURLsWriter.write(domain + "\n");
458
459 // Only write urls and no domain into single global seedurls file
460 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
461 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
462 // If we ever run nutch on a single seedURLs listing containing
463 // all seed pages to crawl sites from, the above two files will work for that.
464
465 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
466 // also write into the global seeds file (with a tab prefixed to each?)
467 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
468 for(String url : urlsForDomainSet) {
469 seedURLsWriter.write(url + "\n"); // global seedURLs file
470 siteURLsWriter.write(url + "\n");
471 }
472
473 if(allowedURLPatternRegex == null) { // entire site can be crawled
474 siteURLsWriter.write(domainWithProtocol + "\n");
475
476 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
477 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
478 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
479 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
480 urlFilterWriter.write(regexed_domain + "\n"); //global file
481 siteRegexWriter.write(regexed_domain + "\n"); // site file
482 }
483 else { // domain belongs to a top site where only portion of site can be crawled
484
485 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
486 siteURLsWriter.write(domainWithProtocol + "\n");
487 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
488 // and not for all of blogspot.com
489
490 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
491 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
492 urlFilterWriter.write(regexed_domain + "\n");
493 siteRegexWriter.write(regexed_domain + "\n");
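 // e.g. a (hypothetical) seed domain pinky.blogspot.com would produce the filter
 //   +^https?://pinky\.blogspot\.com/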
494
495 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
496 // don't write out domain. We want individual pages
497 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
498
499 // don't write out domain as a regex expression url filter either,
500 // write out the individual seed urls for the domain instead
501 // since we will only be downloading the single page
502
503 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
504 for(String urlInDomain : urlsForDomainSet) {
505 // don't append slash to end this time
506 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
507 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
508 urlFilterWriter.write(regexed_url + "\n");
509 siteRegexWriter.write(regexed_url + "\n");
510 }
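 // e.g. a (hypothetical) single-page seed url https://example.com/mi/index.html would
 // produce the filter +^https://example\.com/mi/index\.html (note: no trailing slash appended)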
511 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
512
513 // DON'T write out domain into siteURLs file,
514 // BUT DO write it into urlFilter file
515 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
516
517 urlFilterWriter.write(regexed_domain + "\n");
518 siteRegexWriter.write(regexed_domain + "\n");
519 } else { // allowedURLPatternRegex is a url-form - convert to regex
520 if(!allowedURLPatternRegex.endsWith("/")) {
521 allowedURLPatternRegex += "/";
522 }
523 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
524 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
525 siteURLsWriter.write(domainWithProtocol + "\n");
526 urlFilterWriter.write(regexed_pattern + "\n");
527 siteRegexWriter.write(regexed_pattern + "\n");
528
529 }
530 }
531
532 } catch (IOException ioe) {
533 error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
534 }
535
536 }
537
538 } catch (IOException ioe) {
539 error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
540 + "\n\t" + urlFilterFile
541 + "\n\t" + domainURLsFile
542 + "\n\t" + topSiteMatchesFile, ioe);
543 }
544
545 /*
546 // BEGIN DEBUG
547 debug("@@@@ TopSitesMap contains: ");
548 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
549 String topSite = entry.getKey();
550 String urlPattern = entry.getValue();
551 debug(topSite + " - " + urlPattern);
552 } // END DEBUG
553 */
554 }
555
556 private String stripSubDomain(String url) {
557 int index = url.indexOf(".");
558 if(index != -1) {
559 url = url.substring(index+1);
560 }
561 return url;
562 }
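 // e.g. stripSubDomain("mi.wikipedia.org") returns "wikipedia.org", and
 // stripSubDomain("wikipedia.org") returns "org" (hypothetical inputs)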
563
564
565 /**
566 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
567 * with or without www. prefix. This method tests for such as case as it would be dangerous
568 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
569 */
570 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
571 // check for an exact match as-is
572 if(seedURLDomain.equals(domain)) {
573 return true;
574 }
575
576 // else check if with or without a www. prefix we have an exact match with domain
577 if(seedURLDomain.startsWith("www.")) {
578 if(seedURLDomain.substring(4).equals(domain)) {
579 return true;
580 }
581 } else {
582 if(domain.equals("www."+seedURLDomain)) {
583 return true;
584 }
585 }
586
587 return false;
588 }
589
590
591 /**
592 * Check if the domain of the seedurl, either in its entirety or when stripped of
593 * www/subdomains, is in the list of top sites.
594 * If it is, and the given url matches the regex for that topsite, then add the url to the
595 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
596 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
597 * @return one of the following values:
598 * - This function returns null if the seedURL's domain does not match any of the topsites.
599 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
600 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
601 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
602 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
603 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
604 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE",
605 * "FOLLOW-LINKS-WITHIN-TOPSITE" or an allowed URL pattern.)
606 */
607 private String isURLinTopSitesMap(String fullSeedDomain) {
608 boolean keepLooping = true;
609
610 String domain = fullSeedDomain;
611
612 // the domain parameter will have retained www or subdomains, but is stripped of the protocol
613
614 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
615 // if it does, return the value for that topsite domain in the topSitesMap
616 // If no match at all, return null.
617 do {
618
619 String allowed_url_pattern = topSitesMap.get(domain);
620 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
621 // there's an entry for the URL in the topSitesMap
622 debug("##### A top site matches URL domain " + domain);
623
624 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
625 // www prefix, should not exactly match the topSitesMap domain
626 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
627 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
628
629 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
630 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
631 }
632 return allowed_url_pattern;
633 }
634 // else, no entry for the URL in the topSitesMap
635 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
636
637 String newDomain = stripSubDomain(domain);
638 if(domain.equals(newDomain)) {
639 keepLooping = false;
640 } else {
641 domain = newDomain;
642 }
643 } while(keepLooping);
644
645 // url in entirety or stripped of subdomains did not match any of the topsites
646 return null;
647 }
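 // Worked example (hypothetical): for fullSeedDomain "pinky.blogspot.com", the lookup of
 // "pinky.blogspot.com" finds no entry, so the subdomain is stripped and "blogspot.com" is
 // looked up. If blogspot.com maps to SUBDOMAIN-COPY, that value is returned, since
 // "pinky.blogspot.com" is not an exact match for "blogspot.com". A seed domain of exactly
 // "blogspot.com" or "www.blogspot.com" would instead return "" (don't crawl, list for inspection).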
648
649 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
650 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
651 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
652 //while(i.hasNext()) {
653 // Map.Entry<String, Integer> entry = i.next();
654 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
655 String urlPattern = entry.getKey();
656 Integer matchRule = entry.getValue();
657
658 if(matchRule.equals(LIST_ENTRY_CONTAINS) && url.contains(urlPattern)) {
659 return true;
660 }
661 else if(matchRule.equals(LIST_ENTRY_STARTSWITH) && url.startsWith(urlPattern)) {
662 return true;
663 }
664 else if(matchRule.equals(LIST_ENTRY_ENDSWITH) && url.endsWith(urlPattern)) {
665 return true;
666 }
667 else if(matchRule.equals(LIST_ENTRY_MATCHES) && url.equals(urlPattern)) {
668 return true;
669 }
670 // else check the rest of the filter list against this url
671 // before returning false to be certain it's not been listed in the filter list
672 }
673
674 return false;
675 }
676
677 /**
678 * Returns true if the url or pattern is found in the blacklist file.
679 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
680 * it won't get blacklisted after all. But that's not implemented here.
681 */
682 public boolean isBlacklisted(String url) {
683 boolean isBlackListed = isListedInFilterList(blackList, url);
684
685 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
686 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
687 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
688 if(!isBlackListed && url.contains("jasmin")) {
689 warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
690 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
691 }
692 return isBlackListed;
693 }
694
695 /**
696 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
697 * A greylist mention will eventually take precedence over a mention of the same URL pattern
698 * in the blacklist, but is in turn pre-empted by the whitelist if the pattern is whitelisted too.
699 */
700 public boolean isGreylisted(String url) {
701 // auto-translated product sites
702 return isListedInFilterList(greyList, url);
703 }
704
705 /**
706 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
707 * A whitelist mention moreover overrides any mention in the blacklist or greylist.
708 */
709 public boolean isWhitelisted(String url) {
710 return isListedInFilterList(whiteList, url);
711 }
712
713 /**
714 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
715 * whether it is in the mentioned black|grey|white list.
716 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
717 * By not having this method deal with actual regex for filters, this has the advantage that
718 * we don't have to remember to escape or double escape each filter to turn it into a regex.
719 */
720 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
721
722 // if filterListFilename does not exist in the conf folder, just return
723 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
724 warn("Filter list filename: " + filterListFilename + " does not exist");
725 return;
726 }
727
728 try (
729 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
730 ) {
731 String filter = null;
732 while((filter = reader.readLine()) != null) {
733 // skip comments and empty lines
734 filter = filter.trim();
735 if(filter.equals("") || filter.startsWith("#")) {
736 continue;
737 }
738
739 if(filter.startsWith("^") && filter.endsWith("$")) {
740 filter = filter.substring(1, filter.length()-1);
741 list.put(filter, LIST_ENTRY_MATCHES);
742 }
743 else if(filter.startsWith("^")) {
744 filter = filter.substring(1);
745 list.put(filter, LIST_ENTRY_STARTSWITH);
746 //debug("Match filter startswith: " + filter);
747 }
748 else if(filter.endsWith("$")) {
749 filter = filter.substring(0, filter.length()-1);
750 list.put(filter, LIST_ENTRY_ENDSWITH);
751 //debug("@@@ Match filter endswith: " + filter);
752 }
753 else {
754 list.put(filter, LIST_ENTRY_CONTAINS);
755 }
756 //debug("Got filter: " + filter);
757 }
758
759 } catch (IOException ioe) {
760 error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
761 }
762
763 }
764
765 /** Maintain a count of all WET files processed. */
766 public void setWETFileCount(int count) { this.wetFileCount = count; }
767
768 /** Maintain a count of all WET records processed. */
769 //public int getRecordCount() { return this.totalRecordCount; }
770 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
771 public void setRecordCount(int count) { this.totalRecordCount = count; }
772
773 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
774
775 // Will list all the warc.wet files in the input directory or else their gzipped versions
776 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
777
778 int wetRecordCount = 0;
779 int wetFileCount = 0;
780
781 for(int i = 0; i < WETFiles.length; i++) {
782 File WETFile = WETFiles[i];
783 debug("Processing WETfile: " + WETFile);
784
785 // Any .gz files listed means they haven't been unzipped yet. So unzip.
786 String WETFilename = WETFile.toString();
787 if(WETFilename.endsWith(".gz")) {
788 File GZippedWETFile = WETFile;
789 String WETGZippedFilename = WETFilename;
790 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
791
792 WETFile = new File(WETFilename);
793 Utility.unzipFile(GZippedWETFile, WETFile);
794 }
795 // hereafter all WETFiles should refer to the unzipped version
796 // Check the unzipped WETFile exists
797
798 if(!WETFile.exists() || !WETFile.isFile()) {
799 error("Error: " + WETFile + " does not exist (failure to unzip?)");
800 return;
801 }
802
803 // Finally, we can process this WETFile's records into the keep and discard pile
804 wetFileCount++;
805 debug("Off to process " + WETFile);
806 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
807 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
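 // e.g. a folder named CC-MAIN-2019-26-wet-files (hypothetical) would give crawlID "2019-26"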
808 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
809 wetFileProcessor.processWETFile();
810 wetRecordCount += wetFileProcessor.getRecordCount();
811 }
812
813 // for information purposes
814 this.setWETFileCount(wetFileCount);
815 this.setRecordCount(wetRecordCount);
816 }
817
818
819 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
820 public static void info(String msg) {
821 System.err.println(msg);
822 logger.info(msg);
823 }
824 public static void debug(String msg) {
825 System.err.println(msg);
826 logger.debug(msg);
827 }
828 public static void warn(String msg) {
829 System.err.println(msg);
830 logger.warn(msg);
831 }
832 public static void error(String msg) {
833 System.err.println(msg);
834 logger.error(msg);
835 }
836 public static void error(String msg, Exception e) {
837 logger.error(msg, e);
838 System.err.println("\n"+msg);
839 e.printStackTrace();
840 }
841
842 public static void printUsage() {
843 info("Run this program as:");
844 info("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
845 }
846
847 /** Filename filter to only list warc.wet files or else warc.wet.gz files
848 * for which unzipped warc.wet equivalents don't yet exist.
849 */
850 private static class WETFilenameFilter implements FilenameFilter {
851
852 public boolean accept(File dir, String name) {
853 if(name.endsWith(".warc.wet")) {
854 debug("Will include " + name + " for processing.");
855 return true;
856 }
857
858 if(name.endsWith(".warc.wet.gz")) {
859 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
860 File unzippedVersion = new File(dir, nameWithoutGZext);
861 if(unzippedVersion.exists()) {
862 debug("--- Unzipped version " + unzippedVersion + " exists.");
863 debug("Skipping " + name);
864 return false; // don't count gzipped version if unzipped version exists.
865 }
866 else {
867 debug("Only zipped version " + name + " exists.");
868 return true; // No unzipped version, so have to work with gzipped version
869 }
870 }
871
872 // we're not even interested in any other file extensions
873 debug("Not a WET file. Skipping " + name);
874 return false;
875 }
876 }
877
878
879 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
880
881 public boolean accept(File dir, String name) {
882 File f = new File (dir, name);
883 if(f.isDirectory()) {
884 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
885 return true;
886 }
887 }
888 else {
889 info("File " + f + " is not a directory");
890 }
891 return false;
892 }
893 }
894
895 public static void main(String[] args) {
896 if(args.length != 2) {
897 printUsage();
898 return;
899 }
900
901 File commoncrawlDir = new File(args[0]);
902 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
903 error("Error: " + args[0] + " does not exist or is not a directory");
904 return;
905 }
906
907 File outFolder = new File(args[1]);
908 if(!outFolder.exists() || !outFolder.isDirectory()) {
909 error("Error: " + args[1] + " does not exist or is not a directory.");
910 return;
911 }
912
913 try {
914 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
915
916 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
917
918 for(int i = 0; i < ccrawlFolders.length; i++) {
919 File ccrawlFolder = ccrawlFolders[i];
920 info("About to process commoncrawl WET files folder: " + ccrawlFolder);
921 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
922 }
923
924 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
925 // Only the domains file is unique to this top level. The seedURLs and regex-urlfilters are
926 // repeated on a per site/domain basis too, stored in the sites folder
927 File seedURLsFile = new File(outFolder, "seedURLs.txt");
928 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
929 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
930 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
931 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
932
933 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
934
935 info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
936
937 info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
938
939
940 } catch(Exception e) {
941 // can get an exception when instantiating CCWETProcessor instance
942 error(e.getMessage(), e);
943 }
944
945 return;
946
947 }
948}