source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33615

Last change on this file since 33615 was 33615, checked in by ak19, 4 years ago
  1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some Mongodb links under MoreReading.
File size: 38.9 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep or discard or greyListed folder, and its url written into a keep, discard
 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted else blacklisted
26 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just word count and number of MAX characters
29 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes, instead their
35 * format is:
36 * - precede URL by ^ to blacklist urls that match the given prefix
37 * - succeed URL by $ to blacklist urls that match the given suffix
38 * - ^url$ will blacklist urls that match the given url completely
39 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
40 *
41 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
42 * over greylisting and which takes precedence over blacklisting in turn. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
45 * also for nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. Then use a single instance of the WETProcessor class to process
49 * each single unzipped warc.wet file.
50 *
51 * To compile, including the jars in lib/ for compiling.
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, passing the log4j and other properties files in conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
101 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
102 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
103 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
104
105 /**
106 * Store url patterns as keys and values indicated whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map would store top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
    /**
     * Sets up a processor over the given commoncrawl input folder and output folder:
     * loads the word-count thresholds from config.properties (falling back to defaults),
     * creates the keep/discard/greylisted output subfolders, removes any pre-existing
     * keep/discard/greylisted URL listing files, loads the black/grey/white URL filter
     * lists from the conf folder, and parses the topsites CSV into topSitesMap.
     *
     * @param inFolder folder containing the commoncrawl subfolders of warc.wet(.gz) files
     * @param outFolder folder into which all output (sites, listings) will be written
     * @throws Exception if a pre-existing output URL listing file cannot be deleted,
     *         since appending to stale listings would corrupt the results
     */
    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
	this.commoncrawlDir = inFolder;
	this.outputFolder = outFolder;

	// load up the properties from the config file
	// (found on the classpath, since conf/ is passed via -cp when running)
	try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
	    configProperties = new Properties();
	    configProperties.load(infile);
	    //infile.close(); // not explicitly called in examples of try-with-resources

	} catch(Exception e) {
	    logger.error("Exception attempting to read properties from config.properties.", e);
	}

	if(configProperties.size() == 0) {
	    logger.warn("*** Warning: no values read into config properties. Using defaults.");
	}

	// word-count thresholds controlling whether a WET record has enough content to keep;
	// the second argument of getProperty is the default used when the key is absent
	MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
	MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
	MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


	// create the three output subfolders that WET records get filed into
	this.discardFolder = new File(outFolder, "discard");
	if(!discardFolder.exists()) {
	    discardFolder.mkdir();
	}
	this.keepFolder = new File(outFolder, "keep");
	if(!keepFolder.exists()) {
	    keepFolder.mkdir();
	}

	this.greyListedFolder = new File(outFolder, "greylisted");
	if(!greyListedFolder.exists()) {
	    greyListedFolder.mkdir();
	}

	// the URL listing files must start empty: delete any left over from a previous run,
	// and abort if deletion fails rather than mixing old and new results
	this.keepURLsFile = new File(outFolder, "keepURLs.txt");
	if(keepURLsFile.exists() && !keepURLsFile.delete()) {
	    throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
	}
	this.discardURLsFile = new File(outFolder, "discardURLs.txt");
	if(discardURLsFile.exists() && !discardURLsFile.delete()) {
	    throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
	}
	this.greyListedFile = new File(outFolder, "greyListed.txt");
	if(greyListedFile.exists() && !greyListedFile.delete()) {
	    throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
	}

	// prepare our blacklist, greylist (for inspection) and whitelist
	logger.info("Loading blacklist.");
	blackList = new HashMap<String, Integer>();
	initURLFilterList(blackList, "url-blacklist-filter.txt");

	logger.info("Loading greylist.");
	greyList = new HashMap<String, Integer>();
	initURLFilterList(greyList, "url-greylist-filter.txt");

	logger.info("Loading whitelist.");
	whiteList = new HashMap<String, Integer>();
	initURLFilterList(whiteList, "url-whitelist-filter.txt");

	// Create the map of topSites
	logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
	topSitesMap = new HashMap<String, String>();

	// Read in our csv file of topsites and what to do when one hits a match with a seedURL
	// and put these in our topSitesMap
	// https://commons.apache.org/proper/commons-csv/apidocs/index.html
	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
	// https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
	CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
	    .withCommentMarker('#')
	    .withSkipHeaderRecord()
	    .withIgnoreSurroundingSpaces();

	File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
	// CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
	try (
	     CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
	     ) {
	    for (CSVRecord csvRecord : parser) {
		// column 0 is the topsite domain; optional column 1 is the allowed url pattern
		// (empty string when absent, meaning "don't know how to crawl this topsite yet")
		String topsite = csvRecord.get(0);
		String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
		topSitesMap.put(topsite, allowed_url_pattern);

		//logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);

	    }
	} catch(Exception e) {
	    logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
	}

	//logger.debug("Prematurely terminating for testing purposes.");
	//System.exit(-1);
    }
238
239 /** Work out the 'domain' for a given url.
240 * This retains any www. or subdomain prefix.
241 */
242 public static String getDomainForURL(String url, boolean withProtocol) {
243 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
244 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
245 // the keep the URL around in case param withProtocol=true
246 String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
247
248 String domain = url.substring(startIndex);
249 int endIndex = domain.indexOf("/");
250 if(endIndex == -1) endIndex = domain.length();
251 domain = domain.substring(0, endIndex);
252
253 if(withProtocol) {
254 // now that we have the domain (everything to the first / when there is no protocol)
255 // can glue the protocol back on
256 domain = protocol + domain;
257 }
258
259 return domain;
260 }
261
262 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
263 private String escapeStringForRegex(String str) {
264 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
265 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
266 str = str.replace(Character.toString(c), "\\"+c);
267 }
268 return str;
269 }
270
    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     *
     * For each domain that survives the topsites check, a numbered sites/0000x/ subfolder
     * of the output folder is also created, containing that domain's own individual
     * seedURLs.txt and regex-urlfilter.txt.
     *
     * @param seedURLsFile global seed URL listing for nutch (one url per line)
     * @param urlFilterFile global regex-urlfilter.txt content for nutch
     * @param domainURLsFile informational listing of all unique domains (without protocol)
     * @param topSiteMatchesFile listing of urls whose domain matched a topsite for which
     *        no allowed url-pattern regex was specified; these need manual attention
     * @param possibleProductSitesFile listing of possibly auto-translated (product) sites
     *        flagged for later manual inspection
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
				    File domainURLsFile, File topSiteMatchesFile,
				    File possibleProductSitesFile) {
	// Maintain a Map of unique domains mapped to seed urls at that domain
	// TreeSet: by default, "the elements are ordered using their natural ordering"
	// (or by a Comparator provided at set creation time).
	// Whereas HashSet doesn't guarantee ordering.
	// So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
	// Would be a similar distinction for Maps.
	domainsToURLsMap = new TreeMap<String, Set<String>>();

	// regex-urlfilter.txt line prefixes: "+^https?://" accepts either protocol,
	// and FILTER_REGEX_PREFIX additionally accepts any chain of subdomains
	final String PROTOCOL_REGEX_PREFIX = "+^https?://";
	final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

	// keep an eye out on URLs we need to inspect later
	// (geoLiteCityDatFile feeds the currently commented-out country-code check below)
	Set<String> possibleProductDomains = new TreeSet<String>();
	File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());

	// PHASE 1: read keepURLs.txt and group its urls by domain (with protocol)
	try (
	     BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
	     BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
	     ) {

	    // read a URL at a time from urlsFile
	    String url = null;
	    String domainWithProtocol = null;
	    while((url = reader.readLine()) != null) { // readLine removes newline separator

		// work out domain. This retains any www. or subdomain prefix
		// passing true to further also retain the http(s) protocol
		domainWithProtocol = getDomainForURL(url, true);

		// add the url to its domain's set, creating the set on first sight of the domain
		Set<String> urlsSet;
		if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
		    urlsSet = new TreeSet<String>();
		    urlsSet.add(url);
		    domainsToURLsMap.put(domainWithProtocol, urlsSet);
		} else {
		    urlsSet = domainsToURLsMap.get(domainWithProtocol);
		    urlsSet.add(url);
		}

		// Dr Nichols said that a url that was located outside the country and
		// which had /mi/ URLs was more likely to be an autotranslated (product) site.
		// Following Dr Nichols' idea, let's keep a look out for more product sites:
		// if any URL contains /mi AND the tld of its domain is outside of New Zealand
		// then add that domain (if not already added) and that url into a file
		// for later manual inspection
		if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
		    /*
		    if(!possibleProductDomains.contains(domainWithProtocol)) {

			String countryCode = "";
			try {
			    // more expensive test, so do this only if above conditions are true:
			    countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
			    System.err.println("@@@@ Got country code: " + countryCode);
			} catch(Exception exceptObj) {
			    countryCode = ""; // forces domain to be included for inspection

			    logger.error("Could not check if domain " + domainWithProtocol
					 + " was in country: " + countryCode,
					 exceptObj);
			}

			boolean isInNZ = countryCode.toLowerCase().equals("nz");


			//if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
			if(!isInNZ) {
			    possibleProductDomains.add(domainWithProtocol);
			    // write both domain and a sample URL on that site out to file
			    possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
			    possibleProductSitesWriter.write("\t" + url + "\n");
			}
		    }*/ /*else {
			// already wrote out domain to file at some point, write just the URL out to file
			possibleProductSitesWriter.write("\t" + url + "\n");
		    }*/
		}
	    }
	} catch (IOException ioe) {
	    logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
	}

	// We'd have pruned out duplicates by now and have a sorted list of domains,
	// each of which maps to seed URLs in the commoncrawl for that domain

	// PHASE 2: walk the sorted domains, writing the global files and a numbered
	// sites/0000x/ folder (with its own seedURLs.txt and regex-urlfilter.txt) per domain
	int domainCount = 0;
	File sitesFolder = new File(outputFolder, "sites");
	if(!sitesFolder.exists()) {
	    sitesFolder.mkdir();
	}
	final String FORMATSTR = "%05d"; // zero-padded site folder names: 00001, 00002, ...

	// write out each domain followed in sequence by all urls we found in that domain
	// (urls with tab up front)
	try (
	     // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
	     // Also a global file listing any urls that matched top sites that didn't specify
	     // allowed regex patterns
	     BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
	     BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
	     BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
	     BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
	     ) {

	    // initialise topSiteMatchesFile with some instructional text.
	    topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
	    topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
	    topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");

	    //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
	    Set<String> domainsSet = domainsToURLsMap.keySet();
	    Iterator<String> domainIterator = domainsSet.iterator();

	    /*
	    // DEBUG
	    String value = topSitesMap.get("wikipedia.org");
	    if(value == null) {
		logger.debug("### wikipedia.org had null value");
	    } else {
		logger.debug("### wikipedia.org had value: " + value);
	    } // DEBUG
	    */

	    while(domainIterator.hasNext()) {
		String domainWithProtocol = domainIterator.next();
		// Also get domain without protocol prefix
		int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
		startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
		String domain = domainWithProtocol.substring(startIndex);

		/*if(domain.contains("docs.google.com")) {
		    logger.debug("domain with protocol: " + domainWithProtocol);
		    logger.debug("domain: " + domain);
		}*/

		String allowedURLPatternRegex = isURLinTopSitesMap(domain);
		// If the domain is of a topsite for which no allowed URL pattern has been provided
		// in sites-too-big-to-exhaustively-crawl.txt,
		// then we don't know how to crawl the site. Warn the user by writing the affected
		// domain and seedURLs to the topSiteMatchesFile.
		if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {

		    // topsite, but we don't (yet) know what portion can be crawled
		    // Append the top site and url to a global/toplevel file that
		    // the user needs to check later and we're done with this domain as it
		    // won't go into any other file hereafter

		    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
		    for(String url : urlsForDomainSet) {
			topSiteMatchesWriter.write("\t" + url + "\n");
		    }

		    continue; // done with this domain
		}

		// start counting the domains we're actually going to process
		domainCount++;

		String siteID = String.format(FORMATSTR, domainCount);
		File domainFolder = new File(sitesFolder, siteID);
		domainFolder.mkdir();

		// write out the domain
		//seedURLsWriter.write(domainWithProtocol + "\n");


		// for every domain, we need a sites/0000x/ folder, where x is domain#, containing
		// its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
		// We still have a global seedURLs.txt and regex-urlfilter.txt too.
		File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
		File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
		try (
		     BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
		     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
		     ) {

		    // write all sorted unique domains into global domains file
		    // Using the domain without protocol since the global domains file is for
		    // informational purposes
		    domainURLsWriter.write(domain + "\n");

		    // Only write urls and no domain into single global seedurls file
		    // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
		    // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
		    // If we ever run nutch on a single seedURLs listing containing
		    // all seed pages to crawl sites from, the above two files will work for that.

		    // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
		    // also write into the global seeds file (with a tab prefixed to each?)
		    Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
		    for(String url : urlsForDomainSet) {
			seedURLsWriter.write(url + "\n"); // global seedURLs file
			siteURLsWriter.write(url + "\n");
		    }

		    if(allowedURLPatternRegex == null) { // entire site can be crawled
			siteURLsWriter.write(domainWithProtocol + "\n");

			// Write out filter in the following form for a site, e.g. for nutch.apache.org:
			// nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
			String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
			//String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
			urlFilterWriter.write(regexed_domain + "\n"); //global file
			siteRegexWriter.write(regexed_domain + "\n"); // site file
		    }
		    else { // domain belongs to a top site where only portion of site can be crawled

			if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
			    siteURLsWriter.write(domainWithProtocol + "\n");
			    // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
			    // and not for all of blogspot.com

			    String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
			    //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
			    urlFilterWriter.write(regexed_domain + "\n");
			    siteRegexWriter.write(regexed_domain + "\n");

			} else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
			    // don't write out domain. We want individual pages
			    //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");

			    // don't write out domain as a regex expression url filter either,
			    // write out the individual seed urls for the domain instead
			    // since we will only be downloading the single page

			    urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
			    for(String urlInDomain : urlsForDomainSet) {
				// don't append slash to end this time
				String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
				//String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
				urlFilterWriter.write(regexed_url + "\n");
				siteRegexWriter.write(regexed_url + "\n");
			    }
			} else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {

			    // DON'T write out domain into siteURLs file,
			    // BUT DO write it into urlFilter file
			    String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";

			    urlFilterWriter.write(regexed_domain + "\n");
			    siteRegexWriter.write(regexed_domain + "\n");
			} else { // allowedURLPatternRegex is a url-form - convert to regex
			    if(!allowedURLPatternRegex.endsWith("/")) {
				allowedURLPatternRegex += "/";
			    }
			    String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
			    //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
			    siteURLsWriter.write(domainWithProtocol + "\n");
			    urlFilterWriter.write(regexed_pattern + "\n");
			    siteRegexWriter.write(regexed_pattern + "\n");

			}
		    }

		} catch (IOException ioe) {
		    logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
		}

	    }

	} catch (IOException ioe) {
	    logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
			 + "\n\t" + urlFilterFile
			 + "\n\t" + domainURLsFile
			 + "\n\t" + topSiteMatchesFile, ioe);
	}

	/*
	// BEGIN DEBUG
	logger.debug("@@@@ TopSitesMap contains: ");
	for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
	    String topSite = entry.getKey();
	    String urlPattern = entry.getValue();
	    logger.debug(topSite + " - " + urlPattern);
	} // END DEBUG
	*/
    }
556
557 private String stripSubDomain(String url) {
558 int index = url.indexOf(".");
559 if(index != -1) {
560 url = url.substring(index+1);
561 }
562 return url;
563 }
564
565
566 /**
567 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
568 * with or without www. prefix. This method tests for such as case as it would be dangerous
569 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
570 */
571 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
572 // check for an exact match as-is
573 if(seedURLDomain.equals(domain)) {
574 return true;
575 }
576
577 // else check if with or without a www. prefix we have an exact match with domain
578 if(seedURLDomain.startsWith("www.")) {
579 if(seedURLDomain.substring(4).equals(domain)) {
580 return true;
581 }
582 } else {
583 if(domain.equals("www."+seedURLDomain)) {
584 return true;
585 }
586 }
587
588 return false;
589 }
590
591
592 /**
593 * Check if the domain of the seedurl, either in its entirety or when stripped of
594 * www/subdomains, is in the list of top sites.
595 * If it is, and the given url matches the regex for that topsite, then add the url to the
596 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
597 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
598 * @return one of the following values:
599 * - This function returns null if the seedURL's domain does not match any of the topsites.
600 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
601 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
602 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
603 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
604 * - A non-emptry String is returned if the seedURL's domain matched a topsite and a value
605 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
606 * URL pattern.
607 */
608 private String isURLinTopSitesMap(String fullSeedDomain) {
609 boolean keepLooping = true;
610
611 String domain = fullSeedDomain;
612
613 // domain aprameter will have retained www or subdomains, but is stripped of protocol
614
615 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
616 // if it does, return the value for that topsite domain in the topSitesMap
617 // If no match at all, return null.
618 do {
619
620 String allowed_url_pattern = topSitesMap.get(domain);
621 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
622 // there's an entry for the URL in the topSitesMap
623 logger.debug("##### A top site matches URL domain " + domain);
624
625 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
626 // www prefix, should not exactly match the topSitesMap domain
627 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
628 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
629
630 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
631 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
632 }
633 return allowed_url_pattern;
634 }
635 // else, no entry for the URL in the topSitesMap
636 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
637
638 String newDomain = stripSubDomain(domain);
639 if(domain.equals(newDomain)) {
640 keepLooping = false;
641 } else {
642 domain = newDomain;
643 }
644 } while(keepLooping);
645
646 // url in entirety or stripped of subdomains did not match any of the topsites
647 return null;
648 }
649
650 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
651 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
652 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
653 //while(i.hasNext()) {
654 // Map.Entry<String, Integer> entry = i.next();
655 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
656 String urlPattern = entry.getKey();
657 Integer matchRule = entry.getValue();
658
659 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
660 return true;
661 }
662 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
663 return true;
664 }
665 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
666 return true;
667 }
668 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
669 return true;
670 }
671 // else check the rest of the filter list against this url
672 // before returning false to be certain it's not been listed in the filter list
673 }
674
675 return false;
676 }
677
678 /**
679 * Returns true if the url or pattern is found in the blacklist file.
680 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
681 * it won't get blacklisted after all. But that's not implemented here.
682 */
683 public boolean isBlacklisted(String url) {
684 boolean isBlackListed = isListedInFilterList(blackList, url);
685
686 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
687 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
688 String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
689 if(!isBlackListed && url.contains("jasmin")) {
690 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
691 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
692 }
693 return isBlackListed;
694 }
695
696 /**
697 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
698 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
699 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
700 */
701 public boolean isGreylisted(String url) {
702 // auto-translated product sites
703 return isListedInFilterList(greyList, url);
704 }
705
706 /**
707 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
708 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
709 */
710 public boolean isWhitelisted(String url) {
711 return isListedInFilterList(whiteList, url);
712 }
713
714 /**
715 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
716 * whether it is in the mentioned black|grey|white list.
717 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
718 * By not having this method deal with actual regex for filters, this has the advantage that
719 * we don't have to remember to escape or double escape each filter to turn it into a regex.
720 */
721 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
722
723 // if filterListFilename does not exist in the conf folder, just return
724 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
725 logger.warn("Filter list filename: " + filterListFilename + " does not exist");
726 return;
727 }
728
729 try (
730 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
731 ) {
732 String filter = null;
733 while((filter = reader.readLine()) != null) {
734 // skip comments and empty lines
735 filter = filter.trim();
736 if(filter.equals("") || filter.startsWith("#")) {
737 continue;
738 }
739
740 if(filter.startsWith("^") && filter.endsWith("$")) {
741 filter = filter.substring(1, filter.length()-1);
742 list.put(filter, LIST_ENTRY_MATCHES);
743 }
744 else if(filter.startsWith("^")) {
745 filter = filter.substring(1);
746 list.put(filter, LIST_ENTRY_STARTSWITH);
747 //logger.debug("Match filter startswith: " + filter);
748 }
749 else if(filter.endsWith("$")) {
750 filter = filter.substring(0, filter.length()-1);
751 list.put(filter, LIST_ENTRY_ENDSWITH);
752 //logger.debug("@@@ Match filter endswith: " + filter);
753 }
754 else {
755 list.put(filter, LIST_ENTRY_CONTAINS);
756 }
757 //logger.debug("Got filter: " + filter);
758 }
759
760 } catch (IOException ioe) {
761 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
762 }
763
764 }
765
766 /** Maintain a count of all WET files processed. */
767 public void setWETFileCount(int count) { this.wetFileCount = count; }
768
769 /** Maintain a count of all WET records processed. */
770 //public int getRecordCount() { return this.totalRecordCount; }
771 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
772 public void setRecordCount(int count) { this.totalRecordCount = count; }
773
774 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
775
776 // Will list all the warc.wet files in the input directory or else their gzipped versions
777 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
778
779 int wetRecordCount = 0;
780 int wetFileCount = 0;
781
782 for(int i = 0; i < WETFiles.length; i++) {
783 File WETFile = WETFiles[i];
784 logger.debug("Processing WETfile: " + WETFile);
785
786 // Any .gz files listed means they haven't been unzipped yet. So unzip.
787 String WETFilename = WETFile.toString();
788 if(WETFilename.endsWith(".gz")) {
789 File GZippedWETFile = WETFile;
790 String WETGZippedFilename = WETFilename;
791 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
792
793 WETFile = new File(WETFilename);
794 Utility.unzipFile(GZippedWETFile, WETFile);
795 }
796 // hereafter all WETFiles should refer to the unzipped version
797 // Check the unzipped WETFile exists
798
799 if(!WETFile.exists() || !WETFile.isFile()) {
800 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
801 return;
802 }
803
804 // Finally, we can process this WETFile's records into the keep and discard pile
805 wetFileCount++;
806 logger.debug("Off to process " + WETFile);
807 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
808 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
809 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
810 wetFileProcessor.processWETFile();
811 wetRecordCount += wetFileProcessor.getRecordCount();
812 }
813
814 // for information purposes
815 this.setWETFileCount(wetFileCount);
816 this.setRecordCount(wetRecordCount);
817 }
818
819
820 // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
821 public static void printUsage() {
822 System.err.println("Run this program as:");
823 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
824 }
825
826 /** Filename filter to only list warc.wet files or else warc.wet.gz files
827 * for which unzipped warc.wet equivalents don't yet exist.
828 */
829 private static class WETFilenameFilter implements FilenameFilter {
830
831 public boolean accept(File dir, String name) {
832 if(name.endsWith(".warc.wet")) {
833 logger.debug("Will include " + name + " for processing.");
834 return true;
835 }
836
837 if(name.endsWith(".warc.wet.gz")) {
838 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
839 File unzippedVersion = new File(dir, nameWithoutGZext);
840 if(unzippedVersion.exists()) {
841 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
842 logger.debug("Skipping " + name);
843 return false; // don't count gzipped version if unzipped version exists.
844 }
845 else {
846 logger.debug("Only zipped version " + name + " exists.");
847 return true; // No unzipped version, so have to work with gzipped version
848 }
849 }
850
851 // we're not even interested in any other file extensions
852 logger.debug("Not a WET file. Skipping " + name);
853 return false;
854 }
855 }
856
857
858 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
859
860 public boolean accept(File dir, String name) {
861 File f = new File (dir, name);
862 if(f.isDirectory()) {
863 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
864 return true;
865 }
866 }
867 else {
868 logger.info("File " + f + " is not a directory");
869 }
870 return false;
871 }
872 }
873
874 public static void main(String[] args) {
875 if(args.length != 2) {
876 printUsage();
877 return;
878 }
879
880 File commoncrawlDir = new File(args[0]);
881 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
882 logger.error("Error: " + args[0] + " does not exist or is not a directory");
883 return;
884 }
885
886 File outFolder = new File(args[1]);
887 if(!outFolder.exists() || !outFolder.isDirectory()) {
888 logger.error("Error: " + args[1] + " does not exist or is not a directory.");
889 return;
890 }
891
892 try {
893 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
894
895 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
896
897 for(int i = 0; i < ccrawlFolders.length; i++) {
898 File ccrawlFolder = ccrawlFolders[i];
899 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
900 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
901 }
902
903 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
904 // The former is the only unique one. seedURLs and regex-urlfilters are
905 // repeated on a per site/domain basis too, stored in the sites folder
906 File seedURLsFile = new File(outFolder, "seedURLs.txt");
907 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
908 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
909 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
910 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
911
912 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
913
914 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
915
916 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
917
918
919 } catch(Exception e) {
920 // can get an exception when instantiating CCWETProcessor instance
921 logger.error(e.getMessage(), e);
922 }
923
924 return;
925
926 }
927}
// Note: See TracBrowser for help on using the repository browser.