source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33623

Last change on this file since 33623 was 33623, checked in by ak19, 4 years ago
  1. Incorporated Dr Nichols' earlier suggestion of storing page modified time and char-encoding metadata if present in the crawl dump output. Have done so, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as they're from 2019 and set around the period we've been crawling. 2. Moved the getDomainFromURL() function from CCWETProcessor.java to Utility.java since it's being reused. 3. The MongoDBAccess class successfully connects (at least, no exceptions) and uses the newly added properties in config.properties to make the connection.
File size: 38.1 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.nio.charset.StandardCharsets;
6import java.util.Properties;
7import java.util.zip.GZIPInputStream;
8import java.util.Iterator;
9import java.util.HashMap;
10import java.util.Map;
11import java.util.Set;
12import java.util.TreeMap;
13import java.util.TreeSet;
14
15import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
16import org.apache.log4j.Logger;
17
18
19/**
20 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
21 * the WET records in each, putting each WET record into a file. Each file is put into a
22 * keep, discard or greyListed folder, and its url is written into a keep, discard
23 * or greylisted text file, based on
24 *
25 * 1. whether it's whitelisted, else greylisted, else blacklisted
26 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
27 * enough content. Formerly, content-length and number of lines were used to determine if
28 * the content was sufficient. Now it's just the word count, with a MAXIMUM number of characters
29 * (not a minimum) determining whether a string counts as a word. These settings can be adjusted
30 * in conf/config.properties.
31 *
32 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
33 * into the conf folder to control any url patterns that are explicitly included or excluded or
34 * set aside for inspecting later. These filter text files don't use regexes; instead their
35 * format is as follows (see the example below):
36 * - precede a URL with ^ to blacklist urls that match the given prefix
37 * - follow a URL with $ to blacklist urls that match the given suffix
38 * - ^url$ will blacklist urls that match the given url completely
39 * - without either the ^ or $ symbol, urls containing the given url will get blacklisted
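 *
 * For example, a url-blacklist-filter.txt might contain hypothetical entries such as:
 *     ^https://example.com/ads            (blacklists urls starting with this prefix)
 *     .apk$                               (blacklists urls ending with this suffix)
 *     ^https://example.com/spam.html$     (blacklists exactly this url)
 *     casino                              (blacklists any url containing this string)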
40 *
41 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
42 * over greylisting, which in turn takes precedence over blacklisting. However, even
43 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
44 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
45 * which is also used by nutch.
46 *
47 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
48 * in the given input folder. It then uses a separate instance of the WETProcessor class to
49 * process each individual unzipped warc.wet file.
50 *
51 * To compile, include the jars in lib/ on the classpath:
52 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
53 *
54 * To run, pass in the log4j and other properties files from the conf/ folder:
55 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing commoncrawls subfolders containing warc.wet(.gz) files> <outputFolder>
56 *
57 * e.g. (from maori-lang-detection/src)
58 *
59 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl
60 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../ccrawl-data /Scratch/ak19/gs3-extensions/maori-lang-detection/to_crawl 2>&1 | less
61 *
62*/
63
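// A minimal sketch of how this class is driven programmatically (paths are hypothetical and the
// constructor throws Exception, so real code wraps these calls in try/catch, as main() below does):
//
//   CCWETProcessor processor = new CCWETProcessor(new File("../ccrawl-data"), new File("../to_crawl"));
//   processor.processAllWETFilesOfCrawl(new File("../ccrawl-data/CC-MAIN-2019-26-wet-files"));
//   processor.createSeedURLsFiles(...); // see main() below for the full set of output File arguments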
64public class CCWETProcessor {
65 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
66
67 // Properties shared across WETProcessor instances
68 public final int MAX_WORD_LENGTH;
69 public final int MIN_NUM_WORDS;
70 public final int MAX_WORDS_CAMELCASE;
71
72 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file
73 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY";
74 public final String SINGLEPAGE = "SINGLEPAGE";
75 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE";
76
77 /**
78 * Characters that need escaping if used as a string literal in a regex
79 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
80 * https://www.regular-expressions.info/refcharacters.html
81 * Put the \\ (escape char) at start so we don't double-escape chars already escaped,
82 * as would happen for any chars appearing earlier in this list than \\
83 */
84 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|";
85 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"];
86
87 private Properties configProperties = new Properties();
88
89 // File paths shared across WETProcessor instances
90 public final File commoncrawlDir;
91 public final File outputFolder;
92 public final File discardFolder;
93 public final File keepFolder;
94 public final File greyListedFolder;
95 public final File keepURLsFile;
96 public final File discardURLsFile;
97 public final File greyListedFile;
98
99 /** Possible values stored in the blackList/whiteList/greyList Maps */
100 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
101 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
102 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
103 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
104
105 /**
106 * Store url patterns as keys, with values indicating whether a url should
107 * match it exactly, start/end with it, or contain it
108 */
109 private HashMap<String, Integer> blackList;
110 private HashMap<String, Integer> greyList;
111 private HashMap<String, Integer> whiteList;
112
113 /** map of topsites with allowable regexes: sites too big to exhaustively crawl
114 * with optional regex defining allowed exceptions, like subdomains or url suffixes
115 * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
116 * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
117 * would be relevant.
118 * The map stores each top site domain suffix and an optional regex string for allowable
119 * url patterns.
120 */
121 private HashMap<String, String> topSitesMap;
122
123 /** Map of domains we keep and the full urls we're keeping that are of that domain.
124 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
125 * while a HashMap has no notion of ordering, because we just need to store urls with
126 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes
127 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the
128 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html
129 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
130 */
131 private Map<String, Set<String>> domainsToURLsMap;
132
133 // Keep a count of all the records that all WETProcessors instantiated
134 // by our main method combined have processed
135 private int totalRecordCount = 0;
136
137 private int wetFileCount = 0;
138
139 private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
140
141 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
142 this.commoncrawlDir = inFolder;
143 this.outputFolder = outFolder;
144
145 // load up the properties from the config file
146 try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
147 configProperties = new Properties();
148 configProperties.load(infile);
149 //infile.close(); // not explicitly called in examples of try-with-resources
150
151 } catch(Exception e) {
152 logger.error("Exception attempting to read properties from config.properties.", e);
153 }
154
155 if(configProperties.size() == 0) {
156 logger.warn("*** Warning: no values read into config properties. Using defaults.");
157 }
158
159 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
160 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
161 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
162
163
164 this.discardFolder = new File(outFolder, "discard");
165 if(!discardFolder.exists()) {
166 discardFolder.mkdir();
167 }
168 this.keepFolder = new File(outFolder, "keep");
169 if(!keepFolder.exists()) {
170 keepFolder.mkdir();
171 }
172
173 this.greyListedFolder = new File(outFolder, "greylisted");
174 if(!greyListedFolder.exists()) {
175 greyListedFolder.mkdir();
176 }
177
178 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
179 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
180 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
181 }
182 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
183 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
184 throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
185 }
186 this.greyListedFile = new File(outFolder, "greyListed.txt");
187 if(greyListedFile.exists() && !greyListedFile.delete()) {
188 throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
189 }
190
191 // prepare our blacklist, greylist (for inspection) and whitelist
192 logger.info("Loading blacklist.");
193 blackList = new HashMap<String, Integer>();
194 initURLFilterList(blackList, "url-blacklist-filter.txt");
195
196 logger.info("Loading greylist.");
197 greyList = new HashMap<String, Integer>();
198 initURLFilterList(greyList, "url-greylist-filter.txt");
199
200 logger.info("Loading whitelist.");
201 whiteList = new HashMap<String, Integer>();
202 initURLFilterList(whiteList, "url-whitelist-filter.txt");
203
204 // Create the map of topSites
205 logger.info("Loading map of topsites with regex of allowable url patterns for each topsite.");
206 topSitesMap = new HashMap<String, String>();
207
208 // Read in our csv file of topsites and what to do when one hits a match with a seedURL
209 // and put these in our topSitesMap
210 // https://commons.apache.org/proper/commons-csv/apidocs/index.html
211 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
212 //https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
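	// A record in that file pairs a topsite domain with an optional value. Hypothetical examples:
	// an entry for blogspot.com with the value SUBDOMAIN-COPY, or an entry for a very large site
	// with an allowed url pattern (or SINGLEPAGE / FOLLOW-LINKS-WITHIN-TOPSITE) as its value.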
213 CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
214 .withCommentMarker('#')
215 .withSkipHeaderRecord()
216 .withIgnoreSurroundingSpaces();
217
218 File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
219 // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
220 try (
221 CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
222 ) {
223 for (CSVRecord csvRecord : parser) {
224 String topsite = csvRecord.get(0);
225 String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
226 topSitesMap.put(topsite, allowed_url_pattern);
227
228 //logger.debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
229
230 }
231 } catch(Exception e) {
232 logger.error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
233 }
234
235 //logger.debug("Prematurely terminating for testing purposes.");
236 //System.exit(-1);
237 }
238
239
240 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
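	// e.g. a hypothetical input of "nutch.apache.org" would come back as "nutch\.apache\.org"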
241 private String escapeStringForRegex(String str) {
242 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) {
243 char c = ESCAPE_CHARS_FOR_RE.charAt(i);
244 str = str.replace(Character.toString(c), "\\"+c);
245 }
246 return str;
247 }
248
249 /**
250 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
251 * as output the URL seed list and regex-urlfilter text files required by nutch, see
252 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
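 * For instance, a kept domain would typically yield a regex-urlfilter line of the form
 * +^https?://([a-z0-9-]+\.)*example\.org/ (example.org being a hypothetical domain).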
253 */
254 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
255 File domainURLsFile, File topSiteMatchesFile,
256 File possibleProductSitesFile) {
257 // Maintain a Map of unique domains mapped to seed urls at that domain
258 // TreeSet: by default, "the elements are ordered using their natural ordering"
259 // (or by a Comparator provided at set creation time).
260 // Whereas HashSet doesn't guarantee ordering.
261 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
262 // Would be a similar distinction for Maps.
263 domainsToURLsMap = new TreeMap<String, Set<String>>();
264
265 final String PROTOCOL_REGEX_PREFIX = "+^https?://";
266 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
267
268 // keep an eye out on URLs we need to inspect later
269 Set<String> possibleProductDomains = new TreeSet<String>();
270 File geoLiteCityDatFile = new File(MY_CLASSLOADER.getResource("GeoLiteCity.dat").getFile());
271
272 try (
273 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
274 BufferedWriter possibleProductSitesWriter = new BufferedWriter(new FileWriter(possibleProductSitesFile));
275 ) {
276
277 // read a URL at a time from urlsFile
278 String url = null;
279 String domainWithProtocol = null;
280 while((url = reader.readLine()) != null) { // readLine removes newline separator
281
282 // work out domain. This retains any www. or subdomain prefix
283 // passing true to also retain the http(s) protocol
284 domainWithProtocol = Utility.getDomainForURL(url, true);
285
286 Set<String> urlsSet;
287 if(!domainsToURLsMap.containsKey(domainWithProtocol)) {
288 urlsSet = new TreeSet<String>();
289 urlsSet.add(url);
290 domainsToURLsMap.put(domainWithProtocol, urlsSet);
291 } else {
292 urlsSet = domainsToURLsMap.get(domainWithProtocol);
293 urlsSet.add(url);
294 }
295
296 /*
297 // Dr Nichols said that a url that was located outside the country and
298 // which had /mi/ URLs was more likely to be an autotranslated (product) site.
299 // Following Dr Nichols' idea, let's keep a look out for more product sites:
300 // if any URL contains /mi AND the tld of its domain is outside of New Zealand
301 // then add that domain (if not already added) and that url into a file
302 // for later manual inspection
303 if(!domainWithProtocol.endsWith(".nz")
304 && (url.contains("/mi/") || url.endsWith("/mi"))) {
305
306 if(!possibleProductDomains.contains(domainWithProtocol)) {
307
308 String countryCode = "";
309 try {
310 // more expensive test, so do this only if above conditions are true:
311 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
312 System.err.println("@@@@ Got country code: " + countryCode);
313 } catch(Exception exceptObj) {
314 countryCode = ""; // forces domain to be included for inspection
315
316 logger.error("Could not check if domain " + domainWithProtocol
317 + " was in country: " + countryCode,
318 exceptObj);
319 }
320
321 boolean isInNZ = countryCode.toLowerCase().equals("nz");
322
323
324 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
325 if(!isInNZ) {
326 possibleProductDomains.add(domainWithProtocol);
327 // write both domain and a sample seedURL on that site out to file
328 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
329 possibleProductSitesWriter.write("\t" + url + "\n");
330 }
331 }
332 //else {
333 // already wrote out domain to file at some point, write just the URL out to file
334 //possibleProductSitesWriter.write("\t" + url + "\n");
335 //}
336 }
337 */
338 }
339 } catch (IOException ioe) {
340 logger.error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
341 }
342
343 // We'd have pruned out duplicates by now and have a sorted list of domains,
344 // each of which maps to seed URLs in the commoncrawl for that domain
345
346 int domainCount = 0;
347 File sitesFolder = new File(outputFolder, "sites");
348 if(!sitesFolder.exists()) {
349 sitesFolder.mkdir();
350 }
351 final String FORMATSTR = "%05d";
352
353 // write out each domain followed in sequence by all urls we found in that domain
354 // (urls with tab up front)
355 try (
356 // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
357 // Also a global file listing any urls that matched top sites that didn't specify
358 // allowed regex patterns
359 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
360 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
361 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
362 BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile));
363 ) {
364
365 // initialise topSiteMatchesFile with some instructional text.
366 topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
367 topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
368 topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
369
370 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
371 Set<String> domainsSet = domainsToURLsMap.keySet();
372 Iterator<String> domainIterator = domainsSet.iterator();
373
374 /*
375 // DEBUG
376 String value = topSitesMap.get("wikipedia.org");
377 if(value == null) {
378 logger.debug("### wikipedia.org had null value");
379 } else {
380 logger.debug("### wikipedia.org had value: " + value);
381 } // DEBUG
382 */
383
384 while(domainIterator.hasNext()) {
385 String domainWithProtocol = domainIterator.next();
386 // Also get domain without protocol prefix
387 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix
388 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
389 String domain = domainWithProtocol.substring(startIndex);
390
391 /*if(domain.contains("docs.google.com")) {
392 logger.debug("domain with protocol: " + domainWithProtocol);
393 logger.debug("domain: " + domain);
394 }*/
395
396 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
397 // If the domain is of a topsite for which no allowed URL pattern has been provided
398 // in sites-too-big-to-exhaustively-crawl.txt,
399 // then we don't know how to crawl the site. Warn the user by writing the affected
400 // domain and seedURLs to the topSiteMatchesFile.
401 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
402
403 // topsite, but we don't (yet) know what portion can be crawled
404 // Append the top site and url to a global/toplevel file that
405 // the user needs to check later and we're done with this domain as it
406 // won't go into any other file hereafter
407
408 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
409 for(String url : urlsForDomainSet) {
410 topSiteMatchesWriter.write("\t" + url + "\n");
411 }
412
413 continue; // done with this domain
414 }
415
416 // start counting the domains we're actually going to process
417 domainCount++;
418
419 String siteID = String.format(FORMATSTR, domainCount);
420 File domainFolder = new File(sitesFolder, siteID);
421 domainFolder.mkdir();
422
423 // write out the domain
424 //seedURLsWriter.write(domainWithProtocol + "\n");
425
426
427 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
428 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
429 // We still have a global seedURLs.txt and regex-urlfilter.txt too.
430 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
431 File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
432 try (
433 BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
434 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
435 ) {
436
437 // write all sorted unique domains into global domains file
438 // Using the domain without protocol since the global domains file is for
439 // informational purposes
440 domainURLsWriter.write(domain + "\n");
441
442 // Only write urls and no domain into single global seedurls file
443 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
444 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
445 // If we ever run nutch on a single seedURLs listing containing
446 // all seed pages to crawl sites from, the above two files will work for that.
447
448 // first write out the urls for the domain into the sites/0000x/seedURLs.txt file
449 // also write into the global seeds file (with a tab prefixed to each?)
450 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
451 for(String url : urlsForDomainSet) {
452 seedURLsWriter.write(url + "\n"); // global seedURLs file
453 siteURLsWriter.write(url + "\n");
454 }
455
456 if(allowedURLPatternRegex == null) { // entire site can be crawled
457 siteURLsWriter.write(domainWithProtocol + "\n");
458
459 // Write out filter in the following form for a site, e.g. for nutch.apache.org:
460 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
461 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
462 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
463 urlFilterWriter.write(regexed_domain + "\n"); //global file
464 siteRegexWriter.write(regexed_domain + "\n"); // site file
465 }
466 else { // domain belongs to a top site where only portion of site can be crawled
467
468 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter
469 siteURLsWriter.write(domainWithProtocol + "\n");
470 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
471 // and not for all of blogspot.com
472
473 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/";
474 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/";
475 urlFilterWriter.write(regexed_domain + "\n");
476 siteRegexWriter.write(regexed_domain + "\n");
477
478 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) {
479 // don't write out domain. We want individual pages
480 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n");
481
482 // don't write out domain as a regex expression url filter either,
483 // write out the individual seed urls for the domain instead
484 // since we will only be downloading the single page
485
486 urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol);
487 for(String urlInDomain : urlsForDomainSet) {
488 // don't append slash to end this time
489 String regexed_url = "+^"+escapeStringForRegex(urlInDomain);
490 //String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
491 urlFilterWriter.write(regexed_url + "\n");
492 siteRegexWriter.write(regexed_url + "\n");
493 }
494 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) {
495
496 // DON'T write out domain into siteURLs file,
497 // BUT DO write it into urlFilter file
498 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/";
499
500 urlFilterWriter.write(regexed_domain + "\n");
501 siteRegexWriter.write(regexed_domain + "\n");
502 } else { // allowedURLPatternRegex is a url-form - convert to regex
503 if(!allowedURLPatternRegex.endsWith("/")) {
504 allowedURLPatternRegex += "/";
505 }
506 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex);
507 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\.");
508 siteURLsWriter.write(domainWithProtocol + "\n");
509 urlFilterWriter.write(regexed_pattern + "\n");
510 siteRegexWriter.write(regexed_pattern + "\n");
511
512 }
513 }
514
515 } catch (IOException ioe) {
516 logger.error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
517 }
518
519 }
520
521 } catch (IOException ioe) {
522 logger.error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
523 + "\n\t" + urlFilterFile
524 + "\n\t" + domainURLsFile
525 + "\n\t" + topSiteMatchesFile, ioe);
526 }
527
528 /*
529 // BEGIN DEBUG
530 logger.debug("@@@@ TopSitesMap contains: ");
531 for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
532 String topSite = entry.getKey();
533 String urlPattern = entry.getValue();
534 logger.debug(topSite + " - " + urlPattern);
535 } // END DEBUG
536 */
537 }
538
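	/** Strips one leading subdomain component, e.g. (hypothetically) "mi.wikipedia.org" becomes "wikipedia.org". */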
539 private String stripSubDomain(String url) {
540 int index = url.indexOf(".");
541 if(index != -1) {
542 url = url.substring(index+1);
543 }
544 return url;
545 }
546
547
548 /**
549 * @return true when a seedURL's domain exactly matches a topsite such as blogspot.com,
550 * with or without a www. prefix. This method tests for such a case, as it would be dangerous
551 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain.
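 * e.g. (hypothetical) isExactDomainMatch("www.blogspot.com", "blogspot.com") returns true.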
552 */
553 private boolean isExactDomainMatch(String seedURLDomain, String domain) {
554 // check for an exact match as-is
555 if(seedURLDomain.equals(domain)) {
556 return true;
557 }
558
559 // else check if with or without a www. prefix we have an exact match with domain
560 if(seedURLDomain.startsWith("www.")) {
561 if(seedURLDomain.substring(4).equals(domain)) {
562 return true;
563 }
564 } else {
565 if(domain.equals("www."+seedURLDomain)) {
566 return true;
567 }
568 }
569
570 return false;
571 }
572
573
574 /**
575 * Check if the domain of the seedurl, either in its entirety or when stripped of
576 * www/subdomains, is in the list of top sites.
577 * If it is, and the given url matches the regex for that topsite, then add the url to the
578 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
579 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix.
580 * @return one of the following values:
581 * - This function returns null if the seedURL's domain does not match any of the topsites.
582 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed-
583 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's
584 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't
585 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY).
586 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value
587 * was defined for it. (The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed
588 * URL pattern.)
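 * For example (hypothetical): if topSitesMap has an entry for wikipedia.org with no value, then a
 * fullSeedDomain of mi.wikipedia.org is stripped down to wikipedia.org, matches that entry, and
 * the empty String is returned.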
589 */
590 private String isURLinTopSitesMap(String fullSeedDomain) {
591 boolean keepLooping = true;
592
593 String domain = fullSeedDomain;
594
595 // the domain parameter will have retained www or subdomains, but is stripped of the protocol
596
597 // keep looping, stripping subdomains from url and checking if it matches a topsite domain
598 // if it does, return the value for that topsite domain in the topSitesMap
599 // If no match at all, return null.
600 do {
601
602 String allowed_url_pattern = topSitesMap.get(domain);
603 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
604 // there's an entry for the URL in the topSitesMap
605 logger.debug("##### A top site matches URL domain " + domain);
606
607 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
608 // www prefix, should not exactly match the topSitesMap domain
609 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com
610 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY.
611
612 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) {
613 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file
614 }
615 return allowed_url_pattern;
616 }
617 // else, no entry for the URL in the topSitesMap
618 // We're not done yet: strip subDomain from URL and check it against topSitesMap again
619
620 String newDomain = stripSubDomain(domain);
621 if(domain.equals(newDomain)) {
622 keepLooping = false;
623 } else {
624 domain = newDomain;
625 }
626 } while(keepLooping);
627
628 // url in entirety or stripped of subdomains did not match any of the topsites
629 return null;
630 }
631
632 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
633 //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
634 //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
635 //while(i.hasNext()) {
636 // Map.Entry<String, Integer> entry = i.next();
637 for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
638 String urlPattern = entry.getKey();
639 Integer matchRule = entry.getValue();
640
641 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
642 return true;
643 }
644 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
645 return true;
646 }
647 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
648 return true;
649 }
650 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
651 return true;
652 }
653 // else check the rest of the filter list against this url
654 // before returning false to be certain it's not been listed in the filter list
655 }
656
657 return false;
658 }
659
660 /**
661 * Returns true if the url or pattern is found in the blacklist file.
662 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
663 * it won't get blacklisted after all. But that's not implemented here.
664 */
665 public boolean isBlacklisted(String url) {
666 boolean isBlackListed = isListedInFilterList(blackList, url);
667
668 // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
669 // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
670 String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol
671 if(!isBlackListed && url.contains("jasmin")) {
672 logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
673 blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
674 }
675 return isBlackListed;
676 }
677
678 /**
679 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
680 * Will eventually take precedence over a mention of the same URL pattern in the blacklist.
681 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
682 */
683 public boolean isGreylisted(String url) {
684 // auto-translated product sites
685 return isListedInFilterList(greyList, url);
686 }
687
688 /**
689 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
690 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
691 */
692 public boolean isWhitelisted(String url) {
693 return isListedInFilterList(whiteList, url);
694 }
695
696 /**
697 * Reads each line ("filter") of the given conf/url-black|grey|whitelist-filter.txt file into the
698 * given map, recording whether a url should match the filter exactly, as a prefix, as a suffix or as a substring.
699 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
700 * By not having this method deal with actual regex for filters, this has the advantage that
701 * we don't have to remember to escape or double escape each filter to turn it into a regex.
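 * e.g. a hypothetical filter line "^https://example.org" would be stored with LIST_ENTRY_STARTSWITH,
 * while a bare line "example" would be stored with LIST_ENTRY_CONTAINS.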
702 */
703 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
704
705 // if filterListFilename does not exist in the conf folder, just return
706 if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
707 logger.warn("Filter list filename: " + filterListFilename + " does not exist");
708 return;
709 }
710
711 try (
712 BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
713 ) {
714 String filter = null;
715 while((filter = reader.readLine()) != null) {
716 // skip comments and empty lines
717 filter = filter.trim();
718 if(filter.equals("") || filter.startsWith("#")) {
719 continue;
720 }
721
722 if(filter.startsWith("^") && filter.endsWith("$")) {
723 filter = filter.substring(1, filter.length()-1);
724 list.put(filter, LIST_ENTRY_MATCHES);
725 }
726 else if(filter.startsWith("^")) {
727 filter = filter.substring(1);
728 list.put(filter, LIST_ENTRY_STARTSWITH);
729 //logger.debug("Match filter startswith: " + filter);
730 }
731 else if(filter.endsWith("$")) {
732 filter = filter.substring(0, filter.length()-1);
733 list.put(filter, LIST_ENTRY_ENDSWITH);
734 //logger.debug("@@@ Match filter endswith: " + filter);
735 }
736 else {
737 list.put(filter, LIST_ENTRY_CONTAINS);
738 }
739 //logger.debug("Got filter: " + filter);
740 }
741
742 } catch (IOException ioe) {
743 logger.error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
744 }
745
746 }
747
748 /** Maintain a count of all WET files processed. */
749 public void setWETFileCount(int count) { this.wetFileCount = count; }
750
751 /** Maintain a count of all WET records processed. */
752 //public int getRecordCount() { return this.totalRecordCount; }
753 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
754 public void setRecordCount(int count) { this.totalRecordCount = count; }
755
756 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
757
758 // Will list all the warc.wet files in the input directory or else their gzipped versions
759 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
760
761 int wetRecordCount = 0;
762 int wetFileCount = 0;
763
764 for(int i = 0; i < WETFiles.length; i++) {
765 File WETFile = WETFiles[i];
766 logger.debug("Processing WETfile: " + WETFile);
767
768 // Any .gz file listed means it hasn't been unzipped yet. So unzip.
769 String WETFilename = WETFile.toString();
770 if(WETFilename.endsWith(".gz")) {
771 File GZippedWETFile = WETFile;
772 String WETGZippedFilename = WETFilename;
773 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
774
775 WETFile = new File(WETFilename);
776 Utility.unzipFile(GZippedWETFile, WETFile);
777 }
778 // hereafter all WETFiles should refer to the unzipped version
779 // Check the unzipped WETFile exists
780
781 if(!WETFile.exists() || !WETFile.isFile()) {
782 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
783 return;
784 }
785
786 // Finally, we can process this WETFile's records into the keep and discard pile
787 wetFileCount++;
788 logger.debug("Off to process " + WETFile);
789 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
790 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
791 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
792 wetFileProcessor.processWETFile();
793 wetRecordCount += wetFileProcessor.getRecordCount();
794 }
795
796 // for information purposes
797 this.setWETFileCount(wetFileCount);
798 this.setRecordCount(wetRecordCount);
799 }
800
801
802 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- //
803 public static void printUsage() {
804 System.err.println("Run this program as:");
805 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");
806 }
807
808 /** Filename filter to only list warc.wet files or else warc.wet.gz files
809 * for which unzipped warc.wet equivalents don't yet exist.
810 */
811 private static class WETFilenameFilter implements FilenameFilter {
812
813 public boolean accept(File dir, String name) {
814 if(name.endsWith(".warc.wet")) {
815 logger.debug("Will include " + name + " for processing.");
816 return true;
817 }
818
819 if(name.endsWith(".warc.wet.gz")) {
820 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
821 File unzippedVersion = new File(dir, nameWithoutGZext);
822 if(unzippedVersion.exists()) {
823 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
824 logger.debug("Skipping " + name);
825 return false; // don't count gzipped version if unzipped version exists.
826 }
827 else {
828 logger.debug("Only zipped version " + name + " exists.");
829 return true; // No unzipped version, so have to work with gzipped version
830 }
831 }
832
833 // we're not even interested in any other file extensions
834 logger.debug("Not a WET file. Skipping " + name);
835 return false;
836 }
837 }
838
839
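	/** Filename filter that accepts only subdirectories whose names match the CC-MAIN-YYYY-##-wet-files naming pattern. */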
840 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
841
842 public boolean accept(File dir, String name) {
843 File f = new File (dir, name);
844 if(f.isDirectory()) {
845 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
846 return true;
847 }
848 }
849 else {
850 logger.info("File " + f + " is not a directory");
851 }
852 return false;
853 }
854 }
855
856 public static void main(String[] args) {
857 if(args.length != 2) {
858 printUsage();
859 return;
860 }
861
862 File commoncrawlDir = new File(args[0]);
863 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
864 logger.error("Error: " + args[0] + " does not exist or is not a directory");
865 return;
866 }
867
868 File outFolder = new File(args[1]);
869 if(!outFolder.exists() || !outFolder.isDirectory()) {
870 logger.error("Error: " + args[1] + " does not exist or is not a directory.");
871 return;
872 }
873
874 try {
875 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
876
877 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
878
879 for(int i = 0; i < ccrawlFolders.length; i++) {
880 File ccrawlFolder = ccrawlFolders[i];
881 logger.info("About to process commoncrawl WET files folder: " + ccrawlFolder);
882 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
883 }
884
885 // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
886 // The domains file is the only purely global one: seedURLs and regex-urlfilters are
887 // repeated on a per site/domain basis too, stored in the sites folder
888 File seedURLsFile = new File(outFolder, "seedURLs.txt");
889 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
890 File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
891 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
892 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
893
894 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
895
896 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
897
898 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
899
900
901 } catch(Exception e) {
902 // can get an exception when instantiating CCWETProcessor instance
903 logger.error(e.getMessage(), e);
904 }
905
906 return;
907
908 }
909}