source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33519

Last change on this file since 33519 was 33519, checked in by ak19, 5 years ago

Code still writes out the global seedURLs.txt and regex-urlfilter.txt (in case this remains meaningful), but now also creates individual site directories containing their individual seedURLs.txt and regex-urlfilter.txt

File size: 22.1 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep, discard or greylisted folder, and its url is written into a keep, discard
 * or greylisted text file, based on
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted;
 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count and the MAXIMUM number of characters
 * (not the minimum) that determine whether a string counts as a word. These settings can be
 * adjusted in conf/config.properties.
 *
 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
 * into the conf folder to control any url patterns that are explicitly included, excluded or
 * set aside for inspecting later. These filter text files don't use regexes; instead their
 * format is:
 * - precede a URL with ^ to match urls with the given prefix
 * - follow a URL with $ to match urls with the given suffix
 * - ^url$ will match the given url exactly
 * - without either ^ or $ symbol, any url containing the given url will match
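 *
 * For example, a url-blacklist-filter.txt might contain entries like these (made-up urls,
 * shown purely to illustrate the format above):
 *   ^http://ads.example.com
 *   .pdf$
 *   ^http://example.org/spam$
 *   doubleclick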
 *
 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
 * over greylisting, which in turn takes precedence over blacklisting. However, even
 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
 * also for nutch.
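 *
 * As a rough sketch of that precedence (the actual per-record decision is made in
 * WETProcessor.java, not here; this only illustrates the rules just described):
 *   if (isWhitelisted(url))        keep, provided the record also has sufficient content
 *   else if (isGreylisted(url))    set aside in the greylisted folder/file for inspection
 *   else if (isBlacklisted(url))   discard
 *   else                           keep or discard based on content sufficiency alone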
 *
 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
 * in the given input folder. Then use a single instance of the WETProcessor class to process
 * each single unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
 *
 * To run, passing the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */

public class CCWETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());

    // Properties shared across WETProcessor instances
    public final int MAX_WORD_LENGTH;
    public final int MIN_NUM_WORDS;
    public final int MAX_WORDS_CAMELCASE;

    private Properties configProperties = new Properties();

    // File paths shared across WETProcessor instances
    public final File WETFilesDir;
    public final File outputFolder;
    public final File discardFolder;
    public final File keepFolder;
    public final File greyListedFolder;
    public final File keepURLsFile;
    public final File discardURLsFile;
    public final File greyListedFile;

    /** Possible values stored in the blackList/whiteList/greyList Maps */
    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    private final Integer LIST_ENTRY_MATCHES = new Integer(3);
    /**
     * Store url patterns as keys, with values indicating whether a url should
     * match the pattern exactly, start or end with it, or contain it.
     */
    private HashMap<String, Integer> blackList;
    private HashMap<String, Integer> greyList;
    private HashMap<String, Integer> whiteList;

    /** Map of domains we keep, each mapped to the full urls we're keeping that are of that domain.
     * A TreeMap preserves the natural (alphabetical) ordering of its keys, whereas a HashMap has
     * no notion of ordering; since we just need to store urls grouped by their domains, whether
     * the domains or the urls per domain are sorted may be irrelevant. (Does it really not matter?
     * What about urls sharing a prefix, e.g. pinky.com/toto/index.html and
     * pinky.com/toto/nono/file.html - is there any benefit to nutch when crawling if these
     * seedURLs are ordered or not?)
     */
    private Map<String, Set<String>> domainsToURLsMap;

    // Keep a count of all the records processed across all the WETProcessor instances
    // instantiated by our main method
    private int totalRecordCount = 0;

    private int wetFileCount = 0;

    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
        this.WETFilesDir = inFolder;
        this.outputFolder = outFolder;

        // load up the properties from the config file
        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
            configProperties = new Properties();
            configProperties.load(infile);
            //infile.close(); // not explicitly called in examples of try-with-resources

        } catch(Exception e) {
            System.err.println("Exception attempting to read properties from config.properties.");
            logger.error("Exception attempting to read properties from config.properties.");
            e.printStackTrace();
        }

        if(configProperties.size() == 0) {
            System.err.println("*** Warning: no values read into config properties. Using defaults.");
        }

        MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
        MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
        MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));


        this.discardFolder = new File(outFolder, "discard");
        if(!discardFolder.exists()) {
            discardFolder.mkdir();
        }
        this.keepFolder = new File(outFolder, "keep");
        if(!keepFolder.exists()) {
            keepFolder.mkdir();
        }

        this.greyListedFolder = new File(outFolder, "greylisted");
        if(!greyListedFolder.exists()) {
            greyListedFolder.mkdir();
        }

        this.keepURLsFile = new File(outFolder, "keepURLs.txt");
        if(keepURLsFile.exists() && !keepURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
        }
        this.discardURLsFile = new File(outFolder, "discardURLs.txt");
        if(discardURLsFile.exists() && !discardURLsFile.delete()) {
            throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
        }
        this.greyListedFile = new File(outFolder, "greyListed.txt");
        if(greyListedFile.exists() && !greyListedFile.delete()) {
            throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
        }

        // prepare our blacklist, greylist (for inspection) and whitelist
        System.err.println("Loading blacklist.");
        blackList = new HashMap<String, Integer>();
        initURLFilterList(blackList, "url-blacklist-filter.txt");

        System.err.println("Loading greylist.");
        greyList = new HashMap<String, Integer>();
        initURLFilterList(greyList, "url-greylist-filter.txt");

        System.err.println("Loading whitelist.");
        whiteList = new HashMap<String, Integer>();
        initURLFilterList(whiteList, "url-whitelist-filter.txt");

        //System.err.println("Prematurely terminating for testing purposes.");
        //System.exit(-1);
    }

    /**
     * Using the keepURLs.txt file generated by running WETProcessor instances, produces
     * as output the URL seed list and regex-urlfilter text files required by nutch, see
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
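     *
     * As a rough illustration of the layout produced under the output folder (the site numbers
     * and url counts below are made-up examples, not actual output):
     *   seedURLs.txt                    - all kept urls, one per line
     *   regex-urlfilter.txt             - one +https?://([a-z0-9-]+\.)*<escaped-domain>/ line per domain
     *   sites/00001/seedURLs.txt        - the first domain, followed by its urls, each tab-indented
     *   sites/00001/regex-urlfilter.txt - the regex line for that one domain
     *   sites/00002/...                 - and so on, one zero-padded folder per domain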
     */
    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
        // Maintain Sets of unique domains and urls
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time).
        // Whereas HashSet doesn't guarantee ordering.
        // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.

        //Set<String> domainsSet = new TreeSet<String>();
        //Set<String> urlsSet = new TreeSet<String>();
        domainsToURLsMap = new TreeMap<String, Set<String>>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*

        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
        ) {

            // read a URL at a time from urlsFile
            String url = null;
            String domain = null;
            while((url = reader.readLine()) != null) { // readLine removes newline separator

                // work out domain. This retains any www. or subdomain prefix:
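                // e.g. a (made-up) url https://www.example.org/path/page.html
                // would give the domain www.example.org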
                int startIndex = url.indexOf("//"); // http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if(endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                //urlsSet.add(url);
                //domainsSet.add(domain);
                Set<String> urlsSet;
                if(!domainsToURLsMap.containsKey(domain)) {
                    urlsSet = new TreeSet<String>();
                    urlsSet.add(url);
                    domainsToURLsMap.put(domain, urlsSet);
                } else {
                    urlsSet = domainsToURLsMap.get(domain);
                    urlsSet.add(url);
                }

            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
        }

        /*
        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while(i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }
        */

        int domainCount = 0;
        File sitesFolder = new File(outputFolder, "sites");
        if(!sitesFolder.exists()) {
            sitesFolder.mkdir();
        }
        final String FORMATSTR = "%05d";
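        // (zero-padded site folder names, e.g. a domainCount of 1 gives siteID "00001")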

        // write out each domain followed in sequence by all urls we found in that domain
        // (urls with tab up front)
        try (
            BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
            BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
        ) {
            //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> domainIterator = domainsSet.iterator();

            while(domainIterator.hasNext()) {
                domainCount++;
                String siteID = String.format(FORMATSTR, domainCount);
                File domainFolder = new File(sitesFolder, siteID);
                domainFolder.mkdir();

                // write out the domain
                String domain = domainIterator.next();
                //seedURLsWriter.write(domain + "\n");
                // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
                String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(regexed_domain + "\n");

                // for every domain, we need sites/0000x/ folder containing its own
                // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
                File siteRegexFile = new File(domainFolder, "regex-urlfilter.txt"); // e.g. sites/00001/regex-urlfilter.txt
                try (
                    BufferedWriter siteURLsWriter = new BufferedWriter(new FileWriter(siteSeedsFile));
                    BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                ) {
                    // only write urls, and no domain, into the single global seedURLs file,
                    // but write the domain and tabbed urls into the individual
                    // sites/0000x/seedURLs.txt files, and the regexed domain into each site's
                    // regex-urlfilter.txt too
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");

                    // next write out the urls for the domain with a tab prefixed to each
                    // into the sites/0000x/seedURLs.txt file - also write them into the global seeds file
                    Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                    Iterator<String> urlIterator = urlsForDomainSet.iterator();
                    while(urlIterator.hasNext()) {
                        String url = urlIterator.next();
                        seedURLsWriter.write(url + "\n"); // global seedURLs file
                        siteURLsWriter.write("\t" + url + "\n");
                    }
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                    System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
                }
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile);
        }

        // write out domains as regular expressions into "regex-urlfilter.txt" file
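        // (note: this re-opens the same global urlFilterFile and overwrites the per-domain
        // regex lines already written above with identical content)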
        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Set<String> domainsSet = domainsToURLsMap.keySet();
            Iterator<String> i = domainsSet.iterator();
            // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while(i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
        Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
        Iterator<Map.Entry<String, Integer>> i = entries.iterator();
        while(i.hasNext()) {
            Map.Entry<String, Integer> entry = i.next();
            String urlPattern = entry.getKey();
            Integer matchRule = entry.getValue();

            if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
                return true;
            }
            else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
                return true;
            }
            // else check the rest of the filter list against this url
            // before returning false to be certain it's not been listed in the filter list
        }

        return false;
    }

    /**
     * Returns true if the url or pattern is found in the blacklist file.
     * Note that if the same url pattern is also found in the greylist or whitelist,
     * it ultimately won't get blacklisted after all. But that's not implemented here.
     */
    public boolean isBlacklisted(String url) {
        return isListedInFilterList(blackList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     * Greylisting takes precedence if the same URL pattern is also mentioned in the blacklist,
     * but is itself overridden if the pattern is also mentioned in the whitelist.
     */
    public boolean isGreylisted(String url) {
        // TODO: alexa top sites and auto-translated product sites
        return isListedInFilterList(greyList, url);
    }

    /**
     * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
     * Its mention in the whitelist moreover overrides any mention in the blacklist and greylist.
     */
    public boolean isWhitelisted(String url) {
        return isListedInFilterList(whiteList, url);
    }

    /**
     * Loads each line ("filter") of the given filter list file from the conf folder
     * (e.g. url-blacklist-filter.txt) into the given map, for later checking of urls against that list.
     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     * By not having this method deal with actual regex for filters, this has the advantage that
     * we don't have to remember to escape or double escape each filter to turn it into a regex.
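     *
     * To illustrate how filter lines map onto the match rules below (made-up entries):
     *   ^http://ads.example.com      becomes a LIST_ENTRY_STARTSWITH entry
     *   .pdf$                        becomes a LIST_ENTRY_ENDSWITH entry
     *   ^http://example.org/spam$    becomes a LIST_ENTRY_MATCHES entry
     *   doubleclick                  becomes a LIST_ENTRY_CONTAINS entry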
     */
    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {

        // if filterListFilename does not exist in the conf folder, just return
        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
            System.err.println(filterListFilename + " does not exist");
            return;
        }

        try (
            BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
        ) {
            String filter = null;
            while((filter = reader.readLine()) != null) {
                // skip comments and empty lines
                filter = filter.trim();
                if(filter.equals("") || filter.startsWith("#")) {
                    continue;
                }

                if(filter.startsWith("^") && filter.endsWith("$")) {
                    filter = filter.substring(1, filter.length()-1);
                    list.put(filter, LIST_ENTRY_MATCHES);
                }
                else if(filter.startsWith("^")) {
                    filter = filter.substring(1);
                    list.put(filter, LIST_ENTRY_STARTSWITH);
                    System.err.println("Match filter startswith: " + filter);
                }
                else if(filter.endsWith("$")) {
                    filter = filter.substring(0, filter.length()-1);
                    list.put(filter, LIST_ENTRY_ENDSWITH);
                }
                else {
                    list.put(filter, LIST_ENTRY_CONTAINS);
                }
                //System.err.println("Got filter: " + filter);
            }

        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
        }

    }

    /** Maintain a count of all WET files processed. */
    public void setWETFileCount(int count) { this.wetFileCount = count; }

    /** Maintain a count of all WET records processed. */
    //public int getRecordCount() { return this.totalRecordCount; }
    //public void addToRecordCount(int count) { this.totalRecordCount += count; }
    public void setRecordCount(int count) { this.totalRecordCount = count; }

    public static void printUsage() {
        System.err.println("Run this program as:");
        System.err.println("\tCCWETProcessor <folder containing wet(.gz) files> <output folder path>");
    }

    /** Filename filter to only list warc.wet files, or else warc.wet.gz files
     * for which unzipped warc.wet equivalents don't yet exist.
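     * For instance, given hypothetical files a.warc.wet, a.warc.wet.gz and b.warc.wet.gz in the
     * input folder, only a.warc.wet and b.warc.wet.gz would be accepted.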
     */
    private static class WETFilenameFilter implements FilenameFilter {

        public boolean accept(File dir, String name) {
            if(name.endsWith(".warc.wet")) {
                logger.debug("Will include " + name + " for processing.");
                return true;
            }

            if(name.endsWith(".warc.wet.gz")) {
                String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
                File unzippedVersion = new File(dir, nameWithoutGZext);
                if(unzippedVersion.exists()) {
                    logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
                    logger.debug("Skipping " + name);
                    return false; // don't count gzipped version if unzipped version exists.
                }
                else {
                    logger.debug("Only zipped version " + name + " exists.");
                    return true; // No unzipped version, so have to work with gzipped version
                }
            }

            // we're not even interested in any other file extensions
            logger.debug("Not a WET file. Skipping " + name);
            return false;
        }
    }


    public static void main(String[] args) {
        if(args.length != 2) {
            printUsage();
            return;
        }


        File WETFileDir = new File(args[0]);
        if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
            System.out.println("Error: " + args[0] + " does not exist or is not a directory");
            return;
        }

        File outFolder = new File(args[1]);
        if(!outFolder.exists() || !outFolder.isDirectory()) {
            System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
            return;
        }

        try {
            CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);

            //ccWETFilesProcessor.processAllWETFiles();

            // Will list all the warc.wet files in the input directory, or else their gzipped versions
            File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());

            int wetRecordCount = 0;
            int wetFileCount = 0;

            for(int i = 0; i < WETFiles.length; i++) {
                File WETFile = WETFiles[i];
                logger.debug("Processing WETfile: " + WETFile);

                // Any .gz file listed means it hasn't been unzipped yet, so unzip it.
                String WETFilename = WETFile.toString();
                if(WETFilename.endsWith(".gz")) {
                    File GZippedWETFile = WETFile;
                    String WETGZippedFilename = WETFilename;
                    WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));

                    WETFile = new File(WETFilename);
                    Utility.unzipFile(GZippedWETFile, WETFile);
                }
                // hereafter WETFile should refer to the unzipped version.
                // Check that the unzipped WETFile exists.

                if(!WETFile.exists() || !WETFile.isFile()) {
                    System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
                    logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
                    return;
                }

                // Finally, we can process this WETFile's records into the keep and discard piles
                wetFileCount++;
                logger.debug("Off to process " + WETFile);
                WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
                wetFileProcessor.processWETFile();
                wetRecordCount += wetFileProcessor.getRecordCount();
            }

            // for information purposes
            ccWETFilesProcessor.setWETFileCount(wetFileCount);
            ccWETFilesProcessor.setRecordCount(wetRecordCount);

            File seedURLsFile = new File(outFolder, "seedURLs.txt");
            File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
            ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);

            System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

        } catch(Exception e) {
            // can get an exception when instantiating the CCWETProcessor instance
            e.printStackTrace();
            System.err.println(e.getMessage());
        }

        return;

    }
}