source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33503

Last change on this file since 33503 was 33503, checked in by ak19, 5 years ago

More efficient blacklisting/greylisting/whitelisting now by reading in the lists only once and then comparing each URL to each list. Explicit whitelisting has precedence over greylisting, which in turn takes precedence over blacklisting. Then any remaining urls are checked for having sufficient content. The code that checks for sufficient content still needs some more adjusting.

File size: 19.1 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeSet;
12
13import org.apache.log4j.Logger;
14
15/**
16 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
17 * the WET records in each, putting each WET record into a file. Each file is put into a
18 * keep or discard or greyListed folder, and its url written into a keep, discard
19 * or greylisted text file, based on
20 *
21 * 1. whether it's whitelisted, else greylisted else blacklisted
22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
23 * enough content. Formerly, content-length and number of lines were used to determine if
24 * the content was sufficient. Now it's just word count and number of MAX characters
25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
26 * in conf/config.properties.
27 *
28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
29 * into the conf folder to control any url patterns that are explicitly included or excluded or
30 * set aside for inspecting later. These filter text files don't use regexes, instead their
31 * format is:
32 * - precede URL by ^ to blacklist urls that match the given prefix
33 * - succeed URL by $ to blacklist urls that match the given suffix
34 * - ^url$ will blacklist urls that match the given url completely
35 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
36 *
37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
39 * over greylisting, which takes precedence over blacklisting in turn. However, even
39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
41 * also for nutch.
42 *
43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
44 * in the given input folder. Then use a single instance of the WETProcessor class to process
45 * each single unzipped warc.wet file.
46 *
47 * To compile, including the jars in lib/ for compiling.
48 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
49 *
50 * To run, passing the log4j and other properties files in conf/ folder:
51 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
52 *
53 * e.g.
54 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
56 *
57*/
58
59public class CCWETProcessor {
60 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
61
62 // Properties shared across WETProcessor instances
63 public final int MAX_WORD_LENGTH;
64 public final int MIN_NUM_WORDS;
65 public final int MAX_WORDS_CAMELCASE;
66
67 private Properties configProperties = new Properties();
68
69 // File paths shared across WETProcessor instances
70 public final File WETFilesDir;
71 public final File outputFolder;
72 public final File discardFolder;
73 public final File keepFolder;
74 public final File greyListedFolder;
75 public final File keepURLsFile;
76 public final File discardURLsFile;
77 public final File greyListedFile;
78
79 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
80 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
81 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
82 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
83
84 private HashMap<String, Integer> blackList;
85 private HashMap<String, Integer> greyList;
86 private HashMap<String, Integer> whiteList;
87
88 // Keep a count of all the records that all WETProcessors instantiated
89 // by our main method combined have processed
90 private int totalRecordCount = 0;
91
92 private int wetFileCount = 0;
93
94 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
95 this.WETFilesDir = inFolder;
96 this.outputFolder = outFolder;
97
98 // load up the properties from the config file
99 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
100 configProperties = new Properties();
101 configProperties.load(infile);
102 //infile.close(); // not explicitly called in examples of try-with-resources
103
104 } catch(Exception e) {
105 System.err.println("Exception attempting to read properties from config.properties.");
106 logger.error("Exception attempting to read properties from config.properties.");
107 e.printStackTrace();
108 }
109
110 if(configProperties.size() == 0) {
111 System.err.println("*** Warning: no values read into config properties. Using defaults.");
112 }
113
114 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
115 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
116 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
117
118
119 this.discardFolder = new File(outFolder, "discard");
120 if(!discardFolder.exists()) {
121 discardFolder.mkdir();
122 }
123 this.keepFolder = new File(outFolder, "keep");
124 if(!keepFolder.exists()) {
125 keepFolder.mkdir();
126 }
127
128 this.greyListedFolder = new File(outFolder, "greylisted");
129 if(!greyListedFolder.exists()) {
130 greyListedFolder.mkdir();
131 }
132
133 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
134 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
135 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
136 }
137 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
138 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
139 throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
140 }
141 this.greyListedFile = new File(outFolder, "greyListed.txt");
142 if(greyListedFile.exists() && !greyListedFile.delete()) {
143 throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
144 }
145
146 System.err.println("Loading blacklist.");
147 blackList = new HashMap<String, Integer>();
148 initURLFilterList(blackList, "url-blacklist-filter.txt");
149 System.err.println("Loading greylist.");
150 greyList = new HashMap<String, Integer>();
151 initURLFilterList(greyList, "url-greylist-filter.txt");
152 System.err.println("Loading whitelist.");
153 whiteList = new HashMap<String, Integer>();
154 initURLFilterList(whiteList, "url-whitelist-filter.txt");
155
156 //System.err.println("Prematurely terminating for testing purposes.");
157 //System.exit(-1);
158 }
159
160 /**
161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
163 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
164 */
165 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
166 // Maintain Sets of unique domains and urls
167 // TreeSet: by default, "the elements are ordered using their natural ordering"
168 // (or by a Comparator provided at set creation time).
169 // Whereas HashSet doesn't guarantee ordering.
170 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
171
172 Set<String> domainsSet = new TreeSet<String>();
173 Set<String> urlsSet = new TreeSet<String>();
174
175 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
176
177 try (
178 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
179 ) {
180
181 // read a URL at a time from urlsFile
182 String url = null;
183 String domain = null;
184 while((url = reader.readLine()) != null) { // readLine removes newline separator
185
186 // work out domain. This retains any www. or subdomain prefix:
187 int startIndex = url.indexOf("//"); // http:// or https:// prefix
188 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
189 domain = url.substring(startIndex);
190 int endIndex = domain.indexOf("/");
191 if(endIndex == -1) endIndex = domain.length();
192 domain = domain.substring(0, endIndex);
193
194 //if(!domainsMap.containsKey(domain)) {
195 urlsSet.add(url);
196 domainsSet.add(domain);
197 //}
198 }
199 } catch (IOException ioe) {
200 ioe.printStackTrace();
201 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
202 }
203
204 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
205 Iterator<String> i = urlsSet.iterator();
206 while(i.hasNext()) {
207 String url = i.next();
208 seedURLsWriter.write(url + "\n");
209 }
210
211 } catch (IOException ioe) {
212 ioe.printStackTrace();
213 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
214 }
215
216 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
217 Iterator<String> i = domainsSet.iterator();
218 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
219 while(i.hasNext()) {
220 String domain = i.next();
221 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
222 urlFilterWriter.write(domain + "\n");
223 }
224
225 } catch (IOException ioe) {
226 ioe.printStackTrace();
227 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
228 }
229 }
230
231 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
232 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
233 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
234 while(i.hasNext()) {
235 Map.Entry<String, Integer> entry = i.next();
236 String urlPattern = entry.getKey();
237 Integer matchRule = entry.getValue();
238
239 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
240 return true;
241 }
242 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
243 return true;
244 }
245 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
246 return true;
247 }
248 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
249 return true;
250 }
251 // else check the rest of the filter list against this url
252 // before returning false to be certain it's not been listed in the filter list
253 }
254
255 return false;
256 }
257
258 /**
259 * Returns true if the url or pattern is found in the blacklist file.
260 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
261 * it won't get blacklisted after all. But that's not implemented here.
262 */
263 public boolean isBlacklisted(String url) {
264 return isListedInFilterList(blackList, url);
265 }
266
267 /**
268 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
269 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
270 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
271 */
272 public boolean isGreylisted(String url) {
273 // TODO: alexa top sites and auto-translated product sites
274 return isListedInFilterList(greyList, url);
275 }
276
277 /**
278 * Returns true if the url or pattern is explicitly mentioned in the whitelist file
279 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
280 */
281 public boolean isWhitelisted(String url) {
282 return isListedInFilterList(whiteList, url);
283 }
284
285 /**
286 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
287 * whether it is in the discard list.
288 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
289 * By not having this method deal with actual regex for filters, this has the advantage that
290 * we don't have to remember to escape or double escape each filter to turn it into a regex.
291 */
292 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
293
294 // if filterListFilename does not exist in the conf folder, just return
295 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
296 System.err.println(filterListFilename + " does not exist");
297 return;
298 }
299
300 try (
301 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
302 ) {
303 String filter = null;
304 while((filter = reader.readLine()) != null) {
305 // skip comments and empty lines
306 filter = filter.trim();
307 if(filter.equals("") || filter.startsWith("#")) {
308 continue;
309 }
310
311 if(filter.startsWith("^") && filter.endsWith("$")) {
312 filter = filter.substring(1, filter.length()-1);
313 list.put(filter, LIST_ENTRY_MATCHES);
314 }
315 else if(filter.startsWith("^")) {
316 filter = filter.substring(1);
317 list.put(filter, LIST_ENTRY_STARTSWITH);
318 System.err.println("Match filter startswith: " + filter);
319 }
320 else if(filter.endsWith("$")) {
321 filter = filter.substring(0, filter.length()-1);
322 list.put(filter, LIST_ENTRY_ENDSWITH);
323 }
324 else {
325 list.put(filter, LIST_ENTRY_CONTAINS);
326 }
327 //System.err.println("Got filter: " + filter);
328 }
329
330 } catch (IOException ioe) {
331 ioe.printStackTrace();
332 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
333 }
334
335 }
336 /*
337 public boolean isInDiscardFilter(String url) {
338 String discardFilterFile = "url-discard-filter.txt"; // in conf folder
339
340 try (
341 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
342 ) {
343 String filter = null;
344 while((filter = reader.readLine()) != null) {
345 if(filter.trim().equals("")) {
346 continue;
347 }
348 //System.err.println("Got filter: " + filter);
349 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
350 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
351 }
352 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
353 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
354 return true;
355 }
356 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
357 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
358 return true;
359 }
360 else if(url.contains(filter)) {
361 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
362 return true;
363 }
364
365 }
366
367 } catch (IOException ioe) {
368 ioe.printStackTrace();
369 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
370 }
371
372 return false;
373 }*/
374
375 /** Maintain a count of all WET files processed. */
376 public void setWETFileCount(int count) { this.wetFileCount = count; }
377
378 /** Maintain a count of all WET records processed. */
379 //public int getRecordCount() { return this.totalRecordCount; }
380 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
381 public void setRecordCount(int count) { this.totalRecordCount = count; }
382
383 public static void printUsage() {
384 System.err.println("Run this program as:");
385 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
386 }
387
388 /** Filename filter to only list warc.wet files or else warc.wet.gz files
389 * for which unzipped warc.wet equivalents don't yet exist.
390 */
391 private static class WETFilenameFilter implements FilenameFilter {
392
393 public boolean accept(File dir, String name) {
394 if(name.endsWith(".warc.wet")) {
395 logger.debug("Will include " + name + " for processing.");
396 return true;
397 }
398
399 if(name.endsWith(".warc.wet.gz")) {
400 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
401 File unzippedVersion = new File(dir, nameWithoutGZext);
402 if(unzippedVersion.exists()) {
403 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
404 logger.debug("Skipping " + name);
405 return false; // don't count gzipped version if unzipped version exists.
406 }
407 else {
408 logger.debug("Only zipped version " + name + " exists.");
409 return true; // No unzipped version, so have to work with gzipped version
410 }
411 }
412
413 // we're not even interested in any other file extensions
414 logger.debug("Not a WET file. Skipping " + name);
415 return false;
416 }
417 }
418
419
420 public static void main(String[] args) {
421 if(args.length != 2) {
422 printUsage();
423 return;
424 }
425
426
427 File WETFileDir = new File(args[0]);
428 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
429 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
430 return;
431 }
432
433 File outFolder = new File(args[1]);
434 if(!outFolder.exists() || !outFolder.isDirectory()) {
435 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
436 return;
437 }
438
439 try {
440 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
441
442 //ccWETFilesProcessor.processAllWETFiles();
443
444 // Will list all the warc.wet files in the input directory or else their gzipped versions
445 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
446
447 int wetRecordCount = 0;
448 int wetFileCount = 0;
449
450 for(int i = 0; i < WETFiles.length; i++) {
451 File WETFile = WETFiles[i];
452 logger.debug("Processing WETfile: " + WETFile);
453
454 // Any .gz files listed means they haven't been unzipped yet. So unzip.
455 String WETFilename = WETFile.toString();
456 if(WETFilename.endsWith(".gz")) {
457 File GZippedWETFile = WETFile;
458 String WETGZippedFilename = WETFilename;
459 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
460
461 WETFile = new File(WETFilename);
462 Utility.unzipFile(GZippedWETFile, WETFile);
463 }
464 // hereafter all WETFiles should refer to the unzipped version
465 // Check the unzipped WETFile exists
466
467 if(!WETFile.exists() || !WETFile.isFile()) {
468 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
469 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
470 return;
471 }
472
473 // Finally, we can process this WETFile's records into the keep and discard pile
474 wetFileCount++;
475 logger.debug("Off to process " + WETFile);
476 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
477 wetFileProcessor.processWETFile();
478 wetRecordCount += wetFileProcessor.getRecordCount();
479 }
480
481 // for information purposes
482 ccWETFilesProcessor.setWETFileCount(wetFileCount);
483 ccWETFilesProcessor.setRecordCount(wetRecordCount);
484
485 File seedURLsFile = new File(outFolder, "seedURLs.txt");
486 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
487 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
488 } catch(Exception e) {
489 // can get an exception when instantiating CCWETProcessor instance
490 e.printStackTrace();
491 System.err.println(e.getMessage());
492 }
493
494 return;
495
496 }
497}
Note: See TracBrowser for help on using the repository browser.