source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33515

Last change on this file since 33515 was 33515, checked in by ak19, 5 years ago

Removed an unused function

File size: 17.7 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeSet;
12
13import org.apache.log4j.Logger;
14
15/**
16 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
17 * the WET records in each, putting each WET record into a file. Each file is put into a
18 * keep or discard or greyListed folder, and its url written into a keep, discard
19 * or greylisted text file, based on
20 *
21 * 1. whether it's whitelisted, else greylisted else blacklisted
22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
23 * enough content. Formerly, content-length and number of lines were used to determine if
24 * the content was sufficient. Now it's just word count and number of MAX characters
25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
26 * in conf/config.properties.
27 *
28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
29 * into the conf folder to control any url patterns that are explicitly included or excluded or
30 * set aside for inspecting later. These filter text files don't use regexes, instead their
31 * format is:
32 * - precede URL by ^ to blacklist urls that match the given prefix
33 * - succeed URL by $ to blacklist urls that match the given suffix
34 * - ^url$ will blacklist urls that match the given url completely
35 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
36 *
37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
38 * over greylisting, which in turn takes precedence over blacklisting. However, even
39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
41 * also for nutch.
42 *
43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
44 * in the given input folder. Then use a single instance of the WETProcessor class to process
45 * each single unzipped warc.wet file.
46 *
47 * To compile, including the jars in lib/ for compiling.
48 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
49 *
50 * To run, passing the log4j and other properties files in conf/ folder:
51 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
52 *
53 * e.g.
54 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
56 *
57*/
58
59public class CCWETProcessor {
60 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
61
62 // Properties shared across WETProcessor instances
63 public final int MAX_WORD_LENGTH;
64 public final int MIN_NUM_WORDS;
65 public final int MAX_WORDS_CAMELCASE;
66
67 private Properties configProperties = new Properties();
68
69 // File paths shared across WETProcessor instances
70 public final File WETFilesDir;
71 public final File outputFolder;
72 public final File discardFolder;
73 public final File keepFolder;
74 public final File greyListedFolder;
75 public final File keepURLsFile;
76 public final File discardURLsFile;
77 public final File greyListedFile;
78
79 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
80 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
81 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
82 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
83
84 private HashMap<String, Integer> blackList;
85 private HashMap<String, Integer> greyList;
86 private HashMap<String, Integer> whiteList;
87
88 // Keep a count of all the records that all WETProcessors instantiated
89 // by our main method combined have processed
90 private int totalRecordCount = 0;
91
92 private int wetFileCount = 0;
93
/**
 * Sets up the state shared by all WETProcessor instances: loads config.properties
 * off the classpath, creates the keep/discard/greylisted output folders, deletes any
 * leftover URL-listing files from a previous run, and loads the black/grey/white
 * url filter lists from their conf-folder files.
 *
 * @param inFolder  folder containing the warc.wet(.gz) files to process
 * @param outFolder folder under which all output folders and files are created
 * @throws Exception if a pre-existing URL listing file cannot be deleted
 */
public CCWETProcessor(File inFolder, File outFolder) throws Exception {
    this.WETFilesDir = inFolder;
    this.outputFolder = outFolder;

    // load up the properties from the config file
    // (if the resource is missing, getResourceAsStream returns null and the
    // resulting exception is caught below, leaving configProperties empty)
    try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
	configProperties = new Properties();
	configProperties.load(infile);
	//infile.close(); // not explicitly called in examples of try-with-resources

    } catch(Exception e) {
	System.err.println("Exception attempting to read properties from config.properties.");
	logger.error("Exception attempting to read properties from config.properties.");
	e.printStackTrace();
    }

    if(configProperties.size() == 0) {
	System.err.println("*** Warning: no values read into config properties. Using defaults.");
    }

    // Tunable thresholds; the second argument is the default used when the
    // key is absent (or when the whole properties file failed to load).
    MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
    MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
    MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));

    // Create the three output folders if they don't exist yet.
    // NOTE(review): mkdir() return values are ignored here; a failed mkdir
    // only surfaces later when files are written into the folder — confirm
    // whether failing fast would be preferable.
    this.discardFolder = new File(outFolder, "discard");
    if(!discardFolder.exists()) {
	discardFolder.mkdir();
    }
    this.keepFolder = new File(outFolder, "keep");
    if(!keepFolder.exists()) {
	keepFolder.mkdir();
    }

    this.greyListedFolder = new File(outFolder, "greylisted");
    if(!greyListedFolder.exists()) {
	greyListedFolder.mkdir();
    }

    // Start each run with fresh URL listing files: refuse to proceed if a
    // stale one can't be removed, since appending to it would corrupt results.
    this.keepURLsFile = new File(outFolder, "keepURLs.txt");
    if(keepURLsFile.exists() && !keepURLsFile.delete()) {
	throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
    }
    this.discardURLsFile = new File(outFolder, "discardURLs.txt");
    if(discardURLsFile.exists() && !discardURLsFile.delete()) {
	throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
    }
    this.greyListedFile = new File(outFolder, "greyListed.txt");
    if(greyListedFile.exists() && !greyListedFile.delete()) {
	throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
    }

    // Load the three url filter lists; a missing filter file just leaves
    // the corresponding map empty (see initURLFilterList).
    System.err.println("Loading blacklist.");
    blackList = new HashMap<String, Integer>();
    initURLFilterList(blackList, "url-blacklist-filter.txt");
    System.err.println("Loading greylist.");
    greyList = new HashMap<String, Integer>();
    initURLFilterList(greyList, "url-greylist-filter.txt");
    System.err.println("Loading whitelist.");
    whiteList = new HashMap<String, Integer>();
    initURLFilterList(whiteList, "url-whitelist-filter.txt");

    //System.err.println("Prematurely terminating for testing purposes.");
    //System.exit(-1);
}
159
160 /**
161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
163 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
164 */
165 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
166 // Maintain Sets of unique domains and urls
167 // TreeSet: by default, "the elements are ordered using their natural ordering"
168 // (or by a Comparator provided at set creation time).
169 // Whereas HashSet doesn't guarantee ordering.
170 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
171
172 Set<String> domainsSet = new TreeSet<String>();
173 Set<String> urlsSet = new TreeSet<String>();
174
175 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
176
177 try (
178 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
179 ) {
180
181 // read a URL at a time from urlsFile
182 String url = null;
183 String domain = null;
184 while((url = reader.readLine()) != null) { // readLine removes newline separator
185
186 // work out domain. This retains any www. or subdomain prefix:
187 int startIndex = url.indexOf("//"); // http:// or https:// prefix
188 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
189 domain = url.substring(startIndex);
190 int endIndex = domain.indexOf("/");
191 if(endIndex == -1) endIndex = domain.length();
192 domain = domain.substring(0, endIndex);
193
194 //if(!domainsMap.containsKey(domain)) {
195 urlsSet.add(url);
196 domainsSet.add(domain);
197 //}
198 }
199 } catch (IOException ioe) {
200 ioe.printStackTrace();
201 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
202 }
203
204 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
205 Iterator<String> i = urlsSet.iterator();
206 while(i.hasNext()) {
207 String url = i.next();
208 seedURLsWriter.write(url + "\n");
209 }
210
211 } catch (IOException ioe) {
212 ioe.printStackTrace();
213 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
214 }
215
216 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
217 Iterator<String> i = domainsSet.iterator();
218 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
219 while(i.hasNext()) {
220 String domain = i.next();
221 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
222 urlFilterWriter.write(domain + "\n");
223 }
224
225 } catch (IOException ioe) {
226 ioe.printStackTrace();
227 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
228 }
229 }
230
231 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
232 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
233 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
234 while(i.hasNext()) {
235 Map.Entry<String, Integer> entry = i.next();
236 String urlPattern = entry.getKey();
237 Integer matchRule = entry.getValue();
238
239 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
240 return true;
241 }
242 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
243 return true;
244 }
245 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
246 return true;
247 }
248 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
249 return true;
250 }
251 // else check the rest of the filter list against this url
252 // before returning false to be certain it's not been listed in the filter list
253 }
254
255 return false;
256 }
257
/**
 * Returns true if the url or pattern is found in the blacklist file.
 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
 * it won't get blacklisted after all. But that's not implemented here.
 *
 * @param url the url to test against the blacklist filter entries
 */
public boolean isBlacklisted(String url) {
    return isListedInFilterList(blackList, url);
}

/**
 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
 *
 * @param url the url to test against the greylist filter entries
 */
public boolean isGreylisted(String url) {
    // TODO: alexa top sites and auto-translated product sites
    return isListedInFilterList(greyList, url);
}

/**
 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
 *
 * @param url the url to test against the whitelist filter entries
 */
public boolean isWhitelisted(String url) {
    return isListedInFilterList(whiteList, url);
}
284
285 /**
286 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
287 * whether it is in the discard list.
288 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
289 * By not having this method deal with actual regex for filters, this has the advantage that
290 * we don't have to remember to escape or double escape each filter to turn it into a regex.
291 */
292 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
293
294 // if filterListFilename does not exist in the conf folder, just return
295 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
296 System.err.println(filterListFilename + " does not exist");
297 return;
298 }
299
300 try (
301 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
302 ) {
303 String filter = null;
304 while((filter = reader.readLine()) != null) {
305 // skip comments and empty lines
306 filter = filter.trim();
307 if(filter.equals("") || filter.startsWith("#")) {
308 continue;
309 }
310
311 if(filter.startsWith("^") && filter.endsWith("$")) {
312 filter = filter.substring(1, filter.length()-1);
313 list.put(filter, LIST_ENTRY_MATCHES);
314 }
315 else if(filter.startsWith("^")) {
316 filter = filter.substring(1);
317 list.put(filter, LIST_ENTRY_STARTSWITH);
318 System.err.println("Match filter startswith: " + filter);
319 }
320 else if(filter.endsWith("$")) {
321 filter = filter.substring(0, filter.length()-1);
322 list.put(filter, LIST_ENTRY_ENDSWITH);
323 }
324 else {
325 list.put(filter, LIST_ENTRY_CONTAINS);
326 }
327 //System.err.println("Got filter: " + filter);
328 }
329
330 } catch (IOException ioe) {
331 ioe.printStackTrace();
332 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
333 }
334
335 }
336
/** Maintain a count of all WET files processed. */
public void setWETFileCount(int count) { this.wetFileCount = count; }

/** Maintain a count of all WET records processed. */
// The incremental accessors below were retired: main() tallies the total
// across all WETProcessors itself and sets it once via setRecordCount().
//public int getRecordCount() { return this.totalRecordCount; }
//public void addToRecordCount(int count) { this.totalRecordCount += count; }
public void setRecordCount(int count) { this.totalRecordCount = count; }
344
345 public static void printUsage() {
346 System.err.println("Run this program as:");
347 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
348 }
349
350 /** Filename filter to only list warc.wet files or else warc.wet.gz files
351 * for which unzipped warc.wet equivalents don't yet exist.
352 */
353 private static class WETFilenameFilter implements FilenameFilter {
354
355 public boolean accept(File dir, String name) {
356 if(name.endsWith(".warc.wet")) {
357 logger.debug("Will include " + name + " for processing.");
358 return true;
359 }
360
361 if(name.endsWith(".warc.wet.gz")) {
362 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
363 File unzippedVersion = new File(dir, nameWithoutGZext);
364 if(unzippedVersion.exists()) {
365 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
366 logger.debug("Skipping " + name);
367 return false; // don't count gzipped version if unzipped version exists.
368 }
369 else {
370 logger.debug("Only zipped version " + name + " exists.");
371 return true; // No unzipped version, so have to work with gzipped version
372 }
373 }
374
375 // we're not even interested in any other file extensions
376 logger.debug("Not a WET file. Skipping " + name);
377 return false;
378 }
379 }
380
381
382 public static void main(String[] args) {
383 if(args.length != 2) {
384 printUsage();
385 return;
386 }
387
388
389 File WETFileDir = new File(args[0]);
390 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
391 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
392 return;
393 }
394
395 File outFolder = new File(args[1]);
396 if(!outFolder.exists() || !outFolder.isDirectory()) {
397 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
398 return;
399 }
400
401 try {
402 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
403
404 //ccWETFilesProcessor.processAllWETFiles();
405
406 // Will list all the warc.wet files in the input directory or else their gzipped versions
407 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
408
409 int wetRecordCount = 0;
410 int wetFileCount = 0;
411
412 for(int i = 0; i < WETFiles.length; i++) {
413 File WETFile = WETFiles[i];
414 logger.debug("Processing WETfile: " + WETFile);
415
416 // Any .gz files listed means they haven't been unzipped yet. So unzip.
417 String WETFilename = WETFile.toString();
418 if(WETFilename.endsWith(".gz")) {
419 File GZippedWETFile = WETFile;
420 String WETGZippedFilename = WETFilename;
421 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
422
423 WETFile = new File(WETFilename);
424 Utility.unzipFile(GZippedWETFile, WETFile);
425 }
426 // hereafter all WETFiles should refer to the unzipped version
427 // Check the unzipped WETFile exists
428
429 if(!WETFile.exists() || !WETFile.isFile()) {
430 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
431 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
432 return;
433 }
434
435 // Finally, we can process this WETFile's records into the keep and discard pile
436 wetFileCount++;
437 logger.debug("Off to process " + WETFile);
438 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
439 wetFileProcessor.processWETFile();
440 wetRecordCount += wetFileProcessor.getRecordCount();
441 }
442
443 // for information purposes
444 ccWETFilesProcessor.setWETFileCount(wetFileCount);
445 ccWETFilesProcessor.setRecordCount(wetRecordCount);
446
447 File seedURLsFile = new File(outFolder, "seedURLs.txt");
448 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
449 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
450 } catch(Exception e) {
451 // can get an exception when instantiating CCWETProcessor instance
452 e.printStackTrace();
453 System.err.println(e.getMessage());
454 }
455
456 return;
457
458 }
459}
Note: See TracBrowser for help on using the repository browser.