source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33518

Last change on this file since 33518 was 33518, checked in by ak19, 5 years ago

Intermediate commit: got the seed urls file temporarily written out as each domain followed by commoncrawl's urls within that domain. For the next commit, I will try splitting these into individual files per domain, each with its own regex-url txt file list restricted to just that site/domain, while returning the seed urls file's output to all urls, sorted.

File size: 20.2 KB
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeMap;
12import java.util.TreeSet;
13
14import org.apache.log4j.Logger;
15
16/**
17 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
18 * the WET records in each, putting each WET record into a file. Each file is put into a
19 * keep or discard or greyListed folder, and its url written into a keep, discard
20 * or greylisted text file, based on
21 *
22 * 1. whether it's whitelisted, else greylisted, else blacklisted
23 * 2. and, if explicitly whitelisted or else neither greylisted nor blacklisted, whether there's
24 * enough content. Formerly, content-length and number of lines were used to determine if
25 * the content was sufficient. Now it's just the word count, with a MAXIMUM number of characters
26 * (not a minimum) determining whether a string counts as a word. These settings can be adjusted
27 * in conf/config.properties.
28 *
29 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
30 * into the conf folder to control any url patterns that are explicitly included or excluded or
31 * set aside for inspecting later. These filter text files don't use regexes; instead their
32 * format is:
33 * - precede a URL with ^ to blacklist urls that match the given prefix
34 * - follow a URL with $ to blacklist urls that match the given suffix
35 * - ^url$ will blacklist urls that match the given url exactly
36 * - without either the ^ or $ symbol, urls containing the given string will get blacklisted
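 * For example (illustrative entries only, not taken from the actual conf files), a
 * url-blacklist-filter.txt could contain the following four lines, one per match rule above:
 *   ^http://example.com/ads/
 *   .pdf$
 *   ^http://example.com/home/index.html$
 *   doubleclick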
37 *
38 * WETProcessor.java's current implementation is that explicit whitelisting takes precedence
39 * over greylisting, which in turn takes precedence over blacklisting. However, even
40 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
41 * and in the seedURLs.txt file used for nutch, along with their domains in regex-urlfilter.txt,
42 * also for nutch.
43 *
44 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
45 * in the given input folder. Then use a single instance of the WETProcessor class to process
46 * each single unzipped warc.wet file.
47 *
48 * To compile, include the jars in lib/ on the classpath:
49 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
50 *
51 * To run, passing the log4j and other properties files in conf/ folder:
52 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
53 *
54 * e.g.
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
56 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
57 *
58*/
59
60public class CCWETProcessor {
61 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
62
63 // Properties shared across WETProcessor instances
64 public final int MAX_WORD_LENGTH;
65 public final int MIN_NUM_WORDS;
66 public final int MAX_WORDS_CAMELCASE;
67
68 private Properties configProperties = new Properties();
69
70 // File paths shared across WETProcessor instances
71 public final File WETFilesDir;
72 public final File outputFolder;
73 public final File discardFolder;
74 public final File keepFolder;
75 public final File greyListedFolder;
76 public final File keepURLsFile;
77 public final File discardURLsFile;
78 public final File greyListedFile;
79
80 /** Possible values stored in the blackList/whiteList/greyList Maps */
81    private final Integer LIST_ENTRY_CONTAINS = Integer.valueOf(0);
82    private final Integer LIST_ENTRY_STARTSWITH = Integer.valueOf(1);
83    private final Integer LIST_ENTRY_ENDSWITH = Integer.valueOf(2);
84    private final Integer LIST_ENTRY_MATCHES = Integer.valueOf(3);
85
86 /**
87 * Store url patterns as keys, with values indicating whether a url should
88 * match it exactly, start/end with it, or contain it
89 */
90 private HashMap<String, Integer> blackList;
91 private HashMap<String, Integer> greyList;
92 private HashMap<String, Integer> whiteList;
93
94 /** Map of domains we keep and the full urls we're keeping that are of that domain.
95 * There's no strict need for a TreeMap, which preserves the natural (alphabetical) ordering
96 * of its keys, over a HashMap, which has no notion of ordering, because we just need to store
97 * urls with their domains. Whether the domains are sorted or the urls per domain are sorted
98 * becomes irrelevant. (Does it really? What if we have urls followed vs preceded by urls with
99 * the same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html?
100 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?)
101 */
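    // e.g. (illustrative): "pinky.com" -> { "http://pinky.com/toto/index.html", "http://pinky.com/toto/nono/file.html" }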
102 private Map<String, Set<String>> domainsToURLsMap;
103
104    // Keep a count of all the records processed, combined across all the WETProcessor
105    // instances instantiated by our main method
106 private int totalRecordCount = 0;
107
108 private int wetFileCount = 0;
109
110 public CCWETProcessor(File inFolder, File outFolder) throws Exception {
111 this.WETFilesDir = inFolder;
112 this.outputFolder = outFolder;
113
114 // load up the properties from the config file
115 try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
116 configProperties = new Properties();
117 configProperties.load(infile);
118 //infile.close(); // not explicitly called in examples of try-with-resources
119
120 } catch(Exception e) {
121 System.err.println("Exception attempting to read properties from config.properties.");
122 logger.error("Exception attempting to read properties from config.properties.");
123 e.printStackTrace();
124 }
125
126 if(configProperties.size() == 0) {
127 System.err.println("*** Warning: no values read into config properties. Using defaults.");
128 }
129
130 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
131 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
132 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
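	// For reference, a conf/config.properties relying on the defaults above would contain:
	//   WETprocessor.max.word.length=15
	//   WETprocessor.min.num.words=20
	//   WETprocessor.max.words.camelcase=10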
133
134
135 this.discardFolder = new File(outFolder, "discard");
136 if(!discardFolder.exists()) {
137 discardFolder.mkdir();
138 }
139 this.keepFolder = new File(outFolder, "keep");
140 if(!keepFolder.exists()) {
141 keepFolder.mkdir();
142 }
143
144 this.greyListedFolder = new File(outFolder, "greylisted");
145 if(!greyListedFolder.exists()) {
146 greyListedFolder.mkdir();
147 }
148
149 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
150 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
151 throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
152 }
153 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
154 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
155	    throw new Exception("Warning: Unable to delete " + discardURLsFile + ". Unable to proceed.");
156 }
157 this.greyListedFile = new File(outFolder, "greyListed.txt");
158 if(greyListedFile.exists() && !greyListedFile.delete()) {
159	    throw new Exception("Warning: Unable to delete " + greyListedFile + ". Unable to proceed.");
160 }
161
162 // prepare our blacklist, greylist (for inspection) and whitelist
163 System.err.println("Loading blacklist.");
164 blackList = new HashMap<String, Integer>();
165 initURLFilterList(blackList, "url-blacklist-filter.txt");
166
167 System.err.println("Loading greylist.");
168 greyList = new HashMap<String, Integer>();
169 initURLFilterList(greyList, "url-greylist-filter.txt");
170
171 System.err.println("Loading whitelist.");
172 whiteList = new HashMap<String, Integer>();
173 initURLFilterList(whiteList, "url-whitelist-filter.txt");
174
175 //System.err.println("Prematurely terminating for testing purposes.");
176 //System.exit(-1);
177 }
178
179 /**
180 * Using the keepURLs.txt file generated by running WETProcessor instances, produces
181 * as output the URL seed list and regex-urlfilter text files required by nutch, see
182 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
183 */
184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
185 // Maintain Sets of unique domains and urls
186 // TreeSet: by default, "the elements are ordered using their natural ordering"
187 // (or by a Comparator provided at set creation time).
188 // Whereas HashSet doesn't guarantee ordering.
189 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
190
191 //Set<String> domainsSet = new TreeSet<String>();
192 //Set<String> urlsSet = new TreeSet<String>();
193 domainsToURLsMap = new TreeMap<String, Set<String>>();
194
195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
196
197 try (
198 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
199 ) {
200
201 // read a URL at a time from urlsFile
202 String url = null;
203 String domain = null;
204 while((url = reader.readLine()) != null) { // readLine removes newline separator
205
206 // work out domain. This retains any www. or subdomain prefix:
207 int startIndex = url.indexOf("//"); // http:// or https:// prefix
208 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
209 domain = url.substring(startIndex);
210 int endIndex = domain.indexOf("/");
211 if(endIndex == -1) endIndex = domain.length();
212 domain = domain.substring(0, endIndex);
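		// e.g. (illustrative): "https://www.example.com/path/page.html" gives the domain "www.example.com"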
213
214 //urlsSet.add(url);
215 //domainsSet.add(domain);
216 Set<String> urlsSet;
217 if(!domainsToURLsMap.containsKey(domain)) {
218 urlsSet = new TreeSet<String>();
219 urlsSet.add(url);
220 domainsToURLsMap.put(domain, urlsSet);
221 } else {
222 urlsSet = domainsToURLsMap.get(domain);
223 urlsSet.add(url);
224 }
225
226 }
227 } catch (IOException ioe) {
228 ioe.printStackTrace();
229 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
230 }
231
232 /*
233 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
234 Iterator<String> i = urlsSet.iterator();
235 while(i.hasNext()) {
236 String url = i.next();
237 seedURLsWriter.write(url + "\n");
238 }
239
240 } catch (IOException ioe) {
241 ioe.printStackTrace();
242 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
243 }
244 */
245
246 // write out each domain followed in sequence by all urls we found in that domain
247 // (urls with tab up front)
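	// e.g. (illustrative, reusing the pinky.com example above):
	//   pinky.com
	//   \thttp://pinky.com/toto/index.html
	//   \thttp://pinky.com/toto/nono/file.html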
248 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
249 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
250 Set<String> domainsSet = domainsToURLsMap.keySet();
251 Iterator<String> domainIterator = domainsSet.iterator();
252
253 while(domainIterator.hasNext()) {
254 // write out the domain
255 String domain = domainIterator.next();
256 seedURLsWriter.write(domain + "\n");
257
258 // next write out the urls for the domain with a tab prefixed to each
259 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
260 Iterator<String> urlIterator = urlsForDomainSet.iterator();
261 while(urlIterator.hasNext()) {
262 String url = urlIterator.next();
263 seedURLsWriter.write("\t" + url + "\n");
264 }
265 }
266
267 } catch (IOException ioe) {
268 ioe.printStackTrace();
269	    System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
270 }
271
272 // write out domains as regular expressions into "regex-urlfilter.txt" file
273 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
274 Set<String> domainsSet = domainsToURLsMap.keySet();
275 Iterator<String> i = domainsSet.iterator();
276 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
277 while(i.hasNext()) {
278 String domain = i.next();
279 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
280 urlFilterWriter.write(domain + "\n");
281 }
282
283 } catch (IOException ioe) {
284 ioe.printStackTrace();
285 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
286 }
287 }
288
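    // For example (illustrative): if a filter map contains the pattern "example.com" stored
    // with rule LIST_ENTRY_CONTAINS, then a url such as "http://www.example.com/page.html"
    // is considered listed by the method below.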
289 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
290 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
291 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
292 while(i.hasNext()) {
293 Map.Entry<String, Integer> entry = i.next();
294 String urlPattern = entry.getKey();
295 Integer matchRule = entry.getValue();
296
297	    if(matchRule.equals(LIST_ENTRY_CONTAINS) && url.contains(urlPattern)) {
298		return true;
299	    }
300	    else if(matchRule.equals(LIST_ENTRY_STARTSWITH) && url.startsWith(urlPattern)) {
301		return true;
302	    }
303	    else if(matchRule.equals(LIST_ENTRY_ENDSWITH) && url.endsWith(urlPattern)) {
304		return true;
305	    }
306	    else if(matchRule.equals(LIST_ENTRY_MATCHES) && url.equals(urlPattern)) {
307		return true;
308	    }
309 // else check the rest of the filter list against this url
310 // before returning false to be certain it's not been listed in the filter list
311 }
312
313 return false;
314 }
315
316 /**
317 * Returns true if the url or pattern is found in the blacklist file.
318 * Note that if the same url pattern is eventually found in the greylist or whitelist too,
319 * it won't get blacklisted after all. But that precedence is not implemented here.
320 */
321 public boolean isBlacklisted(String url) {
322 return isListedInFilterList(blackList, url);
323 }
324
325 /**
326 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
327 * Will eventually take precedence over a mention of the same URL pattern in the blacklist.
328 * Will eventually be overridden by the whitelist if the pattern is also mentioned there.
329 */
330 public boolean isGreylisted(String url) {
331 // TODO: alexa top sites and auto-translated product sites
332 return isListedInFilterList(greyList, url);
333 }
334
335 /**
336 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
337 * Its mention in the whitelist overrides any mention in the blacklist and greylist.
338 */
339 public boolean isWhitelisted(String url) {
340 return isListedInFilterList(whiteList, url);
341 }
342
343 /**
344 * Reads each line ("filter") of the given filter file in conf/ (e.g. url-blacklist-filter.txt)
345 * and adds it to the given map together with its match rule.
346 * Filters aren't actual regexes; they just use ^ and $ as start and end terminators.
347 * By not having this method deal with actual regexes for filters, we have the advantage that
348 * we don't have to remember to escape or double escape each filter to turn it into a regex.
349 */
350 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
351
352 // if filterListFilename does not exist in the conf folder, just return
353 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
354 System.err.println(filterListFilename + " does not exist");
355 return;
356 }
357
358 try (
359 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
360 ) {
361 String filter = null;
362 while((filter = reader.readLine()) != null) {
363 // skip comments and empty lines
364 filter = filter.trim();
365 if(filter.equals("") || filter.startsWith("#")) {
366 continue;
367 }
368
369 if(filter.startsWith("^") && filter.endsWith("$")) {
370 filter = filter.substring(1, filter.length()-1);
371 list.put(filter, LIST_ENTRY_MATCHES);
372 }
373 else if(filter.startsWith("^")) {
374 filter = filter.substring(1);
375 list.put(filter, LIST_ENTRY_STARTSWITH);
376 System.err.println("Match filter startswith: " + filter);
377 }
378 else if(filter.endsWith("$")) {
379 filter = filter.substring(0, filter.length()-1);
380 list.put(filter, LIST_ENTRY_ENDSWITH);
381 }
382 else {
383 list.put(filter, LIST_ENTRY_CONTAINS);
384 }
385 //System.err.println("Got filter: " + filter);
386 }
387
388 } catch (IOException ioe) {
389 ioe.printStackTrace();
390 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
391 }
392
393 }
394
395 /** Maintain a count of all WET files processed. */
396 public void setWETFileCount(int count) { this.wetFileCount = count; }
397
398 /** Maintain a count of all WET records processed. */
399 //public int getRecordCount() { return this.totalRecordCount; }
400 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
401 public void setRecordCount(int count) { this.totalRecordCount = count; }
402
403 public static void printUsage() {
404 System.err.println("Run this program as:");
405	System.err.println("\tCCWETProcessor <folder containing warc.wet(.gz) files> <output folder path>");
406 }
407
408 /** Filename filter to only list warc.wet files or else warc.wet.gz files
409 * for which unzipped warc.wet equivalents don't yet exist.
410 */
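    // e.g. (hypothetical filenames): if both "x.warc.wet" and "x.warc.wet.gz" are present,
    // only "x.warc.wet" is accepted; if only "x.warc.wet.gz" is present, it is accepted and
    // gets unzipped later in main().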
411 private static class WETFilenameFilter implements FilenameFilter {
412
413 public boolean accept(File dir, String name) {
414 if(name.endsWith(".warc.wet")) {
415 logger.debug("Will include " + name + " for processing.");
416 return true;
417 }
418
419 if(name.endsWith(".warc.wet.gz")) {
420 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
421 File unzippedVersion = new File(dir, nameWithoutGZext);
422 if(unzippedVersion.exists()) {
423 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
424 logger.debug("Skipping " + name);
425 return false; // don't count gzipped version if unzipped version exists.
426 }
427 else {
428 logger.debug("Only zipped version " + name + " exists.");
429 return true; // No unzipped version, so have to work with gzipped version
430 }
431 }
432
433 // we're not even interested in any other file extensions
434 logger.debug("Not a WET file. Skipping " + name);
435 return false;
436 }
437 }
438
439
440 public static void main(String[] args) {
441 if(args.length != 2) {
442 printUsage();
443 return;
444 }
445
446
447 File WETFileDir = new File(args[0]);
448 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
449 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
450 return;
451 }
452
453 File outFolder = new File(args[1]);
454 if(!outFolder.exists() || !outFolder.isDirectory()) {
455 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
456 return;
457 }
458
459 try {
460 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
461
462 //ccWETFilesProcessor.processAllWETFiles();
463
464 // Will list all the warc.wet files in the input directory or else their gzipped versions
465 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
466
467 int wetRecordCount = 0;
468 int wetFileCount = 0;
469
470 for(int i = 0; i < WETFiles.length; i++) {
471 File WETFile = WETFiles[i];
472 logger.debug("Processing WETfile: " + WETFile);
473
474		// Any .gz file listed means it hasn't been unzipped yet, so unzip it.
475 String WETFilename = WETFile.toString();
476 if(WETFilename.endsWith(".gz")) {
477 File GZippedWETFile = WETFile;
478 String WETGZippedFilename = WETFilename;
479 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
480
481 WETFile = new File(WETFilename);
482 Utility.unzipFile(GZippedWETFile, WETFile);
483 }
484 // hereafter all WETFiles should refer to the unzipped version
485 // Check the unzipped WETFile exists
486
487 if(!WETFile.exists() || !WETFile.isFile()) {
488 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
489 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
490 return;
491 }
492
493 // Finally, we can process this WETFile's records into the keep and discard pile
494 wetFileCount++;
495 logger.debug("Off to process " + WETFile);
496 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
497 wetFileProcessor.processWETFile();
498 wetRecordCount += wetFileProcessor.getRecordCount();
499 }
500
501 // for information purposes
502 ccWETFilesProcessor.setWETFileCount(wetFileCount);
503 ccWETFilesProcessor.setRecordCount(wetRecordCount);
504
505 File seedURLsFile = new File(outFolder, "seedURLs.txt");
506 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
507 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
508
509 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
510
511 } catch(Exception e) {
512 // can get an exception when instantiating CCWETProcessor instance
513 e.printStackTrace();
514 System.err.println(e.getMessage());
515 }
516
517 return;
518
519 }
520}
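
/* A minimal sketch (an assumption for illustration, not the actual implementation of the
   separate Utility class) of what a gzip-unzipping helper like Utility.unzipFile(File, File),
   as called from main() above, might look like, using the GZIPInputStream imported at the top:

   public static void unzipFile(File inGZipFile, File outFile) throws IOException {
       try (GZIPInputStream gzIn = new GZIPInputStream(new FileInputStream(inGZipFile));
            FileOutputStream out = new FileOutputStream(outFile)) {
           byte[] buffer = new byte[4096];
           int len;
           // copy the decompressed bytes across into the output file
           while((len = gzIn.read(buffer)) != -1) {
               out.write(buffer, 0, len);
           }
       }
   }
*/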