Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33501

Last change on this file since 33501 was 33501, checked in by ak19, 5 years ago
Refactored code into 2 classes: The existing WETProcessor, which processes a single WETFile which can contain a large number of WET records. And the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. Refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.
File size: 12.7 KB

Line
1	package org.greenstone.atea;
2
3
4	import java.io.*;
5	import java.util.Properties;
6	import java.util.zip.GZIPInputStream;
7	import java.util.Iterator;
8	import java.util.Set;
9	import java.util.TreeSet;
10
11	import org.apache.log4j.Logger;
12
13	/**
14	* The main() method of this class takes a folder of warc.wet(.gz) files and goes through
15	* the WET records in each, putting each WET record into a file. Each file is put into a
16	* keep or discard folder, based on content-length and number of lines.
17	* A single instance of the WETProcessor class processes a single unzipped warc.wet file.
18	*
19	* To compile, including the jars in lib/ for compiling.
20	* maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
21	*
22	* To run, passing the log4j and other properties files in conf/ folder:
23	* maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
24	*
25	* e.g.
26	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
27	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 \| less
28	*
29	*/
30
31	public class CCWETProcessor {
32	private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
33
34	// Properties shared across WETProcessor instances
35	public final int MAX_WORD_LENGTH;
36	public final int MIN_NUM_WORDS;
37	public final int MAX_WORDS_CAMELCASE;
38
39	private Properties configProperties = new Properties();
40
41	// File paths shared across WETProcessor instances
42	public final File WETFilesDir;
43	public final File outputFolder;
44	public final File discardFolder;
45	public final File keepFolder;
46	public final File keepURLsFile;
47	public final File discardURLsFile;
48
49	// Keep a count of all the records that all WETProcessors instantiated
50	// by our main method combined have processed
51	private int totalRecordCount = 0;
52
53	private int wetFileCount = 0;
54
55	public CCWETProcessor(File inFolder, File outFolder) {
56	this.WETFilesDir = inFolder;
57	this.outputFolder = outFolder;
58
59	// load up the properties from the config file
60	try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
61	configProperties = new Properties();
62	configProperties.load(infile);
63	//infile.close();
64
65	} catch(Exception e) {
66	System.err.println("Exception attempting to read properties from config.properties.");
67	logger.error("Exception attempting to read properties from config.properties.");
68	e.printStackTrace();
69	}
70
71	if(configProperties.size() == 0) {
72	System.err.println("*** Warning: no values read into config properties. Using defaults.");
73	}
74
75	MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
76	MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
77	MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
78
79
80	this.discardFolder = new File(outFolder, "discard");
81	if(!discardFolder.exists()) {
82	discardFolder.mkdir();
83	}
84	this.keepFolder = new File(outFolder, "keep");
85	if(!keepFolder.exists()) {
86	keepFolder.mkdir();
87	}
88
89	this.keepURLsFile = new File(outFolder, "keepURLs.txt");
90	if(keepURLsFile.exists() && !keepURLsFile.delete()) {
91	System.err.println("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
92	//return;
93	}
94	this.discardURLsFile = new File(outFolder, "discardURLs.txt");
95	if(discardURLsFile.exists() && !discardURLsFile.delete()) {
96	System.err.println("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
97	//return;
98	}
99
100	}
101
102	/**
103	* Takes as input the keepURLs.txt file generated by running WETProcessor instances.
104	* As output produces the URL seed list and regex-urlfilter text files required by nutch,
105	* https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
106	*/
107	public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
108	// Maintain Sets of unique domains and urls
109	// TreeSet: by default, "the elements are ordered using their natural ordering"
110	// (or by a Comparator provided at set creation time).
111	// Whereas HashSet doesn't guarantee ordering.
112	// So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
113
114	Set<String> domainsSet = new TreeSet<String>();
115	Set<String> urlsSet = new TreeSet<String>();
116
117	final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)"; // https?://([a-z0-9-]+\.)
118
119	try (
120	BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
121	) {
122
123	// read a URL at a time from urlsFile
124	String url = null;
125	String domain = null;
126	while((url = reader.readLine()) != null) { // readLine removes newline separator
127
128	// work out domain. This retains any www. or subdomain prefix:
129	int startIndex = url.indexOf("//"); // http:// or https:// prefix
130	startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
131	domain = url.substring(startIndex);
132	int endIndex = domain.indexOf("/");
133	if(endIndex == -1) endIndex = domain.length();
134	domain = domain.substring(0, endIndex);
135
136	//if(!domainsMap.containsKey(domain)) {
137	urlsSet.add(url);
138	domainsSet.add(domain);
139	//}
140	}
141	} catch (IOException ioe) {
142	ioe.printStackTrace();
143	System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
144	}
145
146	try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
147	Iterator<String> i = urlsSet.iterator();
148	while(i.hasNext()) {
149	String url = i.next();
150	seedURLsWriter.write(url + "\n");
151	}
152
153	} catch (IOException ioe) {
154	ioe.printStackTrace();
155	System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
156	}
157
158	try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
159	Iterator<String> i = domainsSet.iterator();
160	// nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
161	while(i.hasNext()) {
162	String domain = i.next();
163	domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
164	urlFilterWriter.write(domain + "\n");
165	}
166
167	} catch (IOException ioe) {
168	ioe.printStackTrace();
169	System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
170	}
171	}
172
173	/*
174	public boolean isBlacklisted(String url) {
175	return false;
176	}
177	*/
178
179	public boolean isGreylisted(String url) {
180	// alexa top sites and auto-translated product sites
181	return false;
182	}
183
184	/**
185	* Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
186	* whether it is in the discard list.
187	* Filters don't represent actual regex, just ^ and $ as start and end terminators.
188	* By not having this method deal with actual regex for filters, this has the advantage that
189	* we don't have to remember to escape or double escape each filter to turn it into a regex.
190	*/
191	//public boolean isInDiscardFilter(String url) {
192
193	public boolean isBlacklisted(String url) {
194	String discardFilterFile = "url-discard-filter.txt"; // in conf folder
195
196	try (
197	BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
198	) {
199	String filter = null;
200	while((filter = reader.readLine()) != null) {
201	if(filter.trim().equals("")) {
202	continue;
203	}
204	//System.err.println("Got filter: " + filter);
205	if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
206	System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
207	}
208	else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
209	System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
210	return true;
211	}
212	else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
213	System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
214	return true;
215	}
216	else if(url.contains(filter)) {
217	System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
218	return true;
219	}
220
221	}
222
223	} catch (IOException ioe) {
224	ioe.printStackTrace();
225	System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
226	}
227
228	return false;
229	}
230
231	/** Maintain a count of all WET files processed. */
232	public void setWETFileCount(int count) { this.wetFileCount = count; }
233
234	/** Maintain a count of all WET records processed. */
235	//public int getRecordCount() { return this.totalRecordCount; }
236	//public void addToRecordCount(int count) { this.totalRecordCount += count; }
237	public void setRecordCount(int count) { this.totalRecordCount = count; }
238
239	public static void printUsage() {
240	System.err.println("Run this program as:");
241	System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
242	}
243
244	/** Filename filter to only list warc.wet files or else warc.wet.gz files
245	* for which unzipped warc.wet equivalents don't yet exist.
246	*/
247	private static class WETFilenameFilter implements FilenameFilter {
248
249	public boolean accept(File dir, String name) {
250	if(name.endsWith(".warc.wet")) {
251	logger.debug("Will include " + name + " for processing.");
252	return true;
253	}
254
255	if(name.endsWith(".warc.wet.gz")) {
256	String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
257	File unzippedVersion = new File(dir, nameWithoutGZext);
258	if(unzippedVersion.exists()) {
259	logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
260	logger.debug("Skipping " + name);
261	return false; // don't count gzipped version if unzipped version exists.
262	}
263	else {
264	logger.debug("Only zipped version " + name + " exists.");
265	return true; // No unzipped version, so have to work with gzipped version
266	}
267	}
268
269	// we're not even interested in any other file extensions
270	logger.debug("Not a WET file. Skipping " + name);
271	return false;
272	}
273	}
274
275
276	public static void main(String[] args) {
277	if(args.length != 2) {
278	printUsage();
279	return;
280	}
281
282
283	File WETFileDir = new File(args[0]);
284	if(!WETFileDir.exists() \|\| !WETFileDir.isDirectory()) {
285	System.out.println("Error: " + args[0] + " does not exist or is not a directory");
286	return;
287	}
288
289	File outFolder = new File(args[1]);
290	if(!outFolder.exists() \|\| !outFolder.isDirectory()) {
291	System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
292	return;
293	}
294
295	CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
296
297	//ccWETFilesProcessor.processAllWETFiles();
298
299	// Will list all the warc.wet files in the input directory or else their gzipped versions
300	File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
301
302	int wetRecordCount = 0;
303	int wetFileCount = 0;
304
305	for(int i = 0; i < WETFiles.length; i++) {
306	File WETFile = WETFiles[i];
307	logger.debug("Processing WETfile: " + WETFile);
308
309	// Any .gz files listed means they haven't been unzipped yet. So unzip.
310	String WETFilename = WETFile.toString();
311	if(WETFilename.endsWith(".gz")) {
312	File GZippedWETFile = WETFile;
313	String WETGZippedFilename = WETFilename;
314	WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
315
316	WETFile = new File(WETFilename);
317	Utility.unzipFile(GZippedWETFile, WETFile);
318	}
319	// hereafter all WETFiles should refer to the unzipped version
320	// Check the unzipped WETFile exists
321
322	if(!WETFile.exists() \|\| !WETFile.isFile()) {
323	System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
324	logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
325	return;
326	}
327
328	// Finally, we can process this WETFile's records into the keep and discard pile
329	wetFileCount++;
330	logger.debug("Off to process " + WETFile);
331	WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
332	wetFileProcessor.processWETFile();
333	wetRecordCount += wetFileProcessor.getRecordCount();
334	}
335
336	// for information purposes
337	ccWETFilesProcessor.setWETFileCount(wetFileCount);
338	ccWETFilesProcessor.setRecordCount(wetRecordCount);
339
340	File seedURLsFile = new File(outFolder, "seedURLs.txt");
341	File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
342	ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
343
344	return;
345
346	}
347	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: