source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33497

Last change on this file since 33497 was 33497, checked in by ak19, 5 years ago

First version of discard url filter file. Inefficient implementation. Better to read the file once, adjust the filters as required and keep in memory to do the comparisons with each URL.

File size: 21.8 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.Set;
9import java.util.TreeSet;
10
11import org.apache.log4j.Logger;
12
13/**
14 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
15 * the WET records in each, putting each WET record into a file. Each file is put into a
16 * keep or discard folder, based on content-length and number of lines.
17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
18 *
19 * To compile, including the jars in lib/ for compiling.
20 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
21 *
22 * To run, passing the log4j and other properties files in conf/ folder:
23 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
24 *
25 * e.g.
26 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
27 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
28 *
29*/
30public class WETProcessor {
31 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
32 private static Properties configProperties = new Properties();
33
34 // In Java, can initialize static final variables inside a static block
35 // But the unavoidable try/catch in this static block prevents initialization of
36 // the static final int variables (seen further below) inside the block itself,
37 // that therefore need to be declared and initialized thereafter.
38 static {
39 // load up the properties from the config file
40 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
41 configProperties = new Properties();
42 configProperties.load(infile);
43 //infile.close();
44
45 } catch(Exception e) {
46 System.err.println("Exception attempting to read properties from config.properties.");
47 logger.error("Exception attempting to read properties from config.properties.");
48 e.printStackTrace();
49 }
50 }
51
52 // Providing fall-back cuttoff values if config.properties doesn't load
53 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
54 /*
55 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
56 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
57 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
58 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
59 */
60 private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
61 private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
62 private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
63
64 // File paths shared across WETProcessor instances
65 private static File discardFolder;
66 private static File keepFolder;
67 private static File keepURLsFile;
68 private static File discardURLsFile;
69
70 // WARC WET header lines and header line prefixes of interest
71 static final String WARC_RECORD_START = "WARC/1.0";
72 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
73 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
74 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
75
76 // Keep a count of all the records that all WETProcessors instantiated
77 // by our main method combined have processed
78 //private static int recordCount = 0;
79
80 private final File outputFolder;
81 private final String WETFileID;
82
83
84 /**
85 * WET processor processes a single warc.wet file containing multiple WET records
86 * containing text identified as primary langcode=mri. Each individual WET record is written
87 * out to a uniquely named file in either the keep or discard folder depending on the WET
88 * record's content length and number of lines of actual content (excluding WARC headers).
89 */
90 public WETProcessor(File inFile, File outFolder) {
91 this.outputFolder = outFolder;
92
93 StringBuilder record = null;
94 String line = null;
95 boolean readingRecord = false;
96
97 String WARCtargetURI = "";
98
99 int recordCount = 0;
100
101 int contentLength = -1; // of record
102 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
103
104 // We just want a unique recordID prefix, which we get from the wet file name suffix:
105 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
106 // the prefix will be everything after the last hyphen and without file extension,
107 // so "000000" in our example. Then suffix the recordCount (keeping track of the current
108 // WET record) to get a unique filename to store each WET record into.
109
110 String fileID = inFile.getName();
111 fileID = fileID.substring(fileID.lastIndexOf("-")+1);
112 fileID = fileID.substring(0, fileID.indexOf("."));
113 this.WETFileID = fileID;
114
115
116 // read from WETfile
117 try (
118 BufferedReader reader = new BufferedReader(new FileReader(inFile));
119 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
120 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
121 ) {
122
123 while((line = reader.readLine()) != null) { // readLine removes newline separator
124
125 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
126 readingRecord = false;
127 record = null; // drop this record, which is just an info record not actual web page's text
128 recordCount--;
129 continue;
130 }
131
132 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
133 // process any previous record
134 if(record != null) {
135 processWETrecord(keepURLsWriter, discardURLsWriter,
136 recordCount, contentLength, lineCount,
137 WARCtargetURI, record.toString());
138 record = null;
139 contentLength = -1;
140 lineCount = -1;
141 }
142
143 recordCount++;
144 // get ready to start a new record
145 readingRecord = true;
146 record = new StringBuilder();
147 }
148
149 if(readingRecord) { // append current line to current record
150
151 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
152 // get and store the value
153 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
154 }
155
156 record.append(line + "\n"); // add back (unix style) line ending
157
158 // if the line is non-empty
159 // AND if we've started counting lines, which happens only when the current
160 // line is past WARC/WET headers and we're into the actual body portion
161 // of the WET record,
162 // start incrementing the line counter.
163 if(lineCount >= 0 && !line.trim().equals("")) {
164 lineCount++;
165 }
166 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
167 String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
168 contentLength = Integer.parseInt(contentLengthStr);
169 lineCount = 0;
170 }
171
172 }
173
174 }
175
176 // flush the last record. If it was a warcinfo record, record would be null here
177 if(record != null) {
178 processWETrecord(keepURLsWriter, discardURLsWriter,
179 recordCount, contentLength, lineCount,
180 WARCtargetURI, record.toString());
181 record = null;
182 }
183
184 } catch(IOException ioe) {
185 ioe.printStackTrace();
186 }
187 }
188
189 /**
190 * Determines if a WET record belongs in the keep or discard pile depending on if it
191 * contains enough text, based on contentLength and line count of the record body.
192 * Then writes out the WET record to a uniquely named file in the keep or discard folder,
193 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
194 */
195 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
196 int recordID, int contentLength, int lineCount,
197 String recordURI, String record)
198 {
199 System.err.println("WET #" + this.WETFileID + " record #" + recordID
200 + " - contentLength: " + contentLength
201 + " - lineCount: " + lineCount);
202 System.err.println("URI: " + recordURI);
203 //System.err.println(record);
204 //System.err.println("--------------------------");
205
206 String paddedFileName = String.format("%04d.txt", recordID);
207
208 File parentFolder = null;
209
210 // want to match "product(s)" but not "production"
211 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
212
213
214 /*
215 if(recordURI.contains("product") && !recordURI.contains("production")) {
216
217 // don't want a "translated" product site/online store
218 // These curiously often tend to have "product(s)" in the URL
219 parentFolder = WETProcessor.discardFolder;
220 }
221
222 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
223 parentFolder = WETProcessor.keepFolder;
224 System.err.println("@@@KEEPING");
225 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
226 int countSpaces = 0;
227 for(int i = 0; i < record.length(); i++) {
228 if(record.charAt(i) == ' ') countSpaces++;
229 }
230 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
231 // So we have at least 500 chars (possibly on a single wrapped line)
232 // containing at least 10 spaces. Such a record is also worth keeping.
233 parentFolder = WETProcessor.keepFolder;
234 }
235 }
236 */
237
238 if(isInDiscardFilter(recordURI)) {
239 parentFolder = WETProcessor.discardFolder;
240 }
241 else if(isInCheckFilter(recordURI)) { // products sites
242 parentFolder = WETProcessor.discardFolder; // TODO: checkfolder
243 } else {
244 // If a web page's WET record contains a certain minimum number of words,
245 // we will think it's a meaningful web page and has sufficient content for text analysis
246 // to have been successful. Cut off values at present are:
247 // - a minimum of 20 words
248 // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
249 // to words having been glued together. This is used by irrelevant sites and moreover
250 // can't be analysed for language, so may not be actually MRI.
251
252 // Though StringTokenizer still in use, as seen in discussion at
253 // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
254 // String.split(regex) seems better for splitting on general whitespace
255 String[] allWords = record.split("\\s");
256 int validWordCount = 0;
257 int numCamelCaseWords = 0;
258 for(int i = 0; i < allWords.length; i++) {
259 String word = allWords[i];
260
261 // throw away if n words contain camelcase, which is another case of words glued together
262 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
263 numCamelCaseWords++;
264 }
265
266 // In Maori, word length of 1 is not uncommon
267 // but let's skip camelcased words when counting valid words
268 else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
269 }
270
271 // dump if too many camelcase words (ideally keep none of that kind?)
272 if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
273 parentFolder = WETProcessor.discardFolder;
274 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
275 }
276 else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
277 parentFolder = WETProcessor.keepFolder;
278 System.err.println("@@@KEEPING");
279 }
280 }
281 // if parentFolder still not set, set to discard pile folder
282 if(parentFolder == null) {
283 parentFolder = WETProcessor.discardFolder;
284 System.err.println("@@@DISCARDING");
285 }
286
287 try {
288 if (parentFolder == WETProcessor.keepFolder) {
289 keepURLsWriter.write(recordURI + "\n");
290 } else {
291 discardURLsWriter.write(recordURI + "\n");
292 }
293 } catch(Exception e) {
294 System.err.println("Unable to write URL");
295 e.printStackTrace();
296 }
297
298 System.err.println("--------------------------");
299
300 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
301
302 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
303 writer.write(record);
304 // Try-with-resources examples don't call close() explicitly:
305 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
306 //writer.close();
307 } catch(IOException ioe) {
308 ioe.printStackTrace();
309 System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
310 }
311 }
312
313
314 /**
315 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
316 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
317 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
318 */
319 public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
320 // Maintain Sets of unique domains and urls
321 // TreeSet: by default, "the elements are ordered using their natural ordering"
322 // (or by a Comparator provided at set creation time).
323 // Whereas HashSet doesn't guarantee ordering.
324 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
325
326 Set<String> domainsSet = new TreeSet<String>();
327 Set<String> urlsSet = new TreeSet<String>();
328
329 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
330
331 try (
332 BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
333 ) {
334
335 // read a URL at a time from urlsFile
336 String url = null;
337 String domain = null;
338 while((url = reader.readLine()) != null) { // readLine removes newline separator
339
340 // work out domain. This retains any www. or subdomain prefix:
341 int startIndex = url.indexOf("//"); // http:// or https:// prefix
342 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
343 domain = url.substring(startIndex);
344 int endIndex = domain.indexOf("/");
345 if(endIndex == -1) endIndex = domain.length();
346 domain = domain.substring(0, endIndex);
347
348 //if(!domainsMap.containsKey(domain)) {
349 urlsSet.add(url);
350 domainsSet.add(domain);
351 //}
352 }
353 } catch (IOException ioe) {
354 ioe.printStackTrace();
355 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
356 }
357
358 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
359 Iterator<String> i = urlsSet.iterator();
360 while(i.hasNext()) {
361 String url = i.next();
362 seedURLsWriter.write(url + "\n");
363 }
364
365 } catch (IOException ioe) {
366 ioe.printStackTrace();
367 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
368 }
369
370 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
371 Iterator<String> i = domainsSet.iterator();
372 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
373 while(i.hasNext()) {
374 String domain = i.next();
375 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
376 urlFilterWriter.write(domain + "\n");
377 }
378
379 } catch (IOException ioe) {
380 ioe.printStackTrace();
381 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
382 }
383 }
384
385 /**
386 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
387 * whether it is in the discard list.
388 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
389 * By not having this method deal with actual regex for filters, this has the advantage that
390 * we don't have to remember to escape or double escape each filter to turn it into a regex.
391 */
392 public boolean isInDiscardFilter(String url) {
393
394 String discardFilterFile = "url-discard-filter.txt"; // in conf folder
395
396 try (
397 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
398 ) {
399 String filter = null;
400 while((filter = reader.readLine()) != null) {
401 if(filter.trim().equals("")) {
402 continue;
403 }
404 //System.err.println("Got filter: " + filter);
405 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
406 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
407 }
408 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
409 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
410 return true;
411 }
412 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
413 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
414 return true;
415 }
416 else if(url.contains(filter)) {
417 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
418 return true;
419 }
420
421 }
422
423 } catch (IOException ioe) {
424 ioe.printStackTrace();
425 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
426 }
427
428 return false;
429 }
430
    /**
     * Stub: intended to decide whether a URL needs a closer look (e.g. product/online-store
     * sites, per the caller's comment) rather than an outright keep/discard decision.
     * Currently always returns false, so nothing is routed through this filter yet.
     * TODO: implement, presumably against a check-filter file mirroring isInDiscardFilter().
     */
    public boolean isInCheckFilter(String url) {
        //System.err.println("isInCheckFilter(url) is not yet implemented");
        return false;
    }
436
437 //public static int getRecordCount() { return recordCount; }
438
439 public static void printUsage() {
440 System.err.println("Run this program as:");
441 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
442 }
443
444 /** Filename filter to only list warc.wet files or else warc.wet.gz files
445 * for which unzipped warc.wet equivalents don't yet exist.
446 */
447 private static class WETFilenameFilter implements FilenameFilter {
448
449 public boolean accept(File dir, String name) {
450 if(name.endsWith(".warc.wet")) {
451 logger.debug("Will include " + name + " for processing.");
452 return true;
453 }
454
455 if(name.endsWith(".warc.wet.gz")) {
456 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
457 File unzippedVersion = new File(dir, nameWithoutGZext);
458 if(unzippedVersion.exists()) {
459 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
460 logger.debug("Skipping " + name);
461 return false; // don't count gzipped version if unzipped version exists.
462 }
463 else {
464 logger.debug("Only zipped version " + name + " exists.");
465 return true; // No unzipped version, so have to work with gzipped version
466 }
467 }
468
469 // we're not even interested in any other file extensions
470 logger.debug("Not a WET file. Skipping " + name);
471 return false;
472 }
473 }
474
475 public static void main(String[] args) {
476 if(args.length != 2) {
477 printUsage();
478 return;
479 }
480
481
482 File WETFileDir = new File(args[0]);
483 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
484 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
485 return;
486 }
487
488 File outFolder = new File(args[1]);
489 if(!outFolder.exists() || !outFolder.isDirectory()) {
490 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
491 return;
492 }
493
494 // static folders and files to be shared across all WETProcessor instances
495 WETProcessor.discardFolder = new File(outFolder, "discard");
496 if(!WETProcessor.discardFolder.exists()) {
497 WETProcessor.discardFolder.mkdir();
498 }
499 WETProcessor.keepFolder = new File(outFolder, "keep");
500 if(!WETProcessor.keepFolder.exists()) {
501 WETProcessor.keepFolder.mkdir();
502 }
503
504 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
505 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
506 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
507 return;
508 }
509 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
510 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
511 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
512 return;
513 }
514
515 // Will list all the warc.wet files in the input directory or else their gzipped versions
516 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
517
518 for(int i = 0; i < WETFiles.length; i++) {
519 File WETFile = WETFiles[i];
520 logger.debug("Processing WETfile: " + WETFile);
521
522 // Any .gz files listed means they haven't been unzipped yet. So unzip.
523 String WETFilename = WETFile.toString();
524 if(WETFilename.endsWith(".gz")) {
525 File GZippedWETFile = WETFile;
526 String WETGZippedFilename = WETFilename;
527 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
528
529 WETFile = new File(WETFilename);
530 Utility.unzipFile(GZippedWETFile, WETFile);
531 }
532 // hereafter all WETFiles should refer to the unzipped version
533 // Check the unzipped WETFile exists
534
535 if(!WETFile.exists() || !WETFile.isFile()) {
536 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
537 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
538 return;
539 }
540
541 // Finally, we can process this WETFile's records into the keep and discard pile
542 logger.debug("Off to process " + WETFile);
543 WETProcessor processor = new WETProcessor(WETFile, outFolder);
544
545 }
546
547 File seedURLsFile = new File(outFolder, "seedURLs.txt");
548 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
549 WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
550
551 return;
552 }
553}
Note: See TracBrowser for help on using the repository browser.