Context Navigation

WETProcessor.java@ 33480

Last change on this file since 33480 was 33480, checked in by ak19, 5 years ago

It is much harder to remove pages where words are fused together: some fused words are shorter than the maximum valid word length of 15 chars and some are longer, and the number of valid words can still come to more than the required minimum of 20. The next solution tried was to ignore pages that had more than 2 instances of camelcase, but valid pages (actual Maori language pages) may end up with a few more camelcased words if navigation items get fused together. Not sure what to do.

File size: 16.7 KB

Line
1	package org.greenstone.atea;
2
3
4	import java.io.*;
5	import java.util.Properties;
6	import java.util.zip.GZIPInputStream;
7
8	import org.apache.log4j.Logger;
9
10	/**
11	* The main() method of this class takes a folder of warc.wet(.gz) files and goes through
12	* the WET records in each, putting each WET record into a file. Each file is put into a
13	* keep or discard folder, based on content-length and number of lines.
14	* A single instance of the WETProcessor class processes a single unzipped warc.wet file.
15	*
16	* To compile, including the jars in lib/ for compiling.
17	* maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
18	*
19	* To run, passing the log4j and other properties files in conf/ folder:
20	* maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
21	*
22	* e.g.
23	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
24	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 \| less
25	*
26	*/
27	public class WETProcessor {
28	private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
29	private static Properties configProperties = new Properties();
30
31	// In Java, can initialize static final variables inside a static block
32	// But the unavoidable try/catch in this static block prevents initialization of
33	// the static final int variables (seen further below) inside the block itself,
34	// that therefore need to be declared and initialized thereafter.
35	static {
36	// load up the properties from the config file
37	try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
38	configProperties = new Properties();
39	configProperties.load(infile);
40	//infile.close();
41
42	} catch(Exception e) {
43	System.err.println("Exception attempting to read properties from config.properties.");
44	logger.error("Exception attempting to read properties from config.properties.");
45	e.printStackTrace();
46	}
47	}
48
49	// Providing fall-back cuttoff values if config.properties doesn't load
50	// or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
51	/*
52	private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
53	private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
54	private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
55	private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
56	*/
57	private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
58	private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
59	private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
60
61	// File paths shared across WETProcessor instances
62	private static File discardFolder;
63	private static File keepFolder;
64	private static File keepURLsFile;
65	private static File discardURLsFile;
66
67	// WARC WET header lines and header line prefixes of interest
68	static final String WARC_RECORD_START = "WARC/1.0";
69	static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
70	static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
71	static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
72
73	// Keep a count of all the records that all WETProcessors instantiated
74	// by our main method combined have processed
75	//private static int recordCount = 0;
76
77	private final File outputFolder;
78	private final String WETFileID;
79
80
81	/**
82	* WET processor processes a single warc.wet file containing multiple WET records
83	* containing text identified as primary langcode=mri. Each individual WET record is written
84	* out to a uniquely named file in either the keep or discard folder depending on the WET
85	* record's content length and number of lines of actual content (excluding WARC headers).
86	*/
87	public WETProcessor(File inFile, File outFolder) {
88	this.outputFolder = outFolder;
89
90	StringBuilder record = null;
91	String line = null;
92	boolean readingRecord = false;
93
94	String WARCtargetURI = "";
95
96	int recordCount = 0;
97
98	int contentLength = -1; // of record
99	int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
100
101	// We just want a unique recordID prefix, which we get from the wet file name suffix:
102	// inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
103	// the prefix will be everything after the last hyphen and without file extension,
104	// so "000000" in our example. Then suffix the recordCount (keeping track of the current
105	// WET record) to get a unique filename to store each WET record into.
106
107	String fileID = inFile.getName();
108	fileID = fileID.substring(fileID.lastIndexOf("-")+1);
109	fileID = fileID.substring(0, fileID.indexOf("."));
110	this.WETFileID = fileID;
111
112
113	// read from WETfile
114	try (
115	BufferedReader reader = new BufferedReader(new FileReader(inFile));
116	BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
117	BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
118	) {
119
120	while((line = reader.readLine()) != null) { // readLine removes newline separator
121
122	if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
123	readingRecord = false;
124	record = null; // drop this record, which is just an info record not actual web page's text
125	recordCount--;
126	continue;
127	}
128
129	if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
130	// process any previous record
131	if(record != null) {
132	processWETrecord(keepURLsWriter, discardURLsWriter,
133	recordCount, contentLength, lineCount,
134	WARCtargetURI, record.toString());
135	record = null;
136	contentLength = -1;
137	lineCount = -1;
138	}
139
140	recordCount++;
141	// get ready to start a new record
142	readingRecord = true;
143	record = new StringBuilder();
144	}
145
146	if(readingRecord) { // append current line to current record
147
148	if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
149	// get and store the value
150	WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
151	}
152
153	record.append(line + "\n"); // add back (unix style) line ending
154
155	// if the line is non-empty
156	// AND if we've started counting lines, which happens only when the current
157	// line is past WARC/WET headers and we're into the actual body portion
158	// of the WET record,
159	// start incrementing the line counter.
160	if(lineCount >= 0 && !line.trim().equals("")) {
161	lineCount++;
162	}
163	else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
164	String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
165	contentLength = Integer.parseInt(contentLengthStr);
166	lineCount = 0;
167	}
168
169	}
170
171	}
172
173	// flush the last record. If it was a warcinfo record, record would be null here
174	if(record != null) {
175	processWETrecord(keepURLsWriter, discardURLsWriter,
176	recordCount, contentLength, lineCount,
177	WARCtargetURI, record.toString());
178	record = null;
179	}
180
181	} catch(IOException ioe) {
182	ioe.printStackTrace();
183	}
184	}
185
186	/**
187	* Determines if a WET record belongs in the keep or discard pile depending on if it
188	* contains enough text, based on contentLength and line count of the record body.
189	* Then writes out the WET record to a uniquely named file in the keep or discard folder,
190	* and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
191	*/
192	private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
193	int recordID, int contentLength, int lineCount,
194	String recordURI, String record)
195	{
196	System.err.println("WET #" + this.WETFileID + " record #" + recordID
197	+ " - contentLength: " + contentLength
198	+ " - lineCount: " + lineCount);
199	System.err.println("URI: " + recordURI);
200	//System.err.println(record);
201	//System.err.println("--------------------------");
202
203	String paddedFileName = String.format("%04d.txt", recordID);
204
205	File parentFolder = null;
206
207	// want to match "product(s)" but not "production"
208	//if(recordURI.matches("./?product[^a-rt-z].")) {//if(recordURI.matches("./?products?/?.")) {
209
210
211	if(recordURI.contains("product") && !recordURI.contains("production")) {
212
213	// don't want a "translated" product site/online store
214	// These curiously often tend to have "product(s)" in the URL
215	parentFolder = WETProcessor.discardFolder;
216	}
217	/*
218	else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
219	parentFolder = WETProcessor.keepFolder;
220	System.err.println("@@@KEEPING");
221	} else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
222	int countSpaces = 0;
223	for(int i = 0; i < record.length(); i++) {
224	if(record.charAt(i) == ' ') countSpaces++;
225	}
226	if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
227	// So we have at least 500 chars (possibly on a single wrapped line)
228	// containing at least 10 spaces. Such a record is also worth keeping.
229	parentFolder = WETProcessor.keepFolder;
230	}
231	}
232	*/
233
234	else {
235	// If a web page's WET record contains a certain minimum number of words,
236	// we will think it's a meaningful web page and has sufficient content for text analysis
237	// to have been successful. Cut off values at present are:
238	// - a minimum of 20 words
239	// - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
240	// to words having been glued together. This is used by irrelevant sites and moreover
241	// can't be analysed for language, so may not be actually MRI.
242
243	// Though StringTokenizer still in use, as seen in discussion at
244	// https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
245	// String.split(regex) seems better for splitting on general whitespace
246	String[] allWords = record.split("\\s");
247	int validWordCount = 0;
248	int numCamelCaseWords = 0;
249	for(int i = 0; i < allWords.length; i++) {
250	String word = allWords[i];
251
252	// throw away if n words contain camelcase, which is another case of words glued together
253	if(word.matches(".[a-z][A-Z].") && word.length() >= 5) {
254	numCamelCaseWords++;
255	}
256
257	// In Maori, word length of 1 is not uncommon
258	// but let's skip camelcased words when counting valid words
259	else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
260	}
261
262	// dump if too many camelcase words (ideally keep none of that kind?)
263	if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
264	parentFolder = WETProcessor.discardFolder;
265	System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
266	}
267	else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
268	parentFolder = WETProcessor.keepFolder;
269	System.err.println("@@@KEEPING");
270	}
271	}
272	// if parentFolder still not set, set to discard pile folder
273	if(parentFolder == null) {
274	parentFolder = WETProcessor.discardFolder;
275	System.err.println("@@@DISCARDING");
276	}
277
278	try {
279	if (parentFolder == WETProcessor.keepFolder) {
280	keepURLsWriter.write(recordURI + "\n");
281	} else {
282	discardURLsWriter.write(recordURI + "\n");
283	}
284	} catch(Exception e) {
285	System.err.println("Unable to write URL");
286	e.printStackTrace();
287	}
288
289	System.err.println("--------------------------");
290
291	File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
292
293	try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
294	writer.write(record);
295	// Try-with-resources examples don't call close() explicitly:
296	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
297	//writer.close();
298	} catch(IOException ioe) {
299	ioe.printStackTrace();
300	System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
301	}
302	}
303
304
305	//public static int getRecordCount() { return recordCount; }
306
307	public static void printUsage() {
308	System.err.println("Run this program as:");
309	System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
310	}
311
312	/** Filename filter to only list warc.wet files or else warc.wet.gz files
313	* for which unzipped warc.wet equivalents don't yet exist.
314	*/
315	private static class WETFilenameFilter implements FilenameFilter {
316
317	public boolean accept(File dir, String name) {
318	if(name.endsWith(".warc.wet")) {
319	logger.debug("Will include " + name + " for processing.");
320	return true;
321	}
322
323	if(name.endsWith(".warc.wet.gz")) {
324	String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
325	File unzippedVersion = new File(dir, nameWithoutGZext);
326	if(unzippedVersion.exists()) {
327	logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
328	logger.debug("Skipping " + name);
329	return false; // don't count gzipped version if unzipped version exists.
330	}
331	else {
332	logger.debug("Only zipped version " + name + " exists.");
333	return true; // No unzipped version, so have to work with gzipped version
334	}
335	}
336
337	// we're not even interested in any other file extensions
338	logger.debug("Not a WET file. Skipping " + name);
339	return false;
340	}
341	}
342
343	public static void main(String[] args) {
344	if(args.length != 2) {
345	printUsage();
346	return;
347	}
348
349
350	File WETFileDir = new File(args[0]);
351	if(!WETFileDir.exists() \|\| !WETFileDir.isDirectory()) {
352	System.out.println("Error: " + args[0] + " does not exist or is not a directory");
353	return;
354	}
355
356	File outFolder = new File(args[1]);
357	if(!outFolder.exists() \|\| !outFolder.isDirectory()) {
358	System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
359	return;
360	}
361
362	// static folders and files to be shared across all WETProcessor instances
363	WETProcessor.discardFolder = new File(outFolder, "discard");
364	if(!WETProcessor.discardFolder.exists()) {
365	WETProcessor.discardFolder.mkdir();
366	}
367	WETProcessor.keepFolder = new File(outFolder, "keep");
368	if(!WETProcessor.keepFolder.exists()) {
369	WETProcessor.keepFolder.mkdir();
370	}
371
372	WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
373	if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
374	System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
375	return;
376	}
377	WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
378	if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
379	System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
380	return;
381	}
382
383	// Will list all the warc.wet files in the input directory or else their gzipped versions
384	File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
385
386	for(int i = 0; i < WETFiles.length; i++) {
387	File WETFile = WETFiles[i];
388	logger.debug("Processing WETfile: " + WETFile);
389
390	// Any .gz files listed means they haven't been unzipped yet. So unzip.
391	String WETFilename = WETFile.toString();
392	if(WETFilename.endsWith(".gz")) {
393	File GZippedWETFile = WETFile;
394	String WETGZippedFilename = WETFilename;
395	WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
396
397	WETFile = new File(WETFilename);
398	Utility.unzipFile(GZippedWETFile, WETFile);
399	}
400	// hereafter all WETFiles should refer to the unzipped version
401	// Check the unzipped WETFile exists
402
403	if(!WETFile.exists() \|\| !WETFile.isFile()) {
404	System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
405	logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
406	return;
407	}
408
409	// Finally, we can process this WETFile's records into the keep and discard pile
410	logger.debug("Off to process " + WETFile);
411	WETProcessor processor = new WETProcessor(WETFile, outFolder);
412
413	}
414
415	return;
416	}
417	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33480

Download in other formats: