Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33501

Last change on this file since 33501 was 33501, checked in by ak19, 5 years ago
Refactored code into 2 classes: the existing WETProcessor, which processes a single WET file that can contain a large number of WET records, and the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. The refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.
File size: 11.1 KB

Line
1	package org.greenstone.atea;
2
3
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
12
13	/**
14	* The main() method of this class takes a folder of warc.wet(.gz) files and goes through
15	* the WET records in each, putting each WET record into a file. Each file is put into a
16	* keep or discard folder, based on content-length and number of lines.
17	* A single instance of the WETProcessor class processes a single unzipped warc.wet file.
18	*
19	* To compile, including the jars in lib/ for compiling.
20	* maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
21	*
22	* To run, passing the log4j and other properties files in conf/ folder:
23	* maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
24	*
25	* e.g.
26	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
27	* - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 \| less
28	*
29	*/
30	public class WETProcessor {
31	private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
32
33	// WARC WET header lines and header line prefixes of interest
34	static final String WARC_RECORD_START = "WARC/1.0";
35	static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
36	static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
37	static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
38
39	private final String WETFileID;
40	private final File inFile;
41
42	private int recordCount = 0;
43
44	/** Handle to a CCWETProcessor that processes a set of WET files
45	* Whereas a WETProcessor instance only processes a single WET file
46	* containing multiple WET records.
47	*/
48	private CCWETProcessor batchProcessor;
49
50	/**
51	* WET processor processes a single warc.wet file containing multiple WET records
52	* containing text identified as primary langcode=mri. Each individual WET record is written
53	* out to a uniquely named file in either the keep or discard folder depending on the WET
54	* record's content length and number of lines of actual content (excluding WARC headers).
55	*/
56	public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
57	this.batchProcessor = batchProcessor;
58
59	this.inFile = inFile;
60	// We just want a unique recordID prefix, which we get from the wet file name suffix:
61	// inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
62	// the prefix will be everything after the last hyphen and without file extension,
63	// so "000000" in our example. Then suffix the recordCount (keeping track of the current
64	// WET record) to get a unique filename to store each WET record into.
65
66	String fileID = inFile.getName();
67	fileID = fileID.substring(fileID.lastIndexOf("-")+1);
68	fileID = fileID.substring(0, fileID.indexOf("."));
69	this.WETFileID = fileID;
70	}
71
72	public int processWETFile() {
73	File keepURLsFile = this.batchProcessor.keepURLsFile;
74	File discardURLsFile = this.batchProcessor.discardURLsFile;
75
76	StringBuilder record = null;
77	String line = null;
78	boolean readingRecord = false;
79
80	String WARCtargetURI = "";
81
82	//int recordCount = 0;
83
84	int contentLength = -1; // of record
85	int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
86
87	// read from WETfile
88	try (
89	BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
90	BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
91	BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
92	) {
93
94	while((line = reader.readLine()) != null) { // readLine removes newline separator
95
96	if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
97	readingRecord = false;
98	record = null; // drop this record, which is just an info record not actual web page's text
99	recordCount--;
100	continue;
101	}
102
103	if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
104	// process any previous record
105	if(record != null) {
106	processWETrecord(keepURLsWriter, discardURLsWriter,
107	recordCount, contentLength, lineCount,
108	WARCtargetURI, record.toString());
109	record = null;
110	contentLength = -1;
111	lineCount = -1;
112	}
113
114	recordCount++;
115	// get ready to start a new record
116	readingRecord = true;
117	record = new StringBuilder();
118	}
119
120	if(readingRecord) { // append current line to current record
121
122	if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
123	// get and store the value
124	WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
125	}
126
127	record.append(line + "\n"); // add back (unix style) line ending
128
129	// if the line is non-empty
130	// AND if we've started counting lines, which happens only when the current
131	// line is past WARC/WET headers and we're into the actual body portion
132	// of the WET record,
133	// start incrementing the line counter.
134	if(lineCount >= 0 && !line.trim().equals("")) {
135	lineCount++;
136	}
137	else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
138	String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
139	contentLength = Integer.parseInt(contentLengthStr);
140	lineCount = 0;
141	}
142
143	}
144
145	}
146
147	// flush the last record. If it was a warcinfo record, record would be null here
148	if(record != null) {
149	processWETrecord(keepURLsWriter, discardURLsWriter,
150	recordCount, contentLength, lineCount,
151	WARCtargetURI, record.toString());
152	record = null;
153	}
154
155	} catch(IOException ioe) {
156	ioe.printStackTrace();
157	}
158
159	return recordCount;
160	}
161
162	public int getRecordCount() { return this.recordCount; }
163
164	/**
165	* Determines if a WET record belongs in the keep or discard pile depending on if it
166	* contains enough text, based on contentLength and line count of the record body.
167	* Then writes out the WET record to a uniquely named file in the keep or discard folder,
168	* and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
169	*/
170	private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
171	int recordID, int contentLength, int lineCount,
172	String recordURI, String record)
173	{
174	System.err.println("WET #" + this.WETFileID + " record #" + recordID
175	+ " - contentLength: " + contentLength
176	+ " - lineCount: " + lineCount);
177	System.err.println("URI: " + recordURI);
178	//System.err.println(record);
179	//System.err.println("--------------------------");
180
181	String paddedFileName = String.format("%04d.txt", recordID);
182
183	File parentFolder = null;
184
185	// want to match "product(s)" but not "production"
186	//if(recordURI.matches("./?product[^a-rt-z].")) {//if(recordURI.matches("./?products?/?.")) {
187
188
189	/*
190	if(recordURI.contains("product") && !recordURI.contains("production")) {
191
192	// don't want a "translated" product site/online store
193	// These curiously often tend to have "product(s)" in the URL
194	parentFolder = batchProcessor.discardFolder;
195	}
196
197	else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
198	parentFolder = batchProcessor.keepFolder;
199	System.err.println("@@@KEEPING");
200	} else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
201	int countSpaces = 0;
202	for(int i = 0; i < record.length(); i++) {
203	if(record.charAt(i) == ' ') countSpaces++;
204	}
205	if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
206	// So we have at least 500 chars (possibly on a single wrapped line)
207	// containing at least 10 spaces. Such a record is also worth keeping.
208	parentFolder = batchProcessor.keepFolder;
209	}
210	}
211	*/
212
213	if(batchProcessor.isBlacklisted(recordURI)) {
214	parentFolder = batchProcessor.discardFolder;
215	}
216	else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
217	parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
218	} else {
219	// If a web page's WET record contains a certain minimum number of words,
220	// we will think it's a meaningful web page and has sufficient content for text analysis
221	// to have been successful. Cut off values at present are:
222	// - a minimum of 20 words
223	// - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
224	// to words having been glued together. This is used by irrelevant sites and moreover
225	// can't be analysed for language, so may not be actually MRI.
226
227	// Though StringTokenizer still in use, as seen in discussion at
228	// https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
229	// String.split(regex) seems better for splitting on general whitespace
230	String[] allWords = record.split("\\s");
231	int validWordCount = 0;
232	int numCamelCaseWords = 0;
233	for(int i = 0; i < allWords.length; i++) {
234	String word = allWords[i];
235
236	// throw away if n words contain camelcase, which is another case of words glued together
237	if(word.matches(".[a-z][A-Z].") && word.length() >= 5) {
238	numCamelCaseWords++;
239	}
240
241	// In Maori, word length of 1 is not uncommon
242	// but let's skip camelcased words when counting valid words
243	else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++;
244	}
245
246	// dump if too many camelcase words (ideally keep none of that kind?)
247	if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
248	parentFolder = batchProcessor.discardFolder;
249	System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
250	}
251	else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
252	parentFolder = batchProcessor.keepFolder;
253	System.err.println("@@@KEEPING");
254	}
255	}
256	// if parentFolder still not set, set to discard pile folder
257	if(parentFolder == null) {
258	parentFolder = batchProcessor.discardFolder;
259	System.err.println("@@@DISCARDING");
260	}
261
262	try {
263	if (parentFolder == batchProcessor.keepFolder) {
264	keepURLsWriter.write(recordURI + "\n");
265	} else {
266	discardURLsWriter.write(recordURI + "\n");
267	}
268	} catch(Exception e) {
269	System.err.println("Unable to write URL");
270	e.printStackTrace();
271	}
272
273	System.err.println("--------------------------");
274
275	File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
276
277	try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
278	writer.write(record);
279	// Try-with-resources examples don't call close() explicitly:
280	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
281	//writer.close();
282	} catch(IOException ioe) {
283	ioe.printStackTrace();
284	System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
285	}
286	}
287	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: