source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33517

Last change on this file since 33517 was 33517, checked in by ak19, 5 years ago
  1. Blacklists were introduced so that too many instances of camelcased words no longer disqualify WET records from inclusion in the keep pile. Camelcasing of words is still checked, as such words don't get counted as valid words in the valid word count that determines whether a WET record has sufficient content. 2. Some more commenting.
File size: 12.0 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
 * A WETProcessor takes a warc.wet file and goes through all its WET records,
 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
 * and its URL written into a keep, discard or greylisted text file, based on:
 *
 * 1. whether the URL is whitelisted, greylisted or blacklisted;
 * 2. and, if explicitly whitelisted or neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the valid word count, where a MAXIMUM
 * (not MINIMUM) number of characters determines whether a string counts as a word.
 * Explicit whitelisting takes precedence over greylisting, which in turn takes precedence
 * over blacklisting.
 * However, even explicitly whitelisted URLs still need to have sufficient content to end
 * up in keepURLs.txt.
 *
 * See CCWETProcessor.java for compile instructions and how to run.
 *
 */
public class WETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    private final String WETFileID;
    private final File inFile;

    private int recordCount = 0;

    /**
     * Handle to the CCWETProcessor that processes a set of WET files,
     * whereas a WETProcessor instance processes only a single WET file
     * containing multiple WET records.
     */
    private CCWETProcessor batchProcessor;

    /**
     * A WETProcessor processes a single warc.wet file containing multiple WET records
     * whose text has been identified as primarily langcode=mri. Each individual WET record is
     * written out to a uniquely named file in either the keep or discard folder, depending on
     * whether the WET record contains sufficient content (formerly judged by content length and
     * number of lines of actual content, excluding WARC headers; now by valid word count).
     */
    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;

        this.inFile = inFile;
        // We just want a unique recordID prefix, which we get from the wet file name suffix:
        // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
        // the prefix will be everything after the last hyphen and without file extension,
        // so "000000" in our example. Then suffix the recordCount (keeping track of the current
        // WET record) to get a unique filename to store each WET record into.

        String fileID = inFile.getName();
        fileID = fileID.substring(fileID.lastIndexOf("-")+1);
        fileID = fileID.substring(0, fileID.indexOf("."));
        this.WETFileID = fileID;
    }

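    /*
     * A minimal usage sketch (not part of the class): the CCWETProcessor driver is assumed to
     * iterate over the unzipped warc.wet files in its input folder and hand each one to a
     * WETProcessor, roughly as follows. The names inputFolder and ccWETProcessor are
     * illustrative assumptions only; see CCWETProcessor.java for the actual driver code.
     *
     *   File[] wetFiles = inputFolder.listFiles();           // hypothetical input folder
     *   for(File wetFile : wetFiles) {
     *       WETProcessor processor = new WETProcessor(wetFile, ccWETProcessor);
     *       int numRecords = processor.processWETFile();     // number of WET records processed
     *       System.err.println(wetFile.getName() + ": " + numRecords + " records");
     *   }
     */
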
    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;
        File greyListedFile = this.batchProcessor.greyListedFile;

        StringBuilder record = null;
        String line = null;
        boolean readingRecord = false;

        String WARCtargetURI = "";

        //int recordCount = 0;

        int contentLength = -1; // of record
        int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)

        // read from WETfile
        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
            BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
            BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
            BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
        ) {

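            // A rough sketch of the layout this loop assumes for each WET record, based on the
            // Common Crawl WET format (the URI below is only an illustrative placeholder):
            //   WARC/1.0
            //   WARC-Type: conversion
            //   WARC-Target-URI: http://example.org/somepage
            //   Content-Length: 1234
            //   ... further WARC headers, then a blank line, then the extracted plain text ...
            // Typically the first record in the file is a "WARC-Type: warcinfo" record describing
            // the crawl rather than a web page, which is why it gets dropped below.
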
            while((line = reader.readLine()) != null) { // readLine removes newline separator

                if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                    readingRecord = false;
                    record = null; // drop this record, which is just an info record, not an actual web page's text
                    recordCount--;
                    continue;
                }

                if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
                    // process any previous record
                    if(record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                        record = null;
                        contentLength = -1;
                        lineCount = -1;
                    }

                    recordCount++;
                    // get ready to start a new record
                    readingRecord = true;
                    record = new StringBuilder();
                }

                if(readingRecord) { // append current line to current record

                    if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                        // get and store the value
                        WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                    }

                    record.append(line + "\n"); // add back (unix style) line ending

                    // Only increment the line counter if the line is non-empty AND we've already
                    // started counting lines, which happens only once the current line is past the
                    // WARC/WET headers and we're into the actual body portion of the WET record.
                    if(lineCount >= 0 && !line.trim().equals("")) {
                        lineCount++;
                    }
                    else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                        String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                        contentLength = Integer.parseInt(contentLengthStr);
                        lineCount = 0;
                    }

                }

            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if(record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
                record = null;
            }

        } catch(IOException ioe) {
            ioe.printStackTrace();
        }

        return recordCount;
    }

    public int getRecordCount() { return this.recordCount; }

    /**
     * Determines whether a WET record belongs in the keep, greylisted or discard pile, based on
     * its URL's white/grey/blacklisting status and on whether the record body contains enough text.
     * Then writes out the WET record to a uniquely named file in the chosen folder,
     * and writes the WET record's URL into the corresponding keep, greylisted or discard URLs file.
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  BufferedWriter greyListedURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        System.err.println("WET #" + this.WETFileID + " record #" + recordID
                           + " - contentLength: " + contentLength
                           + " - lineCount: " + lineCount);
        System.err.println("URI: " + recordURI);
        //System.err.println(record);
        //System.err.println("--------------------------");

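        // e.g. recordID 7 becomes "0007.txt"; combined with the WETFileID prefix further below,
        // the record ends up in a file named something like 000000-0007.txt.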
        String paddedFileName = String.format("%04d.txt", recordID);

        File parentFolder = null;


        if(batchProcessor.isBlacklisted(recordURI)) {

            // explicit whitelisting overrides blacklisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; //tentative
            }
            // if not whitelisted, then greylisting still overrides blacklisting
            else if(batchProcessor.isGreylisted(recordURI)) {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
            else { // url was only blacklisted
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - blacklisted");
            }
        }
        else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
            // explicit whitelisting overrides greylisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; // tentative
            }
            else {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
        }

        // Even if the URL was not blacklisted/greylisted, or was explicitly whitelisted,
        // it still can't go straight into the keep list, as it needs further inspection:
        // it needs sufficient content for language analysis.
        // We don't care about the combination of number of lines and content-length;
        // we just care about the number of "valid words" as defined by us.
        if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whitelisted, or parentFolder == null

            // If a web page's WET record contains a certain minimum number of words,
            // we consider it a meaningful web page with sufficient content for text analysis
            // to have been successful. Cut-off values at present are:
            // - a minimum of 20 words
            // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
            //   to words having been glued together. Glued-together text tends to appear on
            //   irrelevant sites and moreover can't be analysed for language, so it may not actually be MRI.

            // Though StringTokenizer is still in use, as seen in the discussion at
            // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
            // String.split(regex) seems better for splitting on general whitespace
            String[] allWords = record.split("\\s");
            int validWordCount = 0;
            int numCamelCaseWords = 0;
            for(int i = 0; i < allWords.length; i++) {
                String word = allWords[i];

                // Camelcased words are another case of words having been glued together.
                // For now, we only skip camelcased words in our count of valid words
                // rather than throwing away the whole record.
                if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
                    numCamelCaseWords++;
                }
                // In Maori, a word length of 1 is not uncommon,
                // but camelcased words are skipped when counting valid words
                else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
                    validWordCount++;
                }
            }
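
            // A few illustrative (hypothetical) examples of how the loop above classifies tokens,
            // assuming MAX_WORD_LENGTH is the 15 chars mentioned in the comment above:
            //   "kia"                  - 3 chars, no camelcasing             -> counted as a valid word
            //   "HaereMaiKoutou"       - matches [a-z][A-Z] and length >= 5  -> counted as camelcased, not valid
            //   "aaaaaaaaaaaaaaaaaaaa" - 20 chars, over the length cap       -> neither camelcased nor valid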


            /*
            // dump if too many camelcase words (ideally keep no WET record of that kind?)
            if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
            }
            else*/
            // For now, don't discount content with too many camelcased words.
            // Just focus on whether there are a sufficient number of valid words
            // (camelcased words are however still ignored in our count of valid words)
            if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
                parentFolder = batchProcessor.keepFolder;
                System.err.println("@@@KEEPING");
            }
        }
        // If parentFolder is still not set, it means the record didn't have enough valid words
        // (formerly: enough content length/lines), so it is meant to be discarded.
        if(parentFolder == null) {
            parentFolder = batchProcessor.discardFolder;
            System.err.println("@@@DISCARDING");
        }

        try {
            if (parentFolder == batchProcessor.keepFolder) {
                keepURLsWriter.write(recordURI + "\n");
            } else if (parentFolder == batchProcessor.greyListedFolder) {
                greyListedURLsWriter.write(recordURI + "\n");
            } else {
                discardURLsWriter.write(recordURI + "\n");
            }
        } catch(Exception e) {
            System.err.println("Unable to write URL");
            e.printStackTrace();
        }

        System.err.println("--------------------------");

        File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
            writer.write(record);
            // Try-with-resources examples don't call close() explicitly:
            // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
            //writer.close();
        } catch(IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
        }
    }
}