Context Navigation

WETProcessor.java@ 33652

Last change on this file since 33652 was 33615, checked in by ak19, 5 years ago
1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some MongoDB links under MoreReading.
File size: 13.1 KB

Line
1	package org.greenstone.atea;
2
3
4	import java.io.*;
5	import java.util.Properties;
6	import java.util.Iterator;
7	import java.util.Set;
8	import java.util.TreeSet;
9
10	import org.apache.log4j.Logger;
11
12	/**
13	* A single instance of the WETProcessor class can process a single unzipped warc.wet file.
14	* A WETProcessor take a warc.wet file and goes through all its WET records,
15	* putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
16	* and its url listed written into a keep, discard or greylisted text file, based on:
17	*
18	* 1. whether it's whitelisted, else greylisted else blacklisted
19	* 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
20	* enough content. Formerly, content-length and number of lines were used to determine if
21	* the content was sufficient. Now it's just word count and number of MAX characters
22	* (not MINIMUM characters) that determine a string is a word.
23	* Explicit whitelisting has precedence over greylisting and which takes precedence
24	* over blacklisting in turn.
25	* However, even explicitly whitelisted urls still need to have sufficient content to end
26	* up in keepURLs.txt.
27	*
28	* See CCWETProcessor.java for compile instructions and how to run.
29	*
30	*/
31	public class WETProcessor {
32	private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
33
34	// WARC WET header lines and header line prefixes of interest
35	static final String WARC_RECORD_START = "WARC/1.0";
36	static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
37	static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
38	static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
39
40	private final String crawlID;
41	private final int WETFileID;
42	private final File inFile;
43
44	private int recordCount = 0;
45
46	/** Handle to a CCWETProcessor that processes a set of WET files
47	* Whereas a WETProcessor instance only processes a single WET file
48	* containing multiple WET records.
49	*/
50	private CCWETProcessor batchProcessor;
51
52	/**
53	* WET processor processes a single warc.wet file containing multiple WET records
54	* containing text identified as primary langcode=mri. Each individual WET record is written
55	* out to a uniquely named file in either the keep or discard folder depending on the WET
56	* record's content length and number of lines of actual content (excluding WARC headers).
57	* @param inFile the warc.wet file whose WET records are to be processed
58	* @param crawlID is the ID of the commoncrawl containing this warc.wet file
59	* and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
60	* as prefix to create unique filenames when storing each individual record).
61	*/
62	public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
63	this.batchProcessor = batchProcessor;
64
65	this.inFile = inFile;
66	this.crawlID = crawlID;
67
68	// We just want a unique recordID prefix, which we get from concatenating
69	// the commoncrawl ID with the wet file name suffix and record count within the file:
70	// inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
71	// the prefix will be everything after the last hyphen and without file extension,
72	// so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
73	// Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
74	// of the current WET record to get a unique filename to store each WET record into.
75	// e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
76	// common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)
77
78	String fileID = inFile.getName();
79	//logger.debug("*** Processing wetfile: " + fileID);
80	fileID = fileID.substring(fileID.lastIndexOf("0")+1);
81	if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
82	this.WETFileID = 0;
83	} else {
84	fileID = fileID.substring(0, fileID.indexOf("."));
85	this.WETFileID = Integer.parseInt(fileID);
86	}
87	}
88
89	/**
90	* Processes all the WET records of a single warc.wet file
91	*/
92	public int processWETFile() {
93	File keepURLsFile = this.batchProcessor.keepURLsFile;
94	File discardURLsFile = this.batchProcessor.discardURLsFile;
95	File greyListedFile = this.batchProcessor.greyListedFile;
96
97	StringBuilder record = null;
98	String line = null;
99	boolean readingRecord = false;
100
101	String WARCtargetURI = "";
102
103	//int recordCount = 0;
104
105	int contentLength = -1; // of record
106	int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
107
108	// read from WETfile
109	try (
110	BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
111	BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
112	BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
113	BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
114	) {
115
116	while((line = reader.readLine()) != null) { // readLine removes newline separator
117
118	if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
119	readingRecord = false;
120	record = null; // drop this record, which is just an info record not actual web page's text
121	recordCount--;
122	continue;
123	}
124
125	if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
126	// process any previous record
127	if(record != null) {
128	processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
129	recordCount, contentLength, lineCount,
130	WARCtargetURI, record.toString());
131	record = null;
132	contentLength = -1;
133	lineCount = -1;
134	}
135
136	recordCount++;
137	// get ready to start a new record
138	readingRecord = true;
139	record = new StringBuilder();
140	}
141
142	if(readingRecord) { // append current line to current record
143
144	if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
145	// get and store the value
146	WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
147	}
148
149	record.append(line + "\n"); // add back (unix style) line ending
150
151	// if the line is non-empty
152	// AND if we've started counting lines, which happens only when the current
153	// line is past WARC/WET headers and we're into the actual body portion
154	// of the WET record,
155	// start incrementing the line counter.
156	if(lineCount >= 0 && !line.trim().equals("")) {
157	lineCount++;
158	}
159	else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
160	String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
161	contentLength = Integer.parseInt(contentLengthStr);
162	lineCount = 0;
163	}
164
165	}
166
167	}
168
169	// flush the last record. If it was a warcinfo record, record would be null here
170	if(record != null) {
171	processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
172	recordCount, contentLength, lineCount,
173	WARCtargetURI, record.toString());
174	record = null;
175	}
176
177	} catch(IOException ioe) {
178	ioe.printStackTrace();
179	}
180
181	return recordCount;
182	}
183
184	public int getRecordCount() { return this.recordCount; }
185
186	/**
187	* Determines if a WET record belongs in the keep or discard pile depending on if it
188	* contains enough text, based on contentLength and line count of the record body.
189	* Then writes out the WET record to a uniquely named file in the keep or discard folder,
190	* and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
191	*/
192	private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
193	BufferedWriter greyListedURLsWriter,
194	int recordID, int contentLength, int lineCount,
195	String recordURI, String record)
196	{
197	logger.info("CrawlID: CC-MAIN-" + this.crawlID
198	+ " WET #" + this.WETFileID
199	+ " record #" + recordID
200	+ " - contentLength: " + contentLength
201	+ " - lineCount: " + lineCount);
202	logger.info("URI: " + recordURI);
203	//logger.debug(record);
204	//logger.info("--------------------------");
205
206	File parentFolder = null;
207
208	if(batchProcessor.isBlacklisted(recordURI)) {
209
210	// explicit whitelisting overrides blacklisting
211	if(batchProcessor.isWhitelisted(recordURI)) {
212	parentFolder = batchProcessor.keepFolder; //tentative
213	}
214	// if not whitelisted, then greylisting still overrides blacklisting
215	else if(batchProcessor.isGreylisted(recordURI)) {
216	parentFolder = batchProcessor.greyListedFolder;
217	logger.debug("@@@GREYLISTED");
218	}
219	else { // url was only blacklisted
220	parentFolder = batchProcessor.discardFolder;
221	logger.debug("@@@DISCARDING - blacklisted");
222	}
223	}
224	else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
225	// explicit whitelisting overrides greylisting
226	if(batchProcessor.isWhitelisted(recordURI)) {
227	parentFolder = batchProcessor.keepFolder; // tentative
228	}
229	else {
230	parentFolder = batchProcessor.greyListedFolder;
231	logger.debug("@@@GREYLISTED");
232	}
233	}
234
235	// If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
236	// it still can't be in the keep list as it needs further inspection:
237	// it needs sufficient content for language analysis.
238	// We don't care about the combination of number of lines and content-length,
239	// we just care about the number of "valid words" as defined by us.
240	if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed \|\| parentFolder == null
241
242	// If a web page's WET record contains a certain minimum number of words,
243	// we will think it's a meaningful web page and has sufficient content for text analysis
244	// to have been successful. Cut off values at present are:
245	// - a minimum of 20 words
246	// - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
247	// to words having been glued together. This is used by irrelevant sites and moreover
248	// can't be analysed for language, so may not be actually MRI.
249
250	// Though StringTokenizer still in use, as seen in discussion at
251	// https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
252	// String.split(regex) seems better for splitting on general whitespace
253	String[] allWords = record.split("\\s");
254	int validWordCount = 0;
255	int numCamelCaseWords = 0;
256	for(int i = 0; i < allWords.length; i++) {
257	String word = allWords[i];
258
259	// throw away if n words contain camelcase, which is another case of words glued together
260	// For now, we'll only skip camelcased words in our count of valid words
261	if(word.matches(".[a-z][A-Z].") && word.length() >= 5) {
262	numCamelCaseWords++;
263	}
264	// In Maori, word length of 1 is not uncommon
265	// but let's skip camelcased words when counting valid words
266	else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
267	validWordCount++;
268	}
269	}
270
271
272	/*
273	// dump if too many camelcase words (ideally keep no WET record of that kind?)
274	if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
275	parentFolder = batchProcessor.discardFolder;
276	logger.debug("@@@DISCARDING - CAMELCASED CONTENTS");
277	}
278	else*/
279	// For now, don't discount content with too many camelcased words
280	// Just focus on whether there are a sufficient number of valid words
281	// (camelcased words are however still ignored in our count of valid words)
282	if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
283	parentFolder = batchProcessor.keepFolder;
284	logger.debug("@@@KEEPING");
285	}
286	}
287	// if parentFolder still not set, it means that the content length/num words or lines
288	// were insufficient, so meant to be discarded
289	if(parentFolder == null) {
290	parentFolder = batchProcessor.discardFolder;
291	logger.debug("@@@DISCARDING");
292	}
293
294	try {
295	if (parentFolder == batchProcessor.keepFolder) {
296	keepURLsWriter.write(recordURI + "\n");
297	} else if (parentFolder == batchProcessor.greyListedFolder) {
298	greyListedURLsWriter.write(recordURI + "\n");
299	} else {
300	discardURLsWriter.write(recordURI + "\n");
301	}
302	} catch(Exception e) {
303	logger.debug("Unable to write URL");
304	e.printStackTrace();
305	}
306
307	logger.debug("--------------------------");
308
309	// outFilename will look something like YYYY-##-####
310	String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
311	//= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
312	File outFile = new File(parentFolder, outFilename);
313
314	try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
315	writer.write(record);
316	// Try-with-resources examples don't call close() explicitly:
317	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
318	//writer.close();
319	} catch(IOException ioe) {
320	ioe.printStackTrace();
321	logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe);
322	}
323	}
324
325
326	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33652

Download in other formats: