Context Navigation

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33552

Last change on this file since 33552 was 33552, checked in by ak19, 5 years ago
1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. There is now a global file containing all domains we're going to crawl. 3. WET records we're keeping that are stored in individual files now have better filenames.
File size: 13.2 KB

Line
1	package org.greenstone.atea;
2
3
4	import java.io.*;
5	import java.util.Properties;
6	import java.util.Iterator;
7	import java.util.Set;
8	import java.util.TreeSet;
9
10	import org.apache.log4j.Logger;
11
12	/**
13	* A single instance of the WETProcessor class can process a single unzipped warc.wet file.
14	* A WETProcessor take a warc.wet file and goes through all its WET records,
15	* putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
16	* and its url listed written into a keep, discard or greylisted text file, based on:
17	*
18	* 1. whether it's whitelisted, else greylisted else blacklisted
19	* 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
20	* enough content. Formerly, content-length and number of lines were used to determine if
21	* the content was sufficient. Now it's just word count and number of MAX characters
22	* (not MINIMUM characters) that determine a string is a word.
23	* Explicit whitelisting has precedence over greylisting and which takes precedence
24	* over blacklisting in turn.
25	* However, even explicitly whitelisted urls still need to have sufficient content to end
26	* up in keepURLs.txt.
27	*
28	* See CCWETProcessor.java for compile instructions and how to run.
29	*
30	*/
31	public class WETProcessor {
32	private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
33
34	// WARC WET header lines and header line prefixes of interest
35	static final String WARC_RECORD_START = "WARC/1.0";
36	static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
37	static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
38	static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
39
40	private final String crawlID;
41	private final int WETFileID;
42	private final File inFile;
43
44	private int recordCount = 0;
45
46	/** Handle to a CCWETProcessor that processes a set of WET files
47	* Whereas a WETProcessor instance only processes a single WET file
48	* containing multiple WET records.
49	*/
50	private CCWETProcessor batchProcessor;
51
52	/**
53	* WET processor processes a single warc.wet file containing multiple WET records
54	* containing text identified as primary langcode=mri. Each individual WET record is written
55	* out to a uniquely named file in either the keep or discard folder depending on the WET
56	* record's content length and number of lines of actual content (excluding WARC headers).
57	* @param inFile the warc.wet file whose WET records are to be processed
58	* @param crawlID is the ID of the commoncrawl containing this warc.wet file
59	* and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
60	* as prefix to create unique filenames when storing each individual record).
61	*/
62	public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
63	this.batchProcessor = batchProcessor;
64
65	this.inFile = inFile;
66	this.crawlID = crawlID;
67
68	// We just want a unique recordID prefix, which we get from concatenating
69	// the commoncrawl ID with the wet file name suffix and record count within the file:
70	// inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
71	// the prefix will be everything after the last hyphen and without file extension,
72	// so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
73	// Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
74	// of the current WET record to get a unique filename to store each WET record into.
75	// e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
76	// common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)
77
78	String fileID = inFile.getName();
79	//System.err.println("*** Processing wetfile: " + fileID);
80	fileID = fileID.substring(fileID.lastIndexOf("0")+1);
81	if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
82	this.WETFileID = 0;
83	} else {
84	fileID = fileID.substring(0, fileID.indexOf("."));
85	this.WETFileID = Integer.parseInt(fileID);
86	}
87	}
88
89	/**
90	* Processes all the WET records of a single warc.wet file
91	*/
92	public int processWETFile() {
93	File keepURLsFile = this.batchProcessor.keepURLsFile;
94	File discardURLsFile = this.batchProcessor.discardURLsFile;
95	File greyListedFile = this.batchProcessor.greyListedFile;
96
97	StringBuilder record = null;
98	String line = null;
99	boolean readingRecord = false;
100
101	String WARCtargetURI = "";
102
103	//int recordCount = 0;
104
105	int contentLength = -1; // of record
106	int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
107
108	// read from WETfile
109	try (
110	BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
111	BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
112	BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
113	BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
114	) {
115
116	while((line = reader.readLine()) != null) { // readLine removes newline separator
117
118	if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
119	readingRecord = false;
120	record = null; // drop this record, which is just an info record not actual web page's text
121	recordCount--;
122	continue;
123	}
124
125	if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
126	// process any previous record
127	if(record != null) {
128	processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
129	recordCount, contentLength, lineCount,
130	WARCtargetURI, record.toString());
131	record = null;
132	contentLength = -1;
133	lineCount = -1;
134	}
135
136	recordCount++;
137	// get ready to start a new record
138	readingRecord = true;
139	record = new StringBuilder();
140	}
141
142	if(readingRecord) { // append current line to current record
143
144	if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
145	// get and store the value
146	WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
147	}
148
149	record.append(line + "\n"); // add back (unix style) line ending
150
151	// if the line is non-empty
152	// AND if we've started counting lines, which happens only when the current
153	// line is past WARC/WET headers and we're into the actual body portion
154	// of the WET record,
155	// start incrementing the line counter.
156	if(lineCount >= 0 && !line.trim().equals("")) {
157	lineCount++;
158	}
159	else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
160	String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
161	contentLength = Integer.parseInt(contentLengthStr);
162	lineCount = 0;
163	}
164
165	}
166
167	}
168
169	// flush the last record. If it was a warcinfo record, record would be null here
170	if(record != null) {
171	processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
172	recordCount, contentLength, lineCount,
173	WARCtargetURI, record.toString());
174	record = null;
175	}
176
177	} catch(IOException ioe) {
178	ioe.printStackTrace();
179	}
180
181	return recordCount;
182	}
183
184	public int getRecordCount() { return this.recordCount; }
185
186	/**
187	* Determines if a WET record belongs in the keep or discard pile depending on if it
188	* contains enough text, based on contentLength and line count of the record body.
189	* Then writes out the WET record to a uniquely named file in the keep or discard folder,
190	* and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
191	*/
192	private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
193	BufferedWriter greyListedURLsWriter,
194	int recordID, int contentLength, int lineCount,
195	String recordURI, String record)
196	{
197	System.err.println("CrawlID: CC-MAIN-" + this.crawlID
198	+ " WET #" + this.WETFileID
199	+ " record #" + recordID
200	+ " - contentLength: " + contentLength
201	+ " - lineCount: " + lineCount);
202	System.err.println("URI: " + recordURI);
203	//System.err.println(record);
204	//System.err.println("--------------------------");
205
206	File parentFolder = null;
207
208	if(batchProcessor.isBlacklisted(recordURI)) {
209
210	// explicit whitelisting overrides blacklisting
211	if(batchProcessor.isWhitelisted(recordURI)) {
212	parentFolder = batchProcessor.keepFolder; //tentative
213	}
214	// if not whitelisted, then greylisting still overrides blacklisting
215	else if(batchProcessor.isGreylisted(recordURI)) {
216	parentFolder = batchProcessor.greyListedFolder;
217	System.err.println("@@@GREYLISTED");
218	}
219	else { // url was only blacklisted
220	parentFolder = batchProcessor.discardFolder;
221	System.err.println("@@@DISCARDING - blacklisted");
222	}
223	}
224	else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
225	// explicit whitelisting overrides greylisting
226	if(batchProcessor.isWhitelisted(recordURI)) {
227	parentFolder = batchProcessor.keepFolder; // tentative
228	}
229	else {
230	parentFolder = batchProcessor.greyListedFolder;
231	System.err.println("@@@GREYLISTED");
232	}
233	}
234
235	// If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
236	// it still can't be in the keep list as it needs further inspection:
237	// it needs sufficient content for language analysis.
238	// We don't care about the combination of number of lines and content-length,
239	// we just care about the number of "valid words" as defined by us.
240	if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed \|\| parentFolder == null
241
242	// If a web page's WET record contains a certain minimum number of words,
243	// we will think it's a meaningful web page and has sufficient content for text analysis
244	// to have been successful. Cut off values at present are:
245	// - a minimum of 20 words
246	// - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
247	// to words having been glued together. This is used by irrelevant sites and moreover
248	// can't be analysed for language, so may not be actually MRI.
249
250	// Though StringTokenizer still in use, as seen in discussion at
251	// https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
252	// String.split(regex) seems better for splitting on general whitespace
253	String[] allWords = record.split("\\s");
254	int validWordCount = 0;
255	int numCamelCaseWords = 0;
256	for(int i = 0; i < allWords.length; i++) {
257	String word = allWords[i];
258
259	// throw away if n words contain camelcase, which is another case of words glued together
260	// For now, we'll only skip camelcased words in our count of valid words
261	if(word.matches(".[a-z][A-Z].") && word.length() >= 5) {
262	numCamelCaseWords++;
263	}
264	// In Maori, word length of 1 is not uncommon
265	// but let's skip camelcased words when counting valid words
266	else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
267	validWordCount++;
268	}
269	}
270
271
272	/*
273	// dump if too many camelcase words (ideally keep no WET record of that kind?)
274	if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
275	parentFolder = batchProcessor.discardFolder;
276	System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
277	}
278	else*/
279	// For now, don't discount content with too many camelcased words
280	// Just focus on whether there are a sufficient number of valid words
281	// (camelcased words are however still ignored in our count of valid words)
282	if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
283	parentFolder = batchProcessor.keepFolder;
284	System.err.println("@@@KEEPING");
285	}
286	}
287	// if parentFolder still not set, it means that the content length/num words or lines
288	// were insufficient, so meant to be discarded
289	if(parentFolder == null) {
290	parentFolder = batchProcessor.discardFolder;
291	System.err.println("@@@DISCARDING");
292	}
293
294	try {
295	if (parentFolder == batchProcessor.keepFolder) {
296	keepURLsWriter.write(recordURI + "\n");
297	} else if (parentFolder == batchProcessor.greyListedFolder) {
298	greyListedURLsWriter.write(recordURI + "\n");
299	} else {
300	discardURLsWriter.write(recordURI + "\n");
301	}
302	} catch(Exception e) {
303	System.err.println("Unable to write URL");
304	e.printStackTrace();
305	}
306
307	System.err.println("--------------------------");
308
309	// outFilename will look something like YYYY-##-####
310	String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
311	//= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
312	File outFile = new File(parentFolder, outFilename);
313
314	try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
315	writer.write(record);
316	// Try-with-resources examples don't call close() explicitly:
317	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
318	//writer.close();
319	} catch(IOException ioe) {
320	ioe.printStackTrace();
321	System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
322	}
323	}
324	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: