source: other-projects/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33652

Last change on this file since 33652 was 33615, checked in by ak19, 5 years ago
  1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some Mongodb links under MoreReading.
File size: 13.1 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.Iterator;
7import java.util.Set;
8import java.util.TreeSet;
9
10import org.apache.log4j.Logger;
11
12/**
13 * A single instance of the WETProcessor class can process a single unzipped warc.wet file.
14 * A WETProcessor take a warc.wet file and goes through all its WET records,
15 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
16 * and its url listed written into a keep, discard or greylisted text file, based on:
17 *
18 * 1. whether it's whitelisted, else greylisted else blacklisted
19 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
20 * enough content. Formerly, content-length and number of lines were used to determine if
21 * the content was sufficient. Now it's just word count and number of MAX characters
22 * (not MINIMUM characters) that determine a string is a word.
23 * Explicit whitelisting has precedence over greylisting and which takes precedence
24 * over blacklisting in turn.
25 * However, even explicitly whitelisted urls still need to have sufficient content to end
26 * up in keepURLs.txt.
27 *
28 * See CCWETProcessor.java for compile instructions and how to run.
29 *
30*/
31public class WETProcessor {
32 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
33
34 // WARC WET header lines and header line prefixes of interest
35 static final String WARC_RECORD_START = "WARC/1.0";
36 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
37 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
38 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
39
40 private final String crawlID;
41 private final int WETFileID;
42 private final File inFile;
43
44 private int recordCount = 0;
45
46 /** Handle to a CCWETProcessor that processes a set of WET files
47 * Whereas a WETProcessor instance only processes a single WET file
48 * containing multiple WET records.
49 */
50 private CCWETProcessor batchProcessor;
51
52 /**
53 * WET processor processes a single warc.wet file containing multiple WET records
54 * containing text identified as primary langcode=mri. Each individual WET record is written
55 * out to a uniquely named file in either the keep or discard folder depending on the WET
56 * record's content length and number of lines of actual content (excluding WARC headers).
57 * @param inFile the warc.wet file whose WET records are to be processed
58 * @param crawlID is the ID of the commoncrawl containing this warc.wet file
59 * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
60 * as prefix to create unique filenames when storing each individual record).
61 */
62 public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
63 this.batchProcessor = batchProcessor;
64
65 this.inFile = inFile;
66 this.crawlID = crawlID;
67
68 // We just want a unique recordID prefix, which we get from concatenating
69 // the commoncrawl ID with the wet file name suffix and record count within the file:
70 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
71 // the prefix will be everything after the last hyphen and without file extension,
72 // so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
73 // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
74 // of the current WET record to get a unique filename to store each WET record into.
75 // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
76 // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)
77
78 String fileID = inFile.getName();
79 //logger.debug("*** Processing wetfile: " + fileID);
80 fileID = fileID.substring(fileID.lastIndexOf("0")+1);
81 if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
82 this.WETFileID = 0;
83 } else {
84 fileID = fileID.substring(0, fileID.indexOf("."));
85 this.WETFileID = Integer.parseInt(fileID);
86 }
87 }
88
89 /**
90 * Processes all the WET records of a single warc.wet file
91 */
92 public int processWETFile() {
93 File keepURLsFile = this.batchProcessor.keepURLsFile;
94 File discardURLsFile = this.batchProcessor.discardURLsFile;
95 File greyListedFile = this.batchProcessor.greyListedFile;
96
97 StringBuilder record = null;
98 String line = null;
99 boolean readingRecord = false;
100
101 String WARCtargetURI = "";
102
103 //int recordCount = 0;
104
105 int contentLength = -1; // of record
106 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
107
108 // read from WETfile
109 try (
110 BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
111 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
112 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
113 BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
114 ) {
115
116 while((line = reader.readLine()) != null) { // readLine removes newline separator
117
118 if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
119 readingRecord = false;
120 record = null; // drop this record, which is just an info record not actual web page's text
121 recordCount--;
122 continue;
123 }
124
125 if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
126 // process any previous record
127 if(record != null) {
128 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
129 recordCount, contentLength, lineCount,
130 WARCtargetURI, record.toString());
131 record = null;
132 contentLength = -1;
133 lineCount = -1;
134 }
135
136 recordCount++;
137 // get ready to start a new record
138 readingRecord = true;
139 record = new StringBuilder();
140 }
141
142 if(readingRecord) { // append current line to current record
143
144 if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
145 // get and store the value
146 WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
147 }
148
149 record.append(line + "\n"); // add back (unix style) line ending
150
151 // if the line is non-empty
152 // AND if we've started counting lines, which happens only when the current
153 // line is past WARC/WET headers and we're into the actual body portion
154 // of the WET record,
155 // start incrementing the line counter.
156 if(lineCount >= 0 && !line.trim().equals("")) {
157 lineCount++;
158 }
159 else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
160 String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
161 contentLength = Integer.parseInt(contentLengthStr);
162 lineCount = 0;
163 }
164
165 }
166
167 }
168
169 // flush the last record. If it was a warcinfo record, record would be null here
170 if(record != null) {
171 processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
172 recordCount, contentLength, lineCount,
173 WARCtargetURI, record.toString());
174 record = null;
175 }
176
177 } catch(IOException ioe) {
178 ioe.printStackTrace();
179 }
180
181 return recordCount;
182 }
183
184 public int getRecordCount() { return this.recordCount; }
185
186 /**
187 * Determines if a WET record belongs in the keep or discard pile depending on if it
188 * contains enough text, based on contentLength and line count of the record body.
189 * Then writes out the WET record to a uniquely named file in the keep or discard folder,
190 * and writes out the WET record's URL to the keepURLs.txt file or discardURLs.txt file.
191 */
192 private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
193 BufferedWriter greyListedURLsWriter,
194 int recordID, int contentLength, int lineCount,
195 String recordURI, String record)
196 {
197 logger.info("CrawlID: CC-MAIN-" + this.crawlID
198 + " WET #" + this.WETFileID
199 + " record #" + recordID
200 + " - contentLength: " + contentLength
201 + " - lineCount: " + lineCount);
202 logger.info("URI: " + recordURI);
203 //logger.debug(record);
204 //logger.info("--------------------------");
205
206 File parentFolder = null;
207
208 if(batchProcessor.isBlacklisted(recordURI)) {
209
210 // explicit whitelisting overrides blacklisting
211 if(batchProcessor.isWhitelisted(recordURI)) {
212 parentFolder = batchProcessor.keepFolder; //tentative
213 }
214 // if not whitelisted, then greylisting still overrides blacklisting
215 else if(batchProcessor.isGreylisted(recordURI)) {
216 parentFolder = batchProcessor.greyListedFolder;
217 logger.debug("@@@GREYLISTED");
218 }
219 else { // url was only blacklisted
220 parentFolder = batchProcessor.discardFolder;
221 logger.debug("@@@DISCARDING - blacklisted");
222 }
223 }
224 else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
225 // explicit whitelisting overrides greylisting
226 if(batchProcessor.isWhitelisted(recordURI)) {
227 parentFolder = batchProcessor.keepFolder; // tentative
228 }
229 else {
230 parentFolder = batchProcessor.greyListedFolder;
231 logger.debug("@@@GREYLISTED");
232 }
233 }
234
235 // If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
236 // it still can't be in the keep list as it needs further inspection:
237 // it needs sufficient content for language analysis.
238 // We don't care about the combination of number of lines and content-length,
239 // we just care about the number of "valid words" as defined by us.
240 if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null
241
242 // If a web page's WET record contains a certain minimum number of words,
243 // we will think it's a meaningful web page and has sufficient content for text analysis
244 // to have been successful. Cut off values at present are:
245 // - a minimum of 20 words
246 // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
247 // to words having been glued together. This is used by irrelevant sites and moreover
248 // can't be analysed for language, so may not be actually MRI.
249
250 // Though StringTokenizer still in use, as seen in discussion at
251 // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
252 // String.split(regex) seems better for splitting on general whitespace
253 String[] allWords = record.split("\\s");
254 int validWordCount = 0;
255 int numCamelCaseWords = 0;
256 for(int i = 0; i < allWords.length; i++) {
257 String word = allWords[i];
258
259 // throw away if n words contain camelcase, which is another case of words glued together
260 // For now, we'll only skip camelcased words in our count of valid words
261 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
262 numCamelCaseWords++;
263 }
264 // In Maori, word length of 1 is not uncommon
265 // but let's skip camelcased words when counting valid words
266 else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
267 validWordCount++;
268 }
269 }
270
271
272 /*
273 // dump if too many camelcase words (ideally keep no WET record of that kind?)
274 if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
275 parentFolder = batchProcessor.discardFolder;
276 logger.debug("@@@DISCARDING - CAMELCASED CONTENTS");
277 }
278 else*/
279 // For now, don't discount content with too many camelcased words
280 // Just focus on whether there are a sufficient number of valid words
281 // (camelcased words are however still ignored in our count of valid words)
282 if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
283 parentFolder = batchProcessor.keepFolder;
284 logger.debug("@@@KEEPING");
285 }
286 }
287 // if parentFolder still not set, it means that the content length/num words or lines
288 // were insufficient, so meant to be discarded
289 if(parentFolder == null) {
290 parentFolder = batchProcessor.discardFolder;
291 logger.debug("@@@DISCARDING");
292 }
293
294 try {
295 if (parentFolder == batchProcessor.keepFolder) {
296 keepURLsWriter.write(recordURI + "\n");
297 } else if (parentFolder == batchProcessor.greyListedFolder) {
298 greyListedURLsWriter.write(recordURI + "\n");
299 } else {
300 discardURLsWriter.write(recordURI + "\n");
301 }
302 } catch(Exception e) {
303 logger.debug("Unable to write URL");
304 e.printStackTrace();
305 }
306
307 logger.debug("--------------------------");
308
309 // outFilename will look something like YYYY-##-####
310 String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
311 //= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
312 File outFile = new File(parentFolder, outFilename);
313
314 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
315 writer.write(record);
316 // Try-with-resources examples don't call close() explicitly:
317 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
318 //writer.close();
319 } catch(IOException ioe) {
320 ioe.printStackTrace();
321 logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe);
322 }
323 }
324
325
326}
Note: See TracBrowser for help on using the repository browser.