source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33552

Last change on this file since 33552 was 33552, checked in by ak19, 5 years ago
  1. Code now processes ccrawldata folder, containing each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. global file containing all domains we're going to crawl. 3. WET records we're keeping that are stored in individual files now have better filenames.
File size: 13.2 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * A single instance of the WETProcessor class can process a single unzipped warc.wet file.
 * A WETProcessor takes a warc.wet file and goes through all its WET records,
 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
 * and its URL written into a keep, discard or greylisted text file, based on:
 *
 * 1. whether it's whitelisted, else greylisted, else blacklisted
 * 2. and whether, if explicitly whitelisted or neither greylisted nor blacklisted, there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the word count, with a MAXIMUM
 * (not minimum) number of characters determining whether a string counts as a word.
 * Explicit whitelisting takes precedence over greylisting, which in turn takes precedence
 * over blacklisting.
 * However, even explicitly whitelisted urls still need to have sufficient content to end
 * up in keepURLs.txt.
 *
 * See CCWETProcessor.java for compile instructions and how to run.
 *
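 * A rough usage sketch (illustrative only; the CCWETProcessor setup is assumed to have
 * been done elsewhere, and CCWETProcessor.java is the actual driver):
 * <pre>
 *   // batchProcessor: an already-configured CCWETProcessor (construction not shown here)
 *   File wetFile = new File("MAORI-CC-2019-30-20190902100139-000000.warc.wet");
 *   WETProcessor processor = new WETProcessor(wetFile, "2019-30", batchProcessor);
 *   int numRecords = processor.processWETFile();
 * </pre>
 *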
*/
public class WETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
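
    // For reference, a WET record in a warc.wet file looks roughly like the following
    // (the values here are made up and the exact set of WARC headers can vary between crawls);
    // this is the structure the parsing in processWETFile() below relies on:
    //
    //   WARC/1.0
    //   WARC-Type: conversion
    //   WARC-Target-URI: http://www.example.org/index.html
    //   WARC-Date: 2019-09-02T10:01:39Z
    //   Content-Type: text/plain
    //   Content-Length: 1234
    //
    //   <extracted plain-text content of the web page>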

    private final String crawlID;
    private final int WETFileID;
    private final File inFile;

    private int recordCount = 0;

    /** Handle to the CCWETProcessor that processes a whole set of WET files,
     * whereas a WETProcessor instance processes only a single WET file
     * (containing multiple WET records).
     */
    private CCWETProcessor batchProcessor;

    /**
     * A WETProcessor processes a single warc.wet file containing multiple WET records
     * whose text has been identified as being primarily of langcode=mri. Each individual WET
     * record is written out to a uniquely named file in the keep, discard or greylisted folder,
     * depending on the applicable white/grey/blacklists and on whether the WET record's body
     * (excluding WARC headers) contains sufficient content.
     * @param inFile the warc.wet file whose WET records are to be processed
     * @param crawlID the ID of the commoncrawl containing this warc.wet file,
     * of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##), which will be used
     * as a prefix to create unique filenames when storing each individual record
     * @param batchProcessor the CCWETProcessor coordinating the processing of the whole set
     * of WET files, which provides the shared keep/discard/greylisted folders, URL files
     * and white/grey/blacklist checks
     */
    public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;

        this.inFile = inFile;
        this.crawlID = crawlID;

        // We just want a unique recordID prefix, which we get by concatenating
        // the commoncrawl ID with the wet file name's numeric suffix and the record count within the file:
        // the inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
        // and the suffix is everything after the last hyphen and without the file extension,
        // so "000000" in our example. This is converted into a number and padded to 2 digits, e.g. 00.
        // The crawlID is then prefixed and the 4-digit padded recordCount (which keeps track
        // of the current WET record) is suffixed, to get a unique filename to store each WET record into.
        // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
        // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)

        String fileID = inFile.getName();
        //System.err.println("*** Processing wetfile: " + fileID);
        // take everything after the last hyphen, e.g. "000000.warc.wet",
        // then strip the file extension and parse the remaining digits as a number
        fileID = fileID.substring(fileID.lastIndexOf("-")+1);
        fileID = fileID.substring(0, fileID.indexOf("."));
        this.WETFileID = Integer.parseInt(fileID);
    }

    /**
     * Processes all the WET records of a single warc.wet file
     * @return the number of WET records in the file (excluding the leading warcinfo record)
     */
    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;
        File greyListedFile = this.batchProcessor.greyListedFile;

        StringBuilder record = null;
        String line = null;
        boolean readingRecord = false;

        String WARCtargetURI = "";

        //int recordCount = 0;

        int contentLength = -1; // of record
        int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)

        // read from WETfile
        try (
             BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
             BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
             BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
             BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
             ) {

            while((line = reader.readLine()) != null) { // readLine removes newline separator

                if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                    readingRecord = false;
                    record = null; // drop this record, which is just an info record, not an actual web page's text
                    recordCount--;
                    continue;
                }

                if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
                    // process any previous record
                    if(record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                        record = null;
                        contentLength = -1;
                        lineCount = -1;
                    }

                    recordCount++;
                    // get ready to start a new record
                    readingRecord = true;
                    record = new StringBuilder();
                }

                if(readingRecord) { // append current line to current record

                    if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                        // get and store the value
                        WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                    }

                    record.append(line + "\n"); // add back (unix style) line ending

                    // If we've started counting lines (lineCount >= 0), which happens only
                    // once the current line is past the WARC/WET headers and we're into the
                    // actual body portion of the WET record, then increment the line counter
                    // for every non-empty line.
                    if(lineCount >= 0 && !line.trim().equals("")) {
                        lineCount++;
                    }
                    else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                        String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                        contentLength = Integer.parseInt(contentLengthStr);
                        lineCount = 0;
                    }

                }

            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if(record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
                record = null;
            }

        } catch(IOException ioe) {
            ioe.printStackTrace();
        }

        return recordCount;
    }

    public int getRecordCount() { return this.recordCount; }

    /**
     * Determines whether a WET record belongs in the keep, discard or greylisted pile,
     * based on the white/grey/blacklists and on whether the record body contains enough
     * text (currently measured as a minimum number of valid words).
     * Then writes out the WET record to a uniquely named file in the keep, discard or
     * greylisted folder, and writes out the WET record's URL to the corresponding
     * keep, discard or greylisted URLs text file.
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  BufferedWriter greyListedURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        System.err.println("CrawlID: CC-MAIN-" + this.crawlID
                           + " WET #" + this.WETFileID
                           + " record #" + recordID
                           + " - contentLength: " + contentLength
                           + " - lineCount: " + lineCount);
        System.err.println("URI: " + recordURI);
        //System.err.println(record);
        //System.err.println("--------------------------");

        File parentFolder = null;

        if(batchProcessor.isBlacklisted(recordURI)) {

            // explicit whitelisting overrides blacklisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; //tentative
            }
            // if not whitelisted, then greylisting still overrides blacklisting
            else if(batchProcessor.isGreylisted(recordURI)) {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
            else { // url was only blacklisted
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - blacklisted");
            }
        }
        else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
            // explicit whitelisting overrides greylisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; // tentative
            }
            else {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
        }

        // Even if the URL was not blacklisted/greylisted, or was explicitly whitelisted,
        // it still can't go straight onto the keep list: it needs further inspection,
        // since it must have sufficient content for language analysis.
        // We don't care about the combination of number of lines and content-length,
        // we just care about the number of "valid words" as defined by us.
        if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whitelisted, or parentFolder == null

            // If a web page's WET record contains a certain minimum number of words,
            // we take it to be a meaningful web page with sufficient content for text analysis
            // to have been successful. Cut-off values at present are:
            // - a minimum of 20 words
            // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
            //   to words having been glued together. Such glued-together text tends to come from
            //   irrelevant sites and, moreover, can't be analysed for language, so may not actually be MRI.
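            // An illustrative sketch of how that plays out (example text made up): in
            // "kia ora koutou katoa" each of the 4 space-separated tokens is between 1 and 15
            // chars, so all 4 count as valid words, whereas a glued-together token like
            // "homecontactaboutnewsevents" exceeds the maximum word length and is not counted.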

            // Though StringTokenizer is still in use, as seen in the discussion at
            // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
            // String.split(regex) seems better for splitting on general whitespace
            String[] allWords = record.split("\\s");
            int validWordCount = 0;
            int numCamelCaseWords = 0;
            for(int i = 0; i < allWords.length; i++) {
                String word = allWords[i];

                // Camelcased words are another case of words having been glued together.
                // For now, we only skip camelcased words in our count of valid words
                // (we don't yet discard the whole record because of them).
                if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
                    numCamelCaseWords++;
                }
                // In Maori, a word length of 1 is not uncommon,
                // so any non-camelcased word of 1 to MAX_WORD_LENGTH chars counts as valid
                else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
                    validWordCount++;
                }
            }


            /*
            // dump if too many camelcase words (ideally keep no WET record of that kind?)
            if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
            }
            else*/
            // For now, don't discount content with too many camelcased words
            // Just focus on whether there are a sufficient number of valid words
            // (camelcased words are however still ignored in our count of valid words)
            if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
                parentFolder = batchProcessor.keepFolder;
                System.err.println("@@@KEEPING");
            }
        }
        // if parentFolder is still not set, it means the record did not contain
        // a sufficient number of valid words, so it is meant to be discarded
        if(parentFolder == null) {
            parentFolder = batchProcessor.discardFolder;
            System.err.println("@@@DISCARDING");
        }

        try {
            if (parentFolder == batchProcessor.keepFolder) {
                keepURLsWriter.write(recordURI + "\n");
            } else if (parentFolder == batchProcessor.greyListedFolder) {
                greyListedURLsWriter.write(recordURI + "\n");
            } else {
                discardURLsWriter.write(recordURI + "\n");
            }
        } catch(Exception e) {
            System.err.println("Unable to write URL");
            e.printStackTrace();
        }

        System.err.println("--------------------------");

        // outFilename will look something like YYYY-##-####
        String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
        //= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
        File outFile = new File(parentFolder, outFilename);

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
            writer.write(record);
            // Try-with-resources examples don't call close() explicitly:
            // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
            //writer.close();
        } catch(IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
        }
    }
}