source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33517

Last change on this file since 33517 was 33517, checked in by ak19, 5 years ago
  1. Blacklists were introduced so that too many instances of camelcased words no longer disqualify WET records from inclusion in the keep pile. Camelcasing of words is still checked, as such words don't get counted as valid words in the valid word count that determines whether a WET record has sufficient content. 2. Some more commenting.
File size: 12.0 KB
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
 * A WETProcessor takes a warc.wet file and goes through all its WET records,
 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
 * and its URL written into a keep, discard or greylisted text file, based on:
 *
 * 1. whether the URL is whitelisted, greylisted or blacklisted;
 * 2. and, if explicitly whitelisted or neither greylisted nor blacklisted, whether there's
 * enough content. Formerly, content-length and number of lines were used to determine if
 * the content was sufficient. Now it's just the valid word count, where a MAXIMUM
 * (not MINIMUM) number of characters determines whether a string counts as a word.
 * Explicit whitelisting takes precedence over greylisting, which in turn takes precedence
 * over blacklisting.
 * However, even explicitly whitelisted URLs still need to have sufficient content to end
 * up in keepURLs.txt.
 *
 * See CCWETProcessor.java for compile instructions and how to run.
 *
 */
public class WETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    private final String WETFileID;
    private final File inFile;

    private int recordCount = 0;

    /**
     * Handle to the CCWETProcessor that processes a set of WET files,
     * whereas a WETProcessor instance processes only a single WET file
     * containing multiple WET records.
     */
    private CCWETProcessor batchProcessor;

    /**
     * A WETProcessor processes a single warc.wet file containing multiple WET records
     * whose text has been identified as primarily langcode=mri. Each individual WET record is
     * written out to a uniquely named file in either the keep or discard folder, depending on
     * whether the WET record contains sufficient content (formerly judged by content length and
     * number of lines of actual content, excluding WARC headers; now by valid word count).
     */
    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;

        this.inFile = inFile;
        // We just want a unique recordID prefix, which we get from the wet file name suffix:
        // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
        // the prefix will be everything after the last hyphen and without file extension,
        // so "000000" in our example. Then suffix the recordCount (keeping track of the current
        // WET record) to get a unique filename to store each WET record into.

        String fileID = inFile.getName();
        fileID = fileID.substring(fileID.lastIndexOf("-")+1);
        fileID = fileID.substring(0, fileID.indexOf("."));
        this.WETFileID = fileID;
    }

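    /*
     * A minimal usage sketch (not part of the class): the CCWETProcessor driver is assumed to
     * iterate over the unzipped warc.wet files in its input folder and hand each one to a
     * WETProcessor, roughly as follows. The names inputFolder and ccWETProcessor are
     * illustrative assumptions only; see CCWETProcessor.java for the actual driver code.
     *
     *   File[] wetFiles = inputFolder.listFiles();           // hypothetical input folder
     *   for(File wetFile : wetFiles) {
     *       WETProcessor processor = new WETProcessor(wetFile, ccWETProcessor);
     *       int numRecords = processor.processWETFile();     // number of WET records processed
     *       System.err.println(wetFile.getName() + ": " + numRecords + " records");
     *   }
     */
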
    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;
        File greyListedFile = this.batchProcessor.greyListedFile;

        StringBuilder record = null;
        String line = null;
        boolean readingRecord = false;

        String WARCtargetURI = "";

        //int recordCount = 0;

        int contentLength = -1; // of record
        int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)

        // read from WETfile
        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
            BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
            BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
            BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
        ) {

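            // A rough sketch of the layout this loop assumes for each WET record, based on the
            // Common Crawl WET format (the URI below is only an illustrative placeholder):
            //   WARC/1.0
            //   WARC-Type: conversion
            //   WARC-Target-URI: http://example.org/somepage
            //   Content-Length: 1234
            //   ... further WARC headers, then a blank line, then the extracted plain text ...
            // Typically the first record in the file is a "WARC-Type: warcinfo" record describing
            // the crawl rather than a web page, which is why it gets dropped below.
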
            while((line = reader.readLine()) != null) { // readLine removes newline separator

                if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                    readingRecord = false;
                    record = null; // drop this record, which is just an info record, not an actual web page's text
                    recordCount--;
                    continue;
                }

                if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
                    // process any previous record
                    if(record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                        record = null;
                        contentLength = -1;
                        lineCount = -1;
                    }

                    recordCount++;
                    // get ready to start a new record
                    readingRecord = true;
                    record = new StringBuilder();
                }

                if(readingRecord) { // append current line to current record

                    if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                        // get and store the value
                        WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                    }

                    record.append(line + "\n"); // add back (unix style) line ending

                    // Only increment the line counter if the line is non-empty AND we've already
                    // started counting lines, which happens only once the current line is past the
                    // WARC/WET headers and we're into the actual body portion of the WET record.
                    if(lineCount >= 0 && !line.trim().equals("")) {
                        lineCount++;
                    }
                    else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                        String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                        contentLength = Integer.parseInt(contentLengthStr);
                        lineCount = 0;
                    }

                }

            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if(record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
                record = null;
            }

        } catch(IOException ioe) {
            ioe.printStackTrace();
        }

        return recordCount;
    }

    public int getRecordCount() { return this.recordCount; }

    /**
     * Determines whether a WET record belongs in the keep, greylisted or discard pile, based on
     * its URL's white/grey/blacklisting status and on whether the record body contains enough text.
     * Then writes out the WET record to a uniquely named file in the chosen folder,
     * and writes the WET record's URL into the corresponding keep, greylisted or discard URLs file.
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  BufferedWriter greyListedURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        System.err.println("WET #" + this.WETFileID + " record #" + recordID
                           + " - contentLength: " + contentLength
                           + " - lineCount: " + lineCount);
        System.err.println("URI: " + recordURI);
        //System.err.println(record);
        //System.err.println("--------------------------");

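        // e.g. recordID 7 becomes "0007.txt"; combined with the WETFileID prefix further below,
        // the record ends up in a file named something like 000000-0007.txt.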
        String paddedFileName = String.format("%04d.txt", recordID);

        File parentFolder = null;


        if(batchProcessor.isBlacklisted(recordURI)) {

            // explicit whitelisting overrides blacklisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; //tentative
            }
            // if not whitelisted, then greylisting still overrides blacklisting
            else if(batchProcessor.isGreylisted(recordURI)) {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
            else { // url was only blacklisted
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - blacklisted");
            }
        }
        else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
            // explicit whitelisting overrides greylisting
            if(batchProcessor.isWhitelisted(recordURI)) {
                parentFolder = batchProcessor.keepFolder; // tentative
            }
            else {
                parentFolder = batchProcessor.greyListedFolder;
                System.err.println("@@@GREYLISTED");
            }
        }

        // Even if the URL was not blacklisted/greylisted, or was explicitly whitelisted,
        // it still can't go straight into the keep list, as it needs further inspection:
        // it needs sufficient content for language analysis.
        // We don't care about the combination of number of lines and content-length;
        // we just care about the number of "valid words" as defined by us.
        if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whitelisted, or parentFolder == null

            // If a web page's WET record contains a certain minimum number of words,
            // we consider it a meaningful web page with sufficient content for text analysis
            // to have been successful. Cut-off values at present are:
            // - a minimum of 20 words
            // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
            //   to words having been glued together. Glued-together text tends to appear on
            //   irrelevant sites and moreover can't be analysed for language, so it may not actually be MRI.

            // Though StringTokenizer is still in use, as seen in the discussion at
            // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
            // String.split(regex) seems better for splitting on general whitespace
            String[] allWords = record.split("\\s");
            int validWordCount = 0;
            int numCamelCaseWords = 0;
            for(int i = 0; i < allWords.length; i++) {
                String word = allWords[i];

                // Camelcased words are another case of words having been glued together.
                // For now, we only skip camelcased words in our count of valid words
                // rather than throwing away the whole record.
                if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
                    numCamelCaseWords++;
                }
                // In Maori, a word length of 1 is not uncommon,
                // but camelcased words are skipped when counting valid words
                else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
                    validWordCount++;
                }
            }
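
            // A few illustrative (hypothetical) examples of how the loop above classifies tokens,
            // assuming MAX_WORD_LENGTH is the 15 chars mentioned in the comment above:
            //   "kia"                  - 3 chars, no camelcasing             -> counted as a valid word
            //   "HaereMaiKoutou"       - matches [a-z][A-Z] and length >= 5  -> counted as camelcased, not valid
            //   "aaaaaaaaaaaaaaaaaaaa" - 20 chars, over the length cap       -> neither camelcased nor valid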


            /*
            // dump if too many camelcase words (ideally keep no WET record of that kind?)
            if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
            }
            else*/
            // For now, don't discount content with too many camelcased words.
            // Just focus on whether there are a sufficient number of valid words
            // (camelcased words are however still ignored in our count of valid words)
            if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
                parentFolder = batchProcessor.keepFolder;
                System.err.println("@@@KEEPING");
            }
        }
        // If parentFolder is still not set, it means the record didn't have enough valid words
        // (formerly: enough content length/lines), so it is meant to be discarded.
        if(parentFolder == null) {
            parentFolder = batchProcessor.discardFolder;
            System.err.println("@@@DISCARDING");
        }

        try {
            if (parentFolder == batchProcessor.keepFolder) {
                keepURLsWriter.write(recordURI + "\n");
            } else if (parentFolder == batchProcessor.greyListedFolder) {
                greyListedURLsWriter.write(recordURI + "\n");
            } else {
                discardURLsWriter.write(recordURI + "\n");
            }
        } catch(Exception e) {
            System.err.println("Unable to write URL");
            e.printStackTrace();
        }

        System.err.println("--------------------------");

        File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
            writer.write(record);
            // Try-with-resources examples don't call close() explicitly:
            // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
            //writer.close();
        } catch(IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
        }
    }
}