source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java@ 33501

Last change on this file since 33501 was 33501, checked in by ak19, 5 years ago

Refactored code into 2 classes: the existing WETProcessor, which processes a single WET file that can contain a large number of WET records, and the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common crawl. The refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.

File size: 11.1 KB
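
For orientation, here is a minimal driver sketch showing how the two classes are intended to fit together, based only on what is visible in the file below: a single CCWETProcessor holds the shared configuration (keep/discard folders, URL list files, blacklist/greylist), and one WETProcessor instance is constructed per WET file. The CCWETProcessor constructor arguments shown are an assumption; only WETProcessor(File, CCWETProcessor) and processWETFile() are taken from this file.

// Hypothetical driver (sketch only): the CCWETProcessor constructor used here is assumed,
// its real signature is defined in CCWETProcessor.java, which is not shown on this page.
import java.io.File;

public class WETDriverSketch {
    public static void main(String[] args) {
        File inFolder = new File(args[0]);   // folder containing unzipped warc.wet files
        File outFolder = new File(args[1]);  // output folder for the keep/discard piles

        CCWETProcessor batchProcessor = new CCWETProcessor(inFolder, outFolder); // assumed signature

        File[] wetFiles = inFolder.listFiles((dir, name) -> name.endsWith(".warc.wet"));
        if (wetFiles != null) {
            for (File wetFile : wetFiles) {
                // one WETProcessor per WET file, all sharing the batch-level configuration
                WETProcessor processor = new WETProcessor(wetFile, batchProcessor);
                int numRecords = processor.processWETFile();
                System.err.println(wetFile.getName() + ": processed " + numRecords + " WET records");
            }
        }
    }
}
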
package org.greenstone.atea;


import java.io.*;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
 * the WET records in each, putting each WET record into a file. Each file is put into a
 * keep or discard folder, based on content-length and number of lines.
 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
 *
 * To compile, include the jars in lib/ on the classpath:
 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
 *
 * To run, pass the log4j and other properties files in the conf/ folder:
 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
 *
 * e.g.
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
 *
 */
public class WETProcessor {
    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

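    // Illustrative only (not taken from a real crawl): the general shape of a WET record
    // that the constants above are matched against. The exact set of headers per record
    // can vary; a warcinfo record (WARC-Type: warcinfo) describes the WET file itself
    // and is skipped by processWETFile() below.
    //
    //   WARC/1.0
    //   WARC-Type: conversion
    //   WARC-Target-URI: http://example.org/some/page.html
    //   WARC-Date: 2019-09-02T10:01:39Z
    //   Content-Type: text/plain
    //   Content-Length: 1234
    //
    //   <plain-text extraction of the page, usually over several lines>
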
    private final String WETFileID;
    private final File inFile;

    private int recordCount = 0;

    /** Handle to the CCWETProcessor that processes a whole set of WET files,
     * whereas a WETProcessor instance only processes a single WET file
     * containing multiple WET records.
     */
    private CCWETProcessor batchProcessor;

    /**
     * A WETProcessor processes a single warc.wet file containing multiple WET records
     * whose text has been identified as primarily langcode=mri. Each individual WET record is
     * written out to a uniquely named file in either the keep or discard folder, depending on
     * the WET record's content length and number of lines of actual content (excluding WARC headers).
     */
    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;

        this.inFile = inFile;
        // We just want a unique recordID prefix, which we get from the WET file name's suffix:
        // the inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet.
        // The prefix will be everything after the last hyphen, without the file extension,
        // so "000000" in our example. The recordCount (which keeps track of the current
        // WET record) is then suffixed to it to get a unique filename to store each WET record in.

        String fileID = inFile.getName();
        fileID = fileID.substring(fileID.lastIndexOf("-")+1);
        fileID = fileID.substring(0, fileID.indexOf("."));
        this.WETFileID = fileID;
    }

    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;

        StringBuilder record = null;
        String line = null;
        boolean readingRecord = false;

        String WARCtargetURI = "";

        //int recordCount = 0;

        int contentLength = -1; // of record
        int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)

        // read from WETfile
        try (
            BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
            BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
            BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
        ) {

            while((line = reader.readLine()) != null) { // readLine removes the newline separator

                if(line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                    readingRecord = false;
                    record = null; // drop this record, which is just an info record, not an actual web page's text
                    recordCount--;
                    continue;
                }

                if(line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means the previous WET record has finished
                    // process any previous record
                    if(record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                        record = null;
                        contentLength = -1;
                        lineCount = -1;
                    }

                    recordCount++;
                    // get ready to start a new record
                    readingRecord = true;
                    record = new StringBuilder();
                }

                if(readingRecord) { // append current line to current record

                    if(line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                        // get and store the value
                        WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                    }

                    record.append(line + "\n"); // add back the (unix-style) line ending

                    // Only increment the line counter if counting has started (lineCount >= 0),
                    // which happens once the Content-Length header has been seen, i.e. once the
                    // current line is past the WARC/WET headers and into the actual body portion
                    // of the WET record, and only for non-empty lines.
                    if(lineCount >= 0 && !line.trim().equals("")) {
                        lineCount++;
                    }
                    else if(line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                        String contentLengthStr = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                        contentLength = Integer.parseInt(contentLengthStr);
                        lineCount = 0;
                    }

                }

            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if(record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
                record = null;
            }

        } catch(IOException ioe) {
            ioe.printStackTrace();
        }

        return recordCount;
    }

    public int getRecordCount() { return this.recordCount; }

    /**
     * Determines whether a WET record belongs in the keep or discard pile, depending on whether
     * it contains enough text, based on the contentLength and line count of the record body.
     * Then writes out the WET record to a uniquely named file in the keep or discard folder,
     * and writes out the WET record's URL to the keepURLs.txt or discardURLs.txt file.
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        System.err.println("WET #" + this.WETFileID + " record #" + recordID
                           + " - contentLength: " + contentLength
                           + " - lineCount: " + lineCount);
        System.err.println("URI: " + recordURI);
        //System.err.println(record);
        //System.err.println("--------------------------");

        String paddedFileName = String.format("%04d.txt", recordID);

        File parentFolder = null;

        // want to match "product(s)" but not "production"
        //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {


        /*
        if(recordURI.contains("product") && !recordURI.contains("production")) {

            // don't want a "translated" product site/online store
            // These curiously often tend to have "product(s)" in the URL
            parentFolder = batchProcessor.discardFolder;
        }

        else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
            parentFolder = batchProcessor.keepFolder;
            System.err.println("@@@KEEPING");
        } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
            int countSpaces = 0;
            for(int i = 0; i < record.length(); i++) {
                if(record.charAt(i) == ' ') countSpaces++;
            }
            if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
                // So we have at least 500 chars (possibly on a single wrapped line)
                // containing at least 10 spaces. Such a record is also worth keeping.
                parentFolder = batchProcessor.keepFolder;
            }
        }
        */

        if(batchProcessor.isBlacklisted(recordURI)) {
            parentFolder = batchProcessor.discardFolder;
        }
        else if(batchProcessor.isGreylisted(recordURI)) { // e.g. product sites
            parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
        } else {
            // If a web page's WET record contains a certain minimum number of words,
            // we consider it a meaningful web page with sufficient content for the text analysis
            // to have been successful. The cut-off values at present are:
            // - a minimum of 20 words
            // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point
            //   to words having been glued together. This happens on irrelevant sites and, moreover,
            //   such text can't be analysed for language, so may not actually be MRI.

            // Though StringTokenizer is still in use (not deprecated), as per the discussion at
            // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated
            // String.split(regex) seems better for splitting on general whitespace.
            String[] allWords = record.split("\\s");
            int validWordCount = 0;
            int numCamelCaseWords = 0;
            for(int i = 0; i < allWords.length; i++) {
                String word = allWords[i];

                // throw away the record if too many words are camelcased,
                // which is another case of words having been glued together
                if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
                    numCamelCaseWords++;
                }

                // In Maori, a word length of 1 is not uncommon,
                // but let's skip camelcased words when counting valid words
                else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++;
            }

            // dump if too many camelcase words (ideally keep none of that kind?)
            if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
                parentFolder = batchProcessor.discardFolder;
                System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
            }
            else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
                parentFolder = batchProcessor.keepFolder;
                System.err.println("@@@KEEPING");
            }
        }
        // if parentFolder is still not set, set it to the discard pile folder
        if(parentFolder == null) {
            parentFolder = batchProcessor.discardFolder;
            System.err.println("@@@DISCARDING");
        }

        try {
            if (parentFolder == batchProcessor.keepFolder) {
                keepURLsWriter.write(recordURI + "\n");
            } else {
                discardURLsWriter.write(recordURI + "\n");
            }
        } catch(Exception e) {
            System.err.println("Unable to write URL");
            e.printStackTrace();
        }

        System.err.println("--------------------------");

        File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
            writer.write(record);
            // Try-with-resources examples don't call close() explicitly:
            // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
            //writer.close();
        } catch(IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to file " + outFile);
        }
    }
}