source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33501

Last change on this file since 33501 was 33501, checked in by ak19, 5 years ago

Refactored code into 2 classes: The existing WETProcessor, which processes a single WETFile which can contain a large number of WET records. And the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. Refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.

File size: 12.7 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.Set;
9import java.util.TreeSet;
10
11import org.apache.log4j.Logger;
12
13/**
14 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
15 * the WET records in each, putting each WET record into a file. Each file is put into a
16 * keep or discard folder, based on content-length and number of lines.
17 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
18 *
19 * To compile, including the jars in lib/ for compiling.
20 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
21 *
22 * To run, passing the log4j and other properties files in conf/ folder:
23 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
24 *
25 * e.g.
26 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
27 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
28 *
29*/
30
31public class CCWETProcessor {
32 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
33
34 // Properties shared across WETProcessor instances
35 public final int MAX_WORD_LENGTH;
36 public final int MIN_NUM_WORDS;
37 public final int MAX_WORDS_CAMELCASE;
38
39 private Properties configProperties = new Properties();
40
41 // File paths shared across WETProcessor instances
42 public final File WETFilesDir;
43 public final File outputFolder;
44 public final File discardFolder;
45 public final File keepFolder;
46 public final File keepURLsFile;
47 public final File discardURLsFile;
48
49 // Keep a count of all the records that all WETProcessors instantiated
50 // by our main method combined have processed
51 private int totalRecordCount = 0;
52
53 private int wetFileCount = 0;
54
55 public CCWETProcessor(File inFolder, File outFolder) {
56 this.WETFilesDir = inFolder;
57 this.outputFolder = outFolder;
58
59 // load up the properties from the config file
60 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
61 configProperties = new Properties();
62 configProperties.load(infile);
63 //infile.close();
64
65 } catch(Exception e) {
66 System.err.println("Exception attempting to read properties from config.properties.");
67 logger.error("Exception attempting to read properties from config.properties.");
68 e.printStackTrace();
69 }
70
71 if(configProperties.size() == 0) {
72 System.err.println("*** Warning: no values read into config properties. Using defaults.");
73 }
74
75 MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
76 MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
77 MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
78
79
80 this.discardFolder = new File(outFolder, "discard");
81 if(!discardFolder.exists()) {
82 discardFolder.mkdir();
83 }
84 this.keepFolder = new File(outFolder, "keep");
85 if(!keepFolder.exists()) {
86 keepFolder.mkdir();
87 }
88
89 this.keepURLsFile = new File(outFolder, "keepURLs.txt");
90 if(keepURLsFile.exists() && !keepURLsFile.delete()) {
91 System.err.println("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
92 //return;
93 }
94 this.discardURLsFile = new File(outFolder, "discardURLs.txt");
95 if(discardURLsFile.exists() && !discardURLsFile.delete()) {
96 System.err.println("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
97 //return;
98 }
99
100 }
101
102 /**
103 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
104 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
105 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
106 */
107 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
108 // Maintain Sets of unique domains and urls
109 // TreeSet: by default, "the elements are ordered using their natural ordering"
110 // (or by a Comparator provided at set creation time).
111 // Whereas HashSet doesn't guarantee ordering.
112 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
113
114 Set<String> domainsSet = new TreeSet<String>();
115 Set<String> urlsSet = new TreeSet<String>();
116
117 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
118
119 try (
120 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
121 ) {
122
123 // read a URL at a time from urlsFile
124 String url = null;
125 String domain = null;
126 while((url = reader.readLine()) != null) { // readLine removes newline separator
127
128 // work out domain. This retains any www. or subdomain prefix:
129 int startIndex = url.indexOf("//"); // http:// or https:// prefix
130 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
131 domain = url.substring(startIndex);
132 int endIndex = domain.indexOf("/");
133 if(endIndex == -1) endIndex = domain.length();
134 domain = domain.substring(0, endIndex);
135
136 //if(!domainsMap.containsKey(domain)) {
137 urlsSet.add(url);
138 domainsSet.add(domain);
139 //}
140 }
141 } catch (IOException ioe) {
142 ioe.printStackTrace();
143 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
144 }
145
146 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
147 Iterator<String> i = urlsSet.iterator();
148 while(i.hasNext()) {
149 String url = i.next();
150 seedURLsWriter.write(url + "\n");
151 }
152
153 } catch (IOException ioe) {
154 ioe.printStackTrace();
155 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
156 }
157
158 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
159 Iterator<String> i = domainsSet.iterator();
160 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
161 while(i.hasNext()) {
162 String domain = i.next();
163 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
164 urlFilterWriter.write(domain + "\n");
165 }
166
167 } catch (IOException ioe) {
168 ioe.printStackTrace();
169 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
170 }
171 }
172
173 /*
174 public boolean isBlacklisted(String url) {
175 return false;
176 }
177 */
178
179 public boolean isGreylisted(String url) {
180 // alexa top sites and auto-translated product sites
181 return false;
182 }
183
184 /**
185 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
186 * whether it is in the discard list.
187 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
188 * By not having this method deal with actual regex for filters, this has the advantage that
189 * we don't have to remember to escape or double escape each filter to turn it into a regex.
190 */
191 //public boolean isInDiscardFilter(String url) {
192
193 public boolean isBlacklisted(String url) {
194 String discardFilterFile = "url-discard-filter.txt"; // in conf folder
195
196 try (
197 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
198 ) {
199 String filter = null;
200 while((filter = reader.readLine()) != null) {
201 if(filter.trim().equals("")) {
202 continue;
203 }
204 //System.err.println("Got filter: " + filter);
205 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
206 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
207 }
208 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
209 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
210 return true;
211 }
212 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
213 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
214 return true;
215 }
216 else if(url.contains(filter)) {
217 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
218 return true;
219 }
220
221 }
222
223 } catch (IOException ioe) {
224 ioe.printStackTrace();
225 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
226 }
227
228 return false;
229 }
230
231 /** Maintain a count of all WET files processed. */
232 public void setWETFileCount(int count) { this.wetFileCount = count; }
233
234 /** Maintain a count of all WET records processed. */
235 //public int getRecordCount() { return this.totalRecordCount; }
236 //public void addToRecordCount(int count) { this.totalRecordCount += count; }
237 public void setRecordCount(int count) { this.totalRecordCount = count; }
238
239 public static void printUsage() {
240 System.err.println("Run this program as:");
241 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
242 }
243
244 /** Filename filter to only list warc.wet files or else warc.wet.gz files
245 * for which unzipped warc.wet equivalents don't yet exist.
246 */
247 private static class WETFilenameFilter implements FilenameFilter {
248
249 public boolean accept(File dir, String name) {
250 if(name.endsWith(".warc.wet")) {
251 logger.debug("Will include " + name + " for processing.");
252 return true;
253 }
254
255 if(name.endsWith(".warc.wet.gz")) {
256 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
257 File unzippedVersion = new File(dir, nameWithoutGZext);
258 if(unzippedVersion.exists()) {
259 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
260 logger.debug("Skipping " + name);
261 return false; // don't count gzipped version if unzipped version exists.
262 }
263 else {
264 logger.debug("Only zipped version " + name + " exists.");
265 return true; // No unzipped version, so have to work with gzipped version
266 }
267 }
268
269 // we're not even interested in any other file extensions
270 logger.debug("Not a WET file. Skipping " + name);
271 return false;
272 }
273 }
274
275
276 public static void main(String[] args) {
277 if(args.length != 2) {
278 printUsage();
279 return;
280 }
281
282
283 File WETFileDir = new File(args[0]);
284 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
285 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
286 return;
287 }
288
289 File outFolder = new File(args[1]);
290 if(!outFolder.exists() || !outFolder.isDirectory()) {
291 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
292 return;
293 }
294
295 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
296
297 //ccWETFilesProcessor.processAllWETFiles();
298
299 // Will list all the warc.wet files in the input directory or else their gzipped versions
300 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
301
302 int wetRecordCount = 0;
303 int wetFileCount = 0;
304
305 for(int i = 0; i < WETFiles.length; i++) {
306 File WETFile = WETFiles[i];
307 logger.debug("Processing WETfile: " + WETFile);
308
309 // Any .gz files listed means they haven't been unzipped yet. So unzip.
310 String WETFilename = WETFile.toString();
311 if(WETFilename.endsWith(".gz")) {
312 File GZippedWETFile = WETFile;
313 String WETGZippedFilename = WETFilename;
314 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
315
316 WETFile = new File(WETFilename);
317 Utility.unzipFile(GZippedWETFile, WETFile);
318 }
319 // hereafter all WETFiles should refer to the unzipped version
320 // Check the unzipped WETFile exists
321
322 if(!WETFile.exists() || !WETFile.isFile()) {
323 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
324 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
325 return;
326 }
327
328 // Finally, we can process this WETFile's records into the keep and discard pile
329 wetFileCount++;
330 logger.debug("Off to process " + WETFile);
331 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
332 wetFileProcessor.processWETFile();
333 wetRecordCount += wetFileProcessor.getRecordCount();
334 }
335
336 // for information purposes
337 ccWETFilesProcessor.setWETFileCount(wetFileCount);
338 ccWETFilesProcessor.setRecordCount(wetRecordCount);
339
340 File seedURLsFile = new File(outFolder, "seedURLs.txt");
341 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
342 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
343
344 return;
345
346 }
347}
Note: See TracBrowser for help on using the repository browser.