source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java@ 33515

Last change on this file since 33515 was 33515, checked in by ak19, 5 years ago

Removed an unused function

File size: 17.7 KB
Line 
1package org.greenstone.atea;
2
3
4import java.io.*;
5import java.util.Properties;
6import java.util.zip.GZIPInputStream;
7import java.util.Iterator;
8import java.util.HashMap;
9import java.util.Map;
10import java.util.Set;
11import java.util.TreeSet;
12
13import org.apache.log4j.Logger;
14
15/**
16 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
17 * the WET records in each, putting each WET record into a file. Each file is put into a
18 * keep or discard or greyListed folder, and its url written into a keep, discard
19 * or greylisted text file, based on
20 *
21 * 1. whether it's whitelisted, else greylisted else blacklisted
22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
23 * enough content. Formerly, content-length and number of lines were used to determine if
24 * the content was sufficient. Now it's just word count and number of MAX characters
25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
26 * in conf/config.properties.
27 *
28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
29 * into the conf folder to control any url patterns that are explicitly included or excluded or
30 * set aside for inspecting later. These filter text files don't use regexes, instead their
31 * format is:
32 * - precede URL by ^ to blacklist urls that match the given prefix
33 * - succeed URL by $ to blacklist urls that match the given suffix
34 * - ^url$ will blacklist urls that match the given url completely
35 * - Without either ^ or $ symbol, urls containing the given url will get blacklisted
36 *
37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
38 * over greylisting, which in turn takes precedence over blacklisting. However, even
39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
41 * also for nutch.
42 *
43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
44 * in the given input folder. Then use a single instance of the WETProcessor class to process
45 * each single unzipped warc.wet file.
46 *
47 * To compile, including the jars in lib/ for compiling.
48 * maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/CCWETProcessor.java
49 *
50 * To run, passing the log4j and other properties files in conf/ folder:
51 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
52 *
53 * e.g.
54 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
55 * - java -cp ".:../conf:../lib/*" org.greenstone.atea.CCWETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
56 *
57*/
58
59public class CCWETProcessor {
60 private static Logger logger = Logger.getLogger(org.greenstone.atea.CCWETProcessor.class.getName());
61
62 // Properties shared across WETProcessor instances
63 public final int MAX_WORD_LENGTH;
64 public final int MIN_NUM_WORDS;
65 public final int MAX_WORDS_CAMELCASE;
66
67 private Properties configProperties = new Properties();
68
69 // File paths shared across WETProcessor instances
70 public final File WETFilesDir;
71 public final File outputFolder;
72 public final File discardFolder;
73 public final File keepFolder;
74 public final File greyListedFolder;
75 public final File keepURLsFile;
76 public final File discardURLsFile;
77 public final File greyListedFile;
78
79 private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
80 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
81 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
82 private final Integer LIST_ENTRY_MATCHES = new Integer(3);
83
84 private HashMap<String, Integer> blackList;
85 private HashMap<String, Integer> greyList;
86 private HashMap<String, Integer> whiteList;
87
88 // Keep a count of all the records that all WETProcessors instantiated
89 // by our main method combined have processed
90 private int totalRecordCount = 0;
91
92 private int wetFileCount = 0;
93
/**
 * Sets up the state shared by all WETProcessor instances: loads config.properties
 * off the classpath, creates the keep/discard/greylisted output folders, deletes any
 * leftover URL-listing files from a previous run, and loads the black/grey/white
 * url filter lists from their conf-folder files.
 *
 * @param inFolder  folder containing the warc.wet(.gz) files to process
 * @param outFolder folder under which all output folders and files are created
 * @throws Exception if a pre-existing URL listing file cannot be deleted
 */
public CCWETProcessor(File inFolder, File outFolder) throws Exception {
    this.WETFilesDir = inFolder;
    this.outputFolder = outFolder;

    // load up the properties from the config file
    // (if the resource is missing, getResourceAsStream returns null and the
    // resulting exception is caught below, leaving configProperties empty)
    try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
	configProperties = new Properties();
	configProperties.load(infile);
	//infile.close(); // not explicitly called in examples of try-with-resources

    } catch(Exception e) {
	System.err.println("Exception attempting to read properties from config.properties.");
	logger.error("Exception attempting to read properties from config.properties.");
	e.printStackTrace();
    }

    if(configProperties.size() == 0) {
	System.err.println("*** Warning: no values read into config properties. Using defaults.");
    }

    // Tunable thresholds; the second argument is the default used when the
    // key is absent (or when the whole properties file failed to load).
    MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15"));
    MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
    MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));

    // Create the three output folders if they don't exist yet.
    // NOTE(review): mkdir() return values are ignored here; a failed mkdir
    // only surfaces later when files are written into the folder — confirm
    // whether failing fast would be preferable.
    this.discardFolder = new File(outFolder, "discard");
    if(!discardFolder.exists()) {
	discardFolder.mkdir();
    }
    this.keepFolder = new File(outFolder, "keep");
    if(!keepFolder.exists()) {
	keepFolder.mkdir();
    }

    this.greyListedFolder = new File(outFolder, "greylisted");
    if(!greyListedFolder.exists()) {
	greyListedFolder.mkdir();
    }

    // Start each run with fresh URL listing files: refuse to proceed if a
    // stale one can't be removed, since appending to it would corrupt results.
    this.keepURLsFile = new File(outFolder, "keepURLs.txt");
    if(keepURLsFile.exists() && !keepURLsFile.delete()) {
	throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
    }
    this.discardURLsFile = new File(outFolder, "discardURLs.txt");
    if(discardURLsFile.exists() && !discardURLsFile.delete()) {
	throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
    }
    this.greyListedFile = new File(outFolder, "greyListed.txt");
    if(greyListedFile.exists() && !greyListedFile.delete()) {
	throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
    }

    // Load the three url filter lists; a missing filter file just leaves
    // the corresponding map empty (see initURLFilterList).
    System.err.println("Loading blacklist.");
    blackList = new HashMap<String, Integer>();
    initURLFilterList(blackList, "url-blacklist-filter.txt");
    System.err.println("Loading greylist.");
    greyList = new HashMap<String, Integer>();
    initURLFilterList(greyList, "url-greylist-filter.txt");
    System.err.println("Loading whitelist.");
    whiteList = new HashMap<String, Integer>();
    initURLFilterList(whiteList, "url-whitelist-filter.txt");

    //System.err.println("Prematurely terminating for testing purposes.");
    //System.exit(-1);
}
159
160 /**
161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,
163 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
164 */
165 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
166 // Maintain Sets of unique domains and urls
167 // TreeSet: by default, "the elements are ordered using their natural ordering"
168 // (or by a Comparator provided at set creation time).
169 // Whereas HashSet doesn't guarantee ordering.
170 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
171
172 Set<String> domainsSet = new TreeSet<String>();
173 Set<String> urlsSet = new TreeSet<String>();
174
175 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
176
177 try (
178 BufferedReader reader = new BufferedReader(new FileReader(this.keepURLsFile));
179 ) {
180
181 // read a URL at a time from urlsFile
182 String url = null;
183 String domain = null;
184 while((url = reader.readLine()) != null) { // readLine removes newline separator
185
186 // work out domain. This retains any www. or subdomain prefix:
187 int startIndex = url.indexOf("//"); // http:// or https:// prefix
188 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
189 domain = url.substring(startIndex);
190 int endIndex = domain.indexOf("/");
191 if(endIndex == -1) endIndex = domain.length();
192 domain = domain.substring(0, endIndex);
193
194 //if(!domainsMap.containsKey(domain)) {
195 urlsSet.add(url);
196 domainsSet.add(domain);
197 //}
198 }
199 } catch (IOException ioe) {
200 ioe.printStackTrace();
201 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
202 }
203
204 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
205 Iterator<String> i = urlsSet.iterator();
206 while(i.hasNext()) {
207 String url = i.next();
208 seedURLsWriter.write(url + "\n");
209 }
210
211 } catch (IOException ioe) {
212 ioe.printStackTrace();
213 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
214 }
215
216 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
217 Iterator<String> i = domainsSet.iterator();
218 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
219 while(i.hasNext()) {
220 String domain = i.next();
221 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
222 urlFilterWriter.write(domain + "\n");
223 }
224
225 } catch (IOException ioe) {
226 ioe.printStackTrace();
227 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
228 }
229 }
230
231 private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
232 Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
233 Iterator<Map.Entry<String, Integer>> i = entries.iterator();
234 while(i.hasNext()) {
235 Map.Entry<String, Integer> entry = i.next();
236 String urlPattern = entry.getKey();
237 Integer matchRule = entry.getValue();
238
239 if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
240 return true;
241 }
242 else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
243 return true;
244 }
245 else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
246 return true;
247 }
248 else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
249 return true;
250 }
251 // else check the rest of the filter list against this url
252 // before returning false to be certain it's not been listed in the filter list
253 }
254
255 return false;
256 }
257
/**
 * Returns true if the url or pattern is found in the blacklist file.
 * Note that if eventually the same url pattern is found in the greylist or whitelist too,
 * it won't get blacklisted after all. But that's not implemented here.
 *
 * @param url the url to test against the blacklist filter entries
 */
public boolean isBlacklisted(String url) {
    return isListedInFilterList(blackList, url);
}

/**
 * Returns true if the url or pattern is explicitly mentioned in the greylist file.
 * Will eventually take precedence over if the same URL pattern was mentioned in the blacklist.
 * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
 *
 * @param url the url to test against the greylist filter entries
 */
public boolean isGreylisted(String url) {
    // TODO: alexa top sites and auto-translated product sites
    return isListedInFilterList(greyList, url);
}

/**
 * Returns true if the url or pattern is explicitly mentioned in the whitelist file.
 * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
 *
 * @param url the url to test against the whitelist filter entries
 */
public boolean isWhitelisted(String url) {
    return isListedInFilterList(whiteList, url);
}
284
285 /**
286 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
287 * whether it is in the discard list.
288 * Filters don't represent actual regex, just ^ and $ as start and end terminators.
289 * By not having this method deal with actual regex for filters, this has the advantage that
290 * we don't have to remember to escape or double escape each filter to turn it into a regex.
291 */
292 public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
293
294 // if filterListFilename does not exist in the conf folder, just return
295 if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
296 System.err.println(filterListFilename + " does not exist");
297 return;
298 }
299
300 try (
301 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
302 ) {
303 String filter = null;
304 while((filter = reader.readLine()) != null) {
305 // skip comments and empty lines
306 filter = filter.trim();
307 if(filter.equals("") || filter.startsWith("#")) {
308 continue;
309 }
310
311 if(filter.startsWith("^") && filter.endsWith("$")) {
312 filter = filter.substring(1, filter.length()-1);
313 list.put(filter, LIST_ENTRY_MATCHES);
314 }
315 else if(filter.startsWith("^")) {
316 filter = filter.substring(1);
317 list.put(filter, LIST_ENTRY_STARTSWITH);
318 System.err.println("Match filter startswith: " + filter);
319 }
320 else if(filter.endsWith("$")) {
321 filter = filter.substring(0, filter.length()-1);
322 list.put(filter, LIST_ENTRY_ENDSWITH);
323 }
324 else {
325 list.put(filter, LIST_ENTRY_CONTAINS);
326 }
327 //System.err.println("Got filter: " + filter);
328 }
329
330 } catch (IOException ioe) {
331 ioe.printStackTrace();
332 System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
333 }
334
335 }
336
/** Maintain a count of all WET files processed. */
public void setWETFileCount(int count) { this.wetFileCount = count; }

/** Maintain a count of all WET records processed. */
// The incremental accessors below were retired: main() tallies the total
// across all WETProcessors itself and sets it once via setRecordCount().
//public int getRecordCount() { return this.totalRecordCount; }
//public void addToRecordCount(int count) { this.totalRecordCount += count; }
public void setRecordCount(int count) { this.totalRecordCount = count; }
344
345 public static void printUsage() {
346 System.err.println("Run this program as:");
347 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
348 }
349
350 /** Filename filter to only list warc.wet files or else warc.wet.gz files
351 * for which unzipped warc.wet equivalents don't yet exist.
352 */
353 private static class WETFilenameFilter implements FilenameFilter {
354
355 public boolean accept(File dir, String name) {
356 if(name.endsWith(".warc.wet")) {
357 logger.debug("Will include " + name + " for processing.");
358 return true;
359 }
360
361 if(name.endsWith(".warc.wet.gz")) {
362 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
363 File unzippedVersion = new File(dir, nameWithoutGZext);
364 if(unzippedVersion.exists()) {
365 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
366 logger.debug("Skipping " + name);
367 return false; // don't count gzipped version if unzipped version exists.
368 }
369 else {
370 logger.debug("Only zipped version " + name + " exists.");
371 return true; // No unzipped version, so have to work with gzipped version
372 }
373 }
374
375 // we're not even interested in any other file extensions
376 logger.debug("Not a WET file. Skipping " + name);
377 return false;
378 }
379 }
380
381
382 public static void main(String[] args) {
383 if(args.length != 2) {
384 printUsage();
385 return;
386 }
387
388
389 File WETFileDir = new File(args[0]);
390 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
391 System.out.println("Error: " + args[0] + " does not exist or is not a directory");
392 return;
393 }
394
395 File outFolder = new File(args[1]);
396 if(!outFolder.exists() || !outFolder.isDirectory()) {
397 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
398 return;
399 }
400
401 try {
402 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
403
404 //ccWETFilesProcessor.processAllWETFiles();
405
406 // Will list all the warc.wet files in the input directory or else their gzipped versions
407 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
408
409 int wetRecordCount = 0;
410 int wetFileCount = 0;
411
412 for(int i = 0; i < WETFiles.length; i++) {
413 File WETFile = WETFiles[i];
414 logger.debug("Processing WETfile: " + WETFile);
415
416 // Any .gz files listed means they haven't been unzipped yet. So unzip.
417 String WETFilename = WETFile.toString();
418 if(WETFilename.endsWith(".gz")) {
419 File GZippedWETFile = WETFile;
420 String WETGZippedFilename = WETFilename;
421 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
422
423 WETFile = new File(WETFilename);
424 Utility.unzipFile(GZippedWETFile, WETFile);
425 }
426 // hereafter all WETFiles should refer to the unzipped version
427 // Check the unzipped WETFile exists
428
429 if(!WETFile.exists() || !WETFile.isFile()) {
430 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
431 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
432 return;
433 }
434
435 // Finally, we can process this WETFile's records into the keep and discard pile
436 wetFileCount++;
437 logger.debug("Off to process " + WETFile);
438 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
439 wetFileProcessor.processWETFile();
440 wetRecordCount += wetFileProcessor.getRecordCount();
441 }
442
443 // for information purposes
444 ccWETFilesProcessor.setWETFileCount(wetFileCount);
445 ccWETFilesProcessor.setRecordCount(wetRecordCount);
446
447 File seedURLsFile = new File(outFolder, "seedURLs.txt");
448 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
449 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
450 } catch(Exception e) {
451 // can get an exception when instantiating CCWETProcessor instance
452 e.printStackTrace();
453 System.err.println(e.getMessage());
454 }
455
456 return;
457
458 }
459}
Note: See TracBrowser for help on using the repository browser.