- Timestamp: 2019-09-23T21:28:06+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33497 r33501 29 29 */ 30 30 public class WETProcessor { 31 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 32 private static Properties configProperties = new Properties(); 33 34 // In Java, can initialize static final variables inside a static block 35 // But the unavoidable try/catch in this static block prevents initialization of 36 // the static final int variables (seen further below) inside the block itself, 37 // that therefore need to be declared and initialized thereafter. 38 static { 39 // load up the properties from the config file 40 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 41 configProperties = new Properties(); 42 configProperties.load(infile); 43 //infile.close(); 44 45 } catch(Exception e) { 46 System.err.println("Exception attempting to read properties from config.properties."); 47 logger.error("Exception attempting to read properties from config.properties."); 48 e.printStackTrace(); 49 } 50 } 51 52 // Providing fall-back cuttoff values if config.properties doesn't load 53 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 
54 /* 55 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 56 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 57 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 58 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 59 */ 60 private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces) 61 private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20")); 62 private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10")); 63 64 // File paths shared across WETProcessor instances 65 private static File discardFolder; 66 private static File keepFolder; 67 private static File keepURLsFile; 68 private static File discardURLsFile; 31 private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 69 32 70 33 // WARC WET header lines and header line prefixes of interest … … 72 35 static final String WARC_INFO_HEADER = "WARC-Type: warcinfo"; 73 36 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:"; 74 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 37 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 38 39 private final String WETFileID; 40 private final File inFile; 41 42 private int recordCount = 0; 43 44 /** Handle to a CCWETProcessor that processes a set of WET files 45 * Whereas a WETProcessor instance 
only processes a single WET file 46 * containing multiple WET records. 47 */ 48 private CCWETProcessor batchProcessor; 75 49 76 // Keep a count of all the records that all WETProcessors instantiated77 // by our main method combined have processed78 //private static int recordCount = 0;79 80 private final File outputFolder;81 private final String WETFileID;82 83 84 50 /** 85 51 * WET processor processes a single warc.wet file containing multiple WET records … … 88 54 * record's content length and number of lines of actual content (excluding WARC headers). 89 55 */ 90 public WETProcessor(File inFile, File outFolder) { 91 this.outputFolder = outFolder; 92 93 StringBuilder record = null; 94 String line = null; 95 boolean readingRecord = false; 96 97 String WARCtargetURI = ""; 98 99 int recordCount = 0; 100 101 int contentLength = -1; // of record 102 int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers) 103 56 public WETProcessor(File inFile, CCWETProcessor batchProcessor) { 57 this.batchProcessor = batchProcessor; 58 59 this.inFile = inFile; 104 60 // We just want a unique recordID prefix, which we get from the wet file name suffix: 105 61 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet … … 112 68 fileID = fileID.substring(0, fileID.indexOf(".")); 113 69 this.WETFileID = fileID; 114 70 } 71 72 public int processWETFile() { 73 File keepURLsFile = this.batchProcessor.keepURLsFile; 74 File discardURLsFile = this.batchProcessor.discardURLsFile; 75 76 StringBuilder record = null; 77 String line = null; 78 boolean readingRecord = false; 79 80 String WARCtargetURI = ""; 81 82 //int recordCount = 0; 83 84 int contentLength = -1; // of record 85 int lineCount = -1; // actual number of non-empty lines in record body (i.e. 
excludes WET/WARC headers) 115 86 116 87 // read from WETfile 117 88 try ( 118 BufferedReader reader = new BufferedReader(new FileReader( inFile));89 BufferedReader reader = new BufferedReader(new FileReader(this.inFile)); 119 90 BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 120 91 BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append … … 185 156 ioe.printStackTrace(); 186 157 } 158 159 return recordCount; 187 160 } 161 162 public int getRecordCount() { return this.recordCount; } 188 163 189 164 /** … … 217 192 // don't want a "translated" product site/online store 218 193 // These curiously often tend to have "product(s)" in the URL 219 parentFolder = WETProcessor.discardFolder;194 parentFolder = batchProcessor.discardFolder; 220 195 } 221 196 222 197 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 223 parentFolder = WETProcessor.keepFolder;198 parentFolder = batchProcessor.keepFolder; 224 199 System.err.println("@@@KEEPING"); 225 200 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { … … 231 206 // So we have at least 500 chars (possibly on a single wrapped line) 232 207 // containing at least 10 spaces. Such a record is also worth keeping. 233 parentFolder = WETProcessor.keepFolder;208 parentFolder = batchProcessor.keepFolder; 234 209 } 235 210 } 236 211 */ 237 212 238 if( isInDiscardFilter(recordURI)) {239 parentFolder = WETProcessor.discardFolder;240 } 241 else if( isInCheckFilter(recordURI)) { //products sites242 parentFolder = WETProcessor.discardFolder; // TODO: checkfolder213 if(batchProcessor.isBlacklisted(recordURI)) { 214 parentFolder = batchProcessor.discardFolder; 215 } 216 else if(batchProcessor.isGreylisted(recordURI)) { // e.g. 
products sites 217 parentFolder = batchProcessor.discardFolder; // TODO: checkfolder 243 218 } else { 244 219 // If a web page's WET record contains a certain minimum number of words, … … 266 241 // In Maori, word length of 1 is not uncommon 267 242 // but let's skip camelcased words when counting valid words 268 else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;243 else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++; 269 244 } 270 245 271 246 // dump if too many camelcase words (ideally keep none of that kind?) 272 if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {273 parentFolder = WETProcessor.discardFolder;247 if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 248 parentFolder = batchProcessor.discardFolder; 274 249 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 275 250 } 276 else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words277 parentFolder = WETProcessor.keepFolder;251 else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 252 parentFolder = batchProcessor.keepFolder; 278 253 System.err.println("@@@KEEPING"); 279 254 } … … 281 256 // if parentFolder still not set, set to discard pile folder 282 257 if(parentFolder == null) { 283 parentFolder = WETProcessor.discardFolder;258 parentFolder = batchProcessor.discardFolder; 284 259 System.err.println("@@@DISCARDING"); 285 260 } 286 261 287 262 try { 288 if (parentFolder == WETProcessor.keepFolder) {263 if (parentFolder == batchProcessor.keepFolder) { 289 264 keepURLsWriter.write(recordURI + "\n"); 290 265 } else { … … 310 285 } 311 286 } 312 313 314 /**315 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.316 * As output produces the URL seed list and regex-urlfilter text files required by nutch,317 * 
https://cwiki.apache.org/confluence/display/nutch/NutchTutorial318 */319 public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {320 // Maintain Sets of unique domains and urls321 // TreeSet: by default, "the elements are ordered using their natural ordering"322 // (or by a Comparator provided at set creation time).323 // Whereas HashSet doesn't guarantee ordering.324 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.325 326 Set<String> domainsSet = new TreeSet<String>();327 Set<String> urlsSet = new TreeSet<String>();328 329 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*330 331 try (332 BufferedReader reader = new BufferedReader(new FileReader(urlsFile));333 ) {334 335 // read a URL at a time from urlsFile336 String url = null;337 String domain = null;338 while((url = reader.readLine()) != null) { // readLine removes newline separator339 340 // work out domain. This retains any www. or subdomain prefix:341 int startIndex = url.indexOf("//"); // http:// or https:// prefix342 startIndex = (startIndex == -1) ? 
0 : (startIndex+2); // skip past the protocol's // portion343 domain = url.substring(startIndex);344 int endIndex = domain.indexOf("/");345 if(endIndex == -1) endIndex = domain.length();346 domain = domain.substring(0, endIndex);347 348 //if(!domainsMap.containsKey(domain)) {349 urlsSet.add(url);350 domainsSet.add(domain);351 //}352 }353 } catch (IOException ioe) {354 ioe.printStackTrace();355 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);356 }357 358 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {359 Iterator<String> i = urlsSet.iterator();360 while(i.hasNext()) {361 String url = i.next();362 seedURLsWriter.write(url + "\n");363 }364 365 } catch (IOException ioe) {366 ioe.printStackTrace();367 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);368 }369 370 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {371 Iterator<String> i = domainsSet.iterator();372 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/373 while(i.hasNext()) {374 String domain = i.next();375 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";376 urlFilterWriter.write(domain + "\n");377 }378 379 } catch (IOException ioe) {380 ioe.printStackTrace();381 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);382 }383 }384 385 /**386 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide387 * whether it is in the discard list.388 * Filters don't represent actual regex, just ^ and $ as start and end terminators.389 * By not having this method deal with actual regex for filters, this has the advantage that390 * we don't have to remember to escape or double escape each filter to turn it into a regex.391 */392 public boolean isInDiscardFilter(String url) {393 394 String discardFilterFile = "url-discard-filter.txt"; // in conf folder395 396 try (397 BufferedReader reader = new BufferedReader(new 
InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));398 ) {399 String filter = null;400 while((filter = reader.readLine()) != null) {401 if(filter.trim().equals("")) {402 continue;403 }404 //System.err.println("Got filter: " + filter);405 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {406 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);407 }408 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {409 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);410 return true;411 }412 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {413 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);414 return true;415 }416 else if(url.contains(filter)) {417 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);418 return true;419 }420 421 }422 423 } catch (IOException ioe) {424 ioe.printStackTrace();425 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);426 }427 428 return false;429 }430 431 // TODO432 public boolean isInCheckFilter(String url) {433 //System.err.println("isInCheckFilter(url) is not yet implemented");434 return false;435 }436 437 //public static int getRecordCount() { return recordCount; }438 439 public static void printUsage() {440 System.err.println("Run this program as:");441 System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");442 }443 444 /** Filename filter to only list warc.wet files or else warc.wet.gz files445 * for which unzipped warc.wet equivalents don't yet exist.446 */447 private static class WETFilenameFilter implements FilenameFilter {448 449 public boolean accept(File dir, String name) {450 if(name.endsWith(".warc.wet")) {451 logger.debug("Will include " + 
name + " for processing.");452 return true;453 }454 455 if(name.endsWith(".warc.wet.gz")) {456 String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));457 File unzippedVersion = new File(dir, nameWithoutGZext);458 if(unzippedVersion.exists()) {459 logger.debug("--- Unzipped version " + unzippedVersion + " exists.");460 logger.debug("Skipping " + name);461 return false; // don't count gzipped version if unzipped version exists.462 }463 else {464 logger.debug("Only zipped version " + name + " exists.");465 return true; // No unzipped version, so have to work with gzipped version466 }467 }468 469 // we're not even interested in any other file extensions470 logger.debug("Not a WET file. Skipping " + name);471 return false;472 }473 }474 475 public static void main(String[] args) {476 if(args.length != 2) {477 printUsage();478 return;479 }480 481 482 File WETFileDir = new File(args[0]);483 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {484 System.out.println("Error: " + args[0] + " does not exist or is not a directory");485 return;486 }487 488 File outFolder = new File(args[1]);489 if(!outFolder.exists() || !outFolder.isDirectory()) {490 System.out.println("Error: " + args[1] + " does not exist or is not a directory.");491 return;492 }493 494 // static folders and files to be shared across all WETProcessor instances495 WETProcessor.discardFolder = new File(outFolder, "discard");496 if(!WETProcessor.discardFolder.exists()) {497 WETProcessor.discardFolder.mkdir();498 }499 WETProcessor.keepFolder = new File(outFolder, "keep");500 if(!WETProcessor.keepFolder.exists()) {501 WETProcessor.keepFolder.mkdir();502 }503 504 WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");505 if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {506 System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". 
Unable to proceed.");507 return;508 }509 WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");510 if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {511 System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");512 return;513 }514 515 // Will list all the warc.wet files in the input directory or else their gzipped versions516 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());517 518 for(int i = 0; i < WETFiles.length; i++) {519 File WETFile = WETFiles[i];520 logger.debug("Processing WETfile: " + WETFile);521 522 // Any .gz files listed means they haven't been unzipped yet. So unzip.523 String WETFilename = WETFile.toString();524 if(WETFilename.endsWith(".gz")) {525 File GZippedWETFile = WETFile;526 String WETGZippedFilename = WETFilename;527 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));528 529 WETFile = new File(WETFilename);530 Utility.unzipFile(GZippedWETFile, WETFile);531 }532 // hereafter all WETFiles should refer to the unzipped version533 // Check the unzipped WETFile exists534 535 if(!WETFile.exists() || !WETFile.isFile()) {536 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");537 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");538 return;539 }540 541 // Finally, we can process this WETFile's records into the keep and discard pile542 logger.debug("Off to process " + WETFile);543 WETProcessor processor = new WETProcessor(WETFile, outFolder);544 545 }546 547 File seedURLsFile = new File(outFolder, "seedURLs.txt");548 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");549 WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);550 551 return;552 }553 287 }
Note: See TracChangeset for help on using the changeset viewer.