Changeset 33552
- Timestamp: 2019-10-04T22:00:46+13:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33519 r33552 69 69 70 70 // File paths shared across WETProcessor instances 71 public final File WETFilesDir;71 public final File commoncrawlDir; 72 72 public final File outputFolder; 73 73 public final File discardFolder; … … 109 109 110 110 public CCWETProcessor(File inFolder, File outFolder) throws Exception { 111 this. WETFilesDir = inFolder;111 this.commoncrawlDir = inFolder; 112 112 this.outputFolder = outFolder; 113 113 … … 178 178 179 179 /** 180 * Using the keepURLs.txt file generated by running WETProcessor instances, produces180 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces 181 181 * as output the URL seed list and regex-urlfilter text files required by nutch, see 182 182 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 183 183 */ 184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile ) {184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) { 185 185 // Maintain Sets of unique domains and urls 186 186 // TreeSet: by default, "the elements are ordered using their natural ordering" … … 193 193 domainsToURLsMap = new TreeMap<String, Set<String>>(); 194 194 195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* 195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 196 196 197 197 try ( … … 230 230 } 231 231 232 // We'd have pruned out duplicates by now and have a sorted list of domains, 233 // each of which maps to seed URLs in the commoncrawl for that domain 234 232 235 /* 233 236 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { … … 243 246 } 244 247 */ 245 248 246 249 int domainCount = 0; 247 250 File sitesFolder = new File(outputFolder, "sites"); … … 254 257 // (urls with tab up front) 255 258 try ( 259 // global lists of all domains, seedURLs and regex-urlfilters across all wet 
files of all commoncrawls 260 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile)); 256 261 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 257 262 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)) … … 274 279 urlFilterWriter.write(regexed_domain + "\n"); 275 280 276 // for every domain, we need sites/0000x/ folder containing its own277 // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt281 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing 282 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt 278 283 // We still have a global seedURLs.txt and regex-urlfilter.txt too. 279 284 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt … … 283 288 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile)); 284 289 ) { 285 // only write urls and no domain into single global seedurls file 286 // But write domain and tabbed urls into individual sites/0000x.txt files 287 // and write regexed domain into it too 290 291 // write all sorted unique domains into global domains file 292 domainURLsWriter.write(domain + "\n"); 293 294 // Only write urls and no domain into single global seedurls file 295 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt 296 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 297 // If we ever run nutch on a single seedURLs listing containing 298 // all seed pages to crawl sites from, the above two files will work for that. 
288 299 siteURLsWriter.write(domain + "\n"); 289 300 siteRegexWriter.write(regexed_domain + "\n"); … … 381 392 382 393 /** 383 * Checks URL parameter against each line ("filter") of conf/url- discard-filter.txt to decide384 * whether it is in the discardlist.394 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide 395 * whether it is in the mentioned black|grey|white list. 385 396 * Filters don't represent actual regex, just ^ and $ as start and end terminators. 386 397 * By not having this method deal with actual regex for filters, this has the advantage that … … 439 450 //public void addToRecordCount(int count) { this.totalRecordCount += count; } 440 451 public void setRecordCount(int count) { this.totalRecordCount = count; } 452 453 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) { 454 455 // Will list all the warc.wet files in the input directory or else their gzipped versions 456 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter()); 457 458 int wetRecordCount = 0; 459 int wetFileCount = 0; 460 461 for(int i = 0; i < WETFiles.length; i++) { 462 File WETFile = WETFiles[i]; 463 logger.debug("Processing WETfile: " + WETFile); 464 465 // Any .gz files listed means they haven't been unzipped yet. So unzip. 
466 String WETFilename = WETFile.toString(); 467 if(WETFilename.endsWith(".gz")) { 468 File GZippedWETFile = WETFile; 469 String WETGZippedFilename = WETFilename; 470 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz")); 471 472 WETFile = new File(WETFilename); 473 Utility.unzipFile(GZippedWETFile, WETFile); 474 } 475 // hereafter all WETFiles should refer to the unzipped version 476 // Check the unzipped WETFile exists 477 478 if(!WETFile.exists() || !WETFile.isFile()) { 479 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 480 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 481 return; 482 } 483 484 // Finally, we can process this WETFile's records into the keep and discard pile 485 wetFileCount++; 486 logger.debug("Off to process " + WETFile); 487 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files 488 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-## 489 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this); 490 wetFileProcessor.processWETFile(); 491 wetRecordCount += wetFileProcessor.getRecordCount(); 492 } 493 494 // for information purposes 495 this.setWETFileCount(wetFileCount); 496 this.setRecordCount(wetRecordCount); 497 } 441 498 442 499 public static void printUsage() { … … 475 532 } 476 533 } 477 534 535 536 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter { 537 538 public boolean accept(File dir, String name) { 539 File f = new File (dir, name); 540 if(f.isDirectory()) { 541 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) { 542 return true; 543 } 544 } 545 else { 546 System.err.println("File " + f + " is not a directory"); 547 } 548 return false; 549 } 550 } 478 551 479 552 public static void main(String[] args) { … … 483 556 } 484 557 485 486 File WETFileDir = new File(args[0]); 487 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 558 File 
commoncrawlDir = new File(args[0]); 559 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) { 488 560 System.out.println("Error: " + args[0] + " does not exist or is not a directory"); 489 561 return; … … 497 569 498 570 try { 499 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder); 500 501 //ccWETFilesProcessor.processAllWETFiles(); 502 503 // Will list all the warc.wet files in the input directory or else their gzipped versions 504 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 505 506 int wetRecordCount = 0; 507 int wetFileCount = 0; 508 509 for(int i = 0; i < WETFiles.length; i++) { 510 File WETFile = WETFiles[i]; 511 logger.debug("Processing WETfile: " + WETFile); 512 513 // Any .gz files listed means they haven't been unzipped yet. So unzip. 514 String WETFilename = WETFile.toString(); 515 if(WETFilename.endsWith(".gz")) { 516 File GZippedWETFile = WETFile; 517 String WETGZippedFilename = WETFilename; 518 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz")); 519 520 WETFile = new File(WETFilename); 521 Utility.unzipFile(GZippedWETFile, WETFile); 522 } 523 // hereafter all WETFiles should refer to the unzipped version 524 // Check the unzipped WETFile exists 525 526 if(!WETFile.exists() || !WETFile.isFile()) { 527 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 528 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 529 return; 530 } 531 532 // Finally, we can process this WETFile's records into the keep and discard pile 533 wetFileCount++; 534 logger.debug("Off to process " + WETFile); 535 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor); 536 wetFileProcessor.processWETFile(); 537 wetRecordCount += wetFileProcessor.getRecordCount(); 538 } 539 540 // for information purposes 541 ccWETFilesProcessor.setWETFileCount(wetFileCount); 542 ccWETFilesProcessor.setRecordCount(wetRecordCount); 543 571 
CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder); 572 573 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter()); 574 575 for(int i = 0; i < ccrawlFolders.length; i++) { 576 File ccrawlFolder = ccrawlFolders[i]; 577 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder); 578 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder); 579 } 580 581 // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls 582 // The former is the only unique one. seedURLs and regex-urlfilters are 583 // repeated on a per site/domain basis too, stored in the sites folder 544 584 File seedURLsFile = new File(outFolder, "seedURLs.txt"); 545 585 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 546 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 586 File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 587 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile); 547 588 548 589 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33517 r33552 4 4 import java.io.*; 5 5 import java.util.Properties; 6 import java.util.zip.GZIPInputStream;7 6 import java.util.Iterator; 8 7 import java.util.Set; … … 38 37 static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:"; 39 38 static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 40 41 private final String WETFileID; 39 40 private final String crawlID; 41 private final int WETFileID; 42 42 private final File inFile; 43 43 … … 55 55 * out to a uniquely named file in either the keep or discard folder depending on the WET 56 56 * record's content length and number of lines of actual content (excluding WARC headers). 57 * @param inFile the warc.wet file whose WET records are to be processed 58 * @param crawlID is the ID of the commoncrawl containing this warc.wet file 59 * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used 60 * as prefix to create unique filenames when storing each individual record). 57 61 */ 58 public WETProcessor(File inFile, CCWETProcessor batchProcessor) {62 public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) { 59 63 this.batchProcessor = batchProcessor; 60 64 61 65 this.inFile = inFile; 62 // We just want a unique recordID prefix, which we get from the wet file name suffix: 66 this.crawlID = crawlID; 67 68 // We just want a unique recordID prefix, which we get from concatenating 69 // the commoncrawl ID with the wet file name suffix and record count within the file: 63 70 // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet 64 71 // the prefix will be everything after the last hyphen and without file extension, 65 // so "000000" in our example. Then suffix the recordCount (keeping track of the current 66 // WET record) to get a unique filename to store each WET record into. 72 // so "000000" in our example. Then converted into a number and padded to 2, e.g. 00. 
73 // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track 74 // of the current WET record to get a unique filename to store each WET record into. 75 // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the 76 // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz) 67 77 68 78 String fileID = inFile.getName(); 69 fileID = fileID.substring(fileID.lastIndexOf("-")+1); 70 fileID = fileID.substring(0, fileID.indexOf(".")); 71 this.WETFileID = fileID; 79 //System.err.println("*** Processing wetfile: " + fileID); 80 fileID = fileID.substring(fileID.lastIndexOf("0")+1); 81 if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet 82 this.WETFileID = 0; 83 } else { 84 fileID = fileID.substring(0, fileID.indexOf(".")); 85 this.WETFileID = Integer.parseInt(fileID); 86 } 72 87 } 73 88 89 /** 90 * Processes all the WET records of a single warc.wet file 91 */ 74 92 public int processWETFile() { 75 93 File keepURLsFile = this.batchProcessor.keepURLsFile; … … 177 195 String recordURI, String record) 178 196 { 179 System.err.println("WET #" + this.WETFileID + " record #" + recordID 197 System.err.println("CrawlID: CC-MAIN-" + this.crawlID 198 + " WET #" + this.WETFileID 199 + " record #" + recordID 180 200 + " - contentLength: " + contentLength 181 201 + " - lineCount: " + lineCount); … … 184 204 //System.err.println("--------------------------"); 185 205 186 String paddedFileName = String.format("%04d.txt", recordID);187 188 206 File parentFolder = null; 189 190 207 191 208 if(batchProcessor.isBlacklisted(recordURI)) { … … 289 306 290 307 System.err.println("--------------------------"); 291 292 File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName); 308 309 // outFilename will look something like YYYY-##-#### 310 String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID); 311 //= this.crawlID + "-" + 
String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID); 312 File outFile = new File(parentFolder, outFilename); 293 313 294 314 try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
Note: See TracChangeset for help on using the changeset viewer.