- Timestamp:
- 2019-10-04T22:00:46+13:00 (4 years ago)
- File:
-
- 1 file edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33519 r33552 69 69 70 70 // File paths shared across WETProcessor instances 71 public final File WETFilesDir;71 public final File commoncrawlDir; 72 72 public final File outputFolder; 73 73 public final File discardFolder; … … 109 109 110 110 public CCWETProcessor(File inFolder, File outFolder) throws Exception { 111 this. WETFilesDir = inFolder;111 this.commoncrawlDir = inFolder; 112 112 this.outputFolder = outFolder; 113 113 … … 178 178 179 179 /** 180 * Using the keepURLs.txt file generated by running WETProcessor instances, produces180 * Using the keepURLs.txt file generated by running WETProcessor instances, this produces 181 181 * as output the URL seed list and regex-urlfilter text files required by nutch, see 182 182 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 183 183 */ 184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile ) {184 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) { 185 185 // Maintain Sets of unique domains and urls 186 186 // TreeSet: by default, "the elements are ordered using their natural ordering" … … 193 193 domainsToURLsMap = new TreeMap<String, Set<String>>(); 194 194 195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* 195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 196 196 197 197 try ( … … 230 230 } 231 231 232 // We'd have pruned out duplicates by now and have a sorted list of domains, 233 // each of which maps to seed URLs in the commoncrawl for that domain 234 232 235 /* 233 236 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { … … 243 246 } 244 247 */ 245 248 246 249 int domainCount = 0; 247 250 File sitesFolder = new File(outputFolder, "sites"); … … 254 257 // (urls with tab up front) 255 258 try ( 259 // global lists of all domains, seedURLs and regex-urlfilters across all wet 
files of all commoncrawls 260 BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile)); 256 261 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile)); 257 262 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile)) … … 274 279 urlFilterWriter.write(regexed_domain + "\n"); 275 280 276 // for every domain, we need sites/0000x/ folder containing its own277 // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt281 // for every domain, we need a sites/0000x/ folder, where x is domain#, containing 282 // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt 278 283 // We still have a global seedURLs.txt and regex-urlfilter.txt too. 279 284 File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt … … 283 288 BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile)); 284 289 ) { 285 // only write urls and no domain into single global seedurls file 286 // But write domain and tabbed urls into individual sites/0000x.txt files 287 // and write regexed domain into it too 290 291 // write all sorted unique domains into global domains file 292 domainURLsWriter.write(domain + "\n"); 293 294 // Only write urls and no domain into single global seedurls file 295 // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt 296 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 297 // If we ever run nutch on a single seedURLs listing containing 298 // all seed pages to crawl sites from, the above two files will work for that. 
288 299 siteURLsWriter.write(domain + "\n"); 289 300 siteRegexWriter.write(regexed_domain + "\n"); … … 381 392 382 393 /** 383 * Checks URL parameter against each line ("filter") of conf/url- discard-filter.txt to decide384 * whether it is in the discardlist.394 * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide 395 * whether it is in the mentioned black|grey|white list. 385 396 * Filters don't represent actual regex, just ^ and $ as start and end terminators. 386 397 * By not having this method deal with actual regex for filters, this has the advantage that … … 439 450 //public void addToRecordCount(int count) { this.totalRecordCount += count; } 440 451 public void setRecordCount(int count) { this.totalRecordCount = count; } 452 453 public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) { 454 455 // Will list all the warc.wet files in the input directory or else their gzipped versions 456 File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter()); 457 458 int wetRecordCount = 0; 459 int wetFileCount = 0; 460 461 for(int i = 0; i < WETFiles.length; i++) { 462 File WETFile = WETFiles[i]; 463 logger.debug("Processing WETfile: " + WETFile); 464 465 // Any .gz files listed means they haven't been unzipped yet. So unzip. 
466 String WETFilename = WETFile.toString(); 467 if(WETFilename.endsWith(".gz")) { 468 File GZippedWETFile = WETFile; 469 String WETGZippedFilename = WETFilename; 470 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz")); 471 472 WETFile = new File(WETFilename); 473 Utility.unzipFile(GZippedWETFile, WETFile); 474 } 475 // hereafter all WETFiles should refer to the unzipped version 476 // Check the unzipped WETFile exists 477 478 if(!WETFile.exists() || !WETFile.isFile()) { 479 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 480 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 481 return; 482 } 483 484 // Finally, we can process this WETFile's records into the keep and discard pile 485 wetFileCount++; 486 logger.debug("Off to process " + WETFile); 487 String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files 488 crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-## 489 WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this); 490 wetFileProcessor.processWETFile(); 491 wetRecordCount += wetFileProcessor.getRecordCount(); 492 } 493 494 // for information purposes 495 this.setWETFileCount(wetFileCount); 496 this.setRecordCount(wetRecordCount); 497 } 441 498 442 499 public static void printUsage() { … … 475 532 } 476 533 } 477 534 535 536 private static class CCrawlWETFolderFilenameFilter implements FilenameFilter { 537 538 public boolean accept(File dir, String name) { 539 File f = new File (dir, name); 540 if(f.isDirectory()) { 541 if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) { 542 return true; 543 } 544 } 545 else { 546 System.err.println("File " + f + " is not a directory"); 547 } 548 return false; 549 } 550 } 478 551 479 552 public static void main(String[] args) { … … 483 556 } 484 557 485 486 File WETFileDir = new File(args[0]); 487 if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 558 File 
commoncrawlDir = new File(args[0]); 559 if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) { 488 560 System.out.println("Error: " + args[0] + " does not exist or is not a directory"); 489 561 return; … … 497 569 498 570 try { 499 CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder); 500 501 //ccWETFilesProcessor.processAllWETFiles(); 502 503 // Will list all the warc.wet files in the input directory or else their gzipped versions 504 File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 505 506 int wetRecordCount = 0; 507 int wetFileCount = 0; 508 509 for(int i = 0; i < WETFiles.length; i++) { 510 File WETFile = WETFiles[i]; 511 logger.debug("Processing WETfile: " + WETFile); 512 513 // Any .gz files listed means they haven't been unzipped yet. So unzip. 514 String WETFilename = WETFile.toString(); 515 if(WETFilename.endsWith(".gz")) { 516 File GZippedWETFile = WETFile; 517 String WETGZippedFilename = WETFilename; 518 WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz")); 519 520 WETFile = new File(WETFilename); 521 Utility.unzipFile(GZippedWETFile, WETFile); 522 } 523 // hereafter all WETFiles should refer to the unzipped version 524 // Check the unzipped WETFile exists 525 526 if(!WETFile.exists() || !WETFile.isFile()) { 527 System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 528 logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 529 return; 530 } 531 532 // Finally, we can process this WETFile's records into the keep and discard pile 533 wetFileCount++; 534 logger.debug("Off to process " + WETFile); 535 WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor); 536 wetFileProcessor.processWETFile(); 537 wetRecordCount += wetFileProcessor.getRecordCount(); 538 } 539 540 // for information purposes 541 ccWETFilesProcessor.setWETFileCount(wetFileCount); 542 ccWETFilesProcessor.setRecordCount(wetRecordCount); 543 571 
CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder); 572 573 File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter()); 574 575 for(int i = 0; i < ccrawlFolders.length; i++) { 576 File ccrawlFolder = ccrawlFolders[i]; 577 System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder); 578 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder); 579 } 580 581 // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls 582 // The former is the only unique one. seedURLs and regex-urlfilters are 583 // repeated on a per site/domain basis too, stored in the sites folder 544 584 File seedURLsFile = new File(outFolder, "seedURLs.txt"); 545 585 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 546 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 586 File domainURLsFile = new File(outFolder, "all-domain-urls.txt"); 587 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile); 547 588 548 589 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
Note:
See TracChangeset
for help on using the changeset viewer.