Changeset 33552

Timestamp:
04.10.2019 22:00:46
Author:
ak19
Message:

1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. Added a global file containing all the domains we're going to crawl. 3. WET records we're keeping, which are stored in individual files, now have better filenames.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified (no prefix)
Added (+)
Removed (-)
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33519 → r33552

    …
        // File paths shared across WETProcessor instances
    -   public final File WETFilesDir;
    +   public final File commoncrawlDir;
        public final File outputFolder;
        public final File discardFolder;
    …
        public CCWETProcessor(File inFolder, File outFolder) throws Exception {
    -       this.WETFilesDir = inFolder;
    +       this.commoncrawlDir = inFolder;
            this.outputFolder = outFolder;
    …
        /**
    -    * Using the keepURLs.txt file generated by running WETProcessor instances, produces
    +    * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
         * as output the URL seed list and regex-urlfilter text files required by nutch, see
         * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
         */
    -   public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
    +   public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
            // Maintain Sets of unique domains and urls
            // TreeSet: by default, "the elements are ordered using their natural ordering"
    …
            domainsToURLsMap = new TreeMap<String, Set<String>>();

    -       final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
    +       final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

            try (
    …
            }

    +       // We'd have pruned out duplicates by now and have a sorted list of domains,
    +       // each of which maps to seed URLs in the commoncrawl for that domain
    +
            /*
            try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
    …
            }
            */

            int domainCount = 0;
            File sitesFolder = new File(outputFolder, "sites");
    …
            // (urls with tab up front)
            try (
    +            // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
    +            BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
                 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
                 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
    …
                urlFilterWriter.write(regexed_domain + "\n");

    -           // for every domain, we need sites/0000x/ folder containing its own
    -           // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
    +           // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
    +           // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
    …
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {
    -               // only write urls and no domain into single global seedurls file
    -               // But write domain and tabbed urls into individual sites/0000x.txt files
    -               // and write regexed domain into it too
    +
    +               // write all sorted unique domains into global domains file
    +               domainURLsWriter.write(domain + "\n");
    +
    +               // Only write urls and no domain into single global seedurls file
    +               // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
    +               // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
    +               // If we ever run nutch on a single seedURLs listing containing
    +               // all seed pages to crawl sites from, the above two files will work for that.
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");
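A sketch, not part of the changeset: the hunk above maintains a global all-domain-urls.txt, seedURLs.txt and regex-urlfilter.txt while also writing per-domain copies under sites/0000x/. The following minimal, self-contained Java approximation of that write pattern assumes a pre-built domainsToURLsMap; the class name and sample data are hypothetical, while the file names, tab-fronted URL layout and 5-digit site folder numbering are taken from the diff:

    import java.io.*;
    import java.util.*;

    // Sketch: one global domains file and global seed list, plus a per-domain
    // sites/0000x/seedURLs.txt, mirroring the write pattern in the hunk above.
    public class SeedFileLayoutSketch {
        public static void main(String[] args) throws IOException {
            // hypothetical stand-in for the real domainsToURLsMap built from WET records
            TreeMap<String, Set<String>> domainsToURLsMap = new TreeMap<>();
            domainsToURLsMap.put("www.example.nz", new TreeSet<>(
                Arrays.asList("https://www.example.nz/a", "https://www.example.nz/b")));

            File outFolder = new File("to_crawl");
            File sitesFolder = new File(outFolder, "sites");
            sitesFolder.mkdirs(); // also creates outFolder

            int domainCount = 0;
            try (BufferedWriter domainURLsWriter = new BufferedWriter(
                     new FileWriter(new File(outFolder, "all-domain-urls.txt")));
                 BufferedWriter seedURLsWriter = new BufferedWriter(
                     new FileWriter(new File(outFolder, "seedURLs.txt")))) {
                for (Map.Entry<String, Set<String>> entry : domainsToURLsMap.entrySet()) {
                    String domain = entry.getKey();
                    domainCount++;
                    // each domain gets its own zero-padded folder, e.g. sites/00001
                    File domainFolder = new File(sitesFolder, String.format("%05d", domainCount));
                    domainFolder.mkdirs();

                    domainURLsWriter.write(domain + "\n"); // global: sorted unique domains only

                    try (BufferedWriter siteURLsWriter = new BufferedWriter(
                             new FileWriter(new File(domainFolder, "seedURLs.txt")))) {
                        siteURLsWriter.write(domain + "\n"); // per-site file leads with the domain
                        for (String url : entry.getValue()) {
                            seedURLsWriter.write(url + "\n");        // global seed list: urls only
                            siteURLsWriter.write("\t" + url + "\n"); // per-site: tab-fronted urls
                        }
                    }
                }
            }
        }
    }

Keeping the two global writers open across the whole domain loop while opening the per-site writer once per domain mirrors the try-with-resources nesting in the hunk above.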
     
    …
        /**
    -    * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
    -    * whether it is in the discard list.
    +    * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
    +    * whether it is in the mentioned black|grey|white list.
         * Filters don't represent actual regex, just ^ and $ as start and end terminators.
         * By not having this method deal with actual regex for filters, this has the advantage that
    …
        //public void addToRecordCount(int count) { this.totalRecordCount += count; }
        public void setRecordCount(int count) { this.totalRecordCount = count; }
    +
    +   public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
    +
    +       // Will list all the warc.wet files in the input directory or else their gzipped versions
    +       File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
    +
    +       int wetRecordCount = 0;
    +       int wetFileCount = 0;
    +
    +       for(int i = 0; i < WETFiles.length; i++) {
    +           File WETFile = WETFiles[i];
    +           logger.debug("Processing WETfile: " + WETFile);
    +
    +           // Any .gz files listed means they haven't been unzipped yet. So unzip.
    +           String WETFilename = WETFile.toString();
    +           if(WETFilename.endsWith(".gz")) {
    +               File GZippedWETFile = WETFile;
    +               String WETGZippedFilename = WETFilename;
    +               WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
    +
    +               WETFile = new File(WETFilename);
    +               Utility.unzipFile(GZippedWETFile, WETFile);
    +           }
    +           // hereafter all WETFiles should refer to the unzipped version
    +           // Check the unzipped WETFile exists
    +
    +           if(!WETFile.exists() || !WETFile.isFile()) {
    +               System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    +               logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    +               return;
    +           }
    +
    +           // Finally, we can process this WETFile's records into the keep and discard pile
    +           wetFileCount++;
    +           logger.debug("Off to process " + WETFile);
    +           String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
    +           crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
    +           WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
    +           wetFileProcessor.processWETFile();
    +           wetRecordCount += wetFileProcessor.getRecordCount();
    +       }
    +
    +       // for information purposes
    +       this.setWETFileCount(wetFileCount);
    +       this.setRecordCount(wetRecordCount);
    +   }

        public static void printUsage() {
    …
            }
        }
    +
    +
    +   private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
    +
    +       public boolean accept(File dir, String name) {
    +           File f = new File(dir, name);
    +           if(f.isDirectory()) {
    +               if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
    +                   return true;
    +               }
    +           }
    +           else {
    +               System.err.println("File " + f + " is not a directory");
    +           }
    +           return false;
    +       }
    +   }

        public static void main(String[] args) {
    …
            }

    -       File WETFileDir = new File(args[0]);
    -       if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
    +       File commoncrawlDir = new File(args[0]);
    +       if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
                System.out.println("Error: " + args[0] + " does not exist or is not a directory");
                return;
    …

            try {
    -           CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
    -
    -           //ccWETFilesProcessor.processAllWETFiles();
    -
    -           // Will list all the warc.wet files in the input directory or else their gzipped versions
    -           File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
    -
    -           int wetRecordCount = 0;
    -           int wetFileCount = 0;
    -
    -           for(int i = 0; i < WETFiles.length; i++) {
    -               File WETFile = WETFiles[i];
    -               logger.debug("Processing WETfile: " + WETFile);
    -
    -               // Any .gz files listed means they haven't been unzipped yet. So unzip.
    -               String WETFilename = WETFile.toString();
    -               if(WETFilename.endsWith(".gz")) {
    -                   File GZippedWETFile = WETFile;
    -                   String WETGZippedFilename = WETFilename;
    -                   WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
    -
    -                   WETFile = new File(WETFilename);
    -                   Utility.unzipFile(GZippedWETFile, WETFile);
    -               }
    -               // hereafter all WETFiles should refer to the unzipped version
    -               // Check the unzipped WETFile exists
    -
    -               if(!WETFile.exists() || !WETFile.isFile()) {
    -                   System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    -                   logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    -                   return;
    -               }
    -
    -               // Finally, we can process this WETFile's records into the keep and discard pile
    -               wetFileCount++;
    -               logger.debug("Off to process " + WETFile);
    -               WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
    -               wetFileProcessor.processWETFile();
    -               wetRecordCount += wetFileProcessor.getRecordCount();
    -           }
    -
    -           // for information purposes
    -           ccWETFilesProcessor.setWETFileCount(wetFileCount);
    -           ccWETFilesProcessor.setRecordCount(wetRecordCount);
    -
    +           CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
    +
    +           File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
    +
    +           for(int i = 0; i < ccrawlFolders.length; i++) {
    +               File ccrawlFolder = ccrawlFolders[i];
    +               System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
    +               ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
    +           }
    +
    +           // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
    +           // The former is the only unique one. seedURLs and regex-urlfilters are
    +           // repeated on a per site/domain basis too, stored in the sites folder
                File seedURLsFile = new File(outFolder, "seedURLs.txt");
                File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    -           ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
    +           File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
    +           ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);

                System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
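For reference, the new top-level flow reduces to: accept only subfolders of the ccrawldata directory named like CC-MAIN-YYYY-##-wet-files, hand each to processAllWETFilesOfCrawl(), and derive the short crawlID by stripping the CC-MAIN- prefix and -wet-files suffix. A standalone sketch of just that filtering and naming logic follows; the folder pattern and substring arithmetic come from the diff, while the class name and the lambda form of the filter are illustrative:

    import java.io.File;
    import java.io.FilenameFilter;

    // Sketch: pick out CC-MAIN-YYYY-##-wet-files folders and derive "YYYY-##"
    public class CrawlFolderSketch {
        static String crawlIDOf(File ccrawlWETFileDir) {
            String crawlID = ccrawlWETFileDir.getName();          // e.g. CC-MAIN-2019-30-wet-files
            return crawlID.substring("CC-MAIN-".length(),
                                     crawlID.indexOf("-wet-files")); // -> "2019-30"
        }

        public static void main(String[] args) {
            FilenameFilter ccrawlFolderFilter = (dir, name) ->
                new File(dir, name).isDirectory()
                && name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files");

            File commoncrawlDir = new File(args.length > 0 ? args[0] : "ccrawldata");
            File[] ccrawlFolders = commoncrawlDir.listFiles(ccrawlFolderFilter);
            if (ccrawlFolders == null) {
                System.err.println(commoncrawlDir + " is not a readable directory");
                return;
            }
            for (File ccrawlFolder : ccrawlFolders) {
                System.out.println(ccrawlFolder.getName() + " -> crawlID " + crawlIDOf(ccrawlFolder));
            }
        }
    }

Run against a ccrawldata folder holding CC-MAIN-2019-30-wet-files, this prints "CC-MAIN-2019-30-wet-files -> crawlID 2019-30".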
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33517 → r33552

        import java.io.*;
        import java.util.Properties;
    -   import java.util.zip.GZIPInputStream;
        import java.util.Iterator;
        import java.util.Set;
    …
        static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
        static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    -   private final String WETFileID;
    +   private final String crawlID;
    +   private final int WETFileID;
        private final File inFile;
    …
         * out to a uniquely named file in either the keep or discard folder depending on the WET
         * record's content length and number of lines of actual content (excluding WARC headers).
    +    * @param inFile the warc.wet file whose WET records are to be processed
    +    * @param crawlID is the ID of the commoncrawl containing this warc.wet file
    +    * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
    +    * as prefix to create unique filenames when storing each individual record).
         */
    -   public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
    +   public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
            this.batchProcessor = batchProcessor;

            this.inFile = inFile;
    -       // We just want a unique recordID prefix, which we get from the wet file name suffix:
    +       this.crawlID = crawlID;
    +
    +       // We just want a unique recordID prefix, which we get from concatenating
    +       // the commoncrawl ID with the wet file name suffix and record count within the file:
            // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
            // the prefix will be everything after the last hyphen and without file extension,
    -       // so "000000" in our example. Then suffix the recordCount (keeping track of the current
    -       // WET record) to get a unique filename to store each WET record into.
    +       // so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
    +       // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
    +       // of the current WET record to get a unique filename to store each WET record into.
    +       // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
    +       // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)

            String fileID = inFile.getName();
    -       fileID = fileID.substring(fileID.lastIndexOf("-")+1);
    -       fileID = fileID.substring(0, fileID.indexOf("."));
    -       this.WETFileID = fileID;
    +       //System.err.println("*** Processing wetfile: " + fileID);
    +       fileID = fileID.substring(fileID.lastIndexOf("0")+1);
    +       if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
    +           this.WETFileID = 0;
    +       } else {
    +           fileID = fileID.substring(0, fileID.indexOf("."));
    +           this.WETFileID = Integer.parseInt(fileID);
    +       }
        }

    +   /**
    +    * Processes all the WET records of a single warc.wet file
    +    */
        public int processWETFile() {
            File keepURLsFile = this.batchProcessor.keepURLsFile;
    …
                      String recordURI, String record)
        {
    -       System.err.println("WET #" + this.WETFileID + " record #" + recordID
    +       System.err.println("CrawlID: CC-MAIN-" + this.crawlID
    +                  + " WET #" + this.WETFileID
    +                  + " record #" + recordID
                       + " - contentLength: " + contentLength
                       + " - lineCount: " + lineCount);
    …
            //System.err.println("--------------------------");

    -       String paddedFileName = String.format("%04d.txt", recordID);
    -
            File parentFolder = null;

            if(batchProcessor.isBlacklisted(recordURI)) {
    …

            System.err.println("--------------------------");

    -       File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
    +       // outFilename will look something like YYYY-##-####
    +       String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
    +           //= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
    +       File outFile = new File(parentFolder, outFilename);

            try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
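The "better filenames" from the commit message combine three parts: the short crawlID (YYYY-##), a 2-digit WET file number parsed from the warc.wet name's numeric suffix, and a 4-digit record count, giving names like 2019-30-00-0015. A standalone sketch of that naming arithmetic, reusing the example filename from the diff's comments (the class and method names are hypothetical):

    // Sketch: build the unique per-record filename crawlID-WETFileID-recordID
    public class RecordNameSketch {
        static int wetFileIDOf(String wetFileName) {
            // e.g. MAORI-CC-2019-30-20190902100139-000000.warc.wet
            String fileID = wetFileName.substring(wetFileName.lastIndexOf("0") + 1);
            if (fileID.startsWith(".")) {
                return 0; // took off too many zeroes, as with *000000.warc.wet
            }
            return Integer.parseInt(fileID.substring(0, fileID.indexOf(".")));
        }

        public static void main(String[] args) {
            String crawlID = "2019-30"; // short form of CC-MAIN-2019-30
            String wetFileName = "MAORI-CC-2019-30-20190902100139-000000.warc.wet";
            int recordID = 15;          // 15th WET record within the file
            String outFilename = String.format("%s-%02d-%04d",
                                               crawlID, wetFileIDOf(wetFileName), recordID);
            System.out.println(outFilename); // prints 2019-30-00-0015
        }
    }

One caveat worth noting: because the parse strips everything up to the last literal "0" in the name, a suffix such as -000010 also yields WETFileID 0, so the scheme appears to assume WET file numbers don't end in a zero.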