Changeset 33552


Timestamp:
2019-10-04T22:00:46+13:00
Author:
ak19
Message:
  1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files.
  2. There is now a single global file listing all the domains we're going to crawl.
  3. The WET records we're keeping, which are stored in individual files, now have better filenames.
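A sketch of the input layout this implies, pieced together from the message above and the CCrawlWETFolderFilenameFilter added in the diff below (the crawl IDs shown are hypothetical examples):

    ccrawldata/
        CC-MAIN-2019-26-wet-files/
            *.warc.wet or *.warc.wet.gz
        CC-MAIN-2019-30-wet-files/
            *.warc.wet or *.warc.wet.gz
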
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33519 → r33552

@@ -69 +69 @@
 
     // File paths shared across WETProcessor instances
-    public final File WETFilesDir;
+    public final File commoncrawlDir;
     public final File outputFolder;
     public final File discardFolder;
     
@@ -109 +109 @@
 
     public CCWETProcessor(File inFolder, File outFolder) throws Exception {
-        this.WETFilesDir = inFolder;
+        this.commoncrawlDir = inFolder;
         this.outputFolder = outFolder;
 
     
@@ -178 +178 @@
 
     /**
-     * Using the keepURLs.txt file generated by running WETProcessor instances, produces
+     * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
      * as output the URL seed list and regex-urlfilter text files required by nutch, see
      * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
      */
-    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
+    public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
         // Maintain Sets of unique domains and urls
         // TreeSet: by default, "the elements are ordered using their natural ordering"
     
@@ -193 +193 @@
     domainsToURLsMap = new TreeMap<String, Set<String>>();
 
-    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
+    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt
 
     try (
     
@@ -230 +230 @@
     }
 
+    // We'd have pruned out duplicates by now and have a sorted list of domains,
+    // each of which maps to seed URLs in the commoncrawl for that domain
+
     /*
     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
     
@@ -243 +246 @@
     }
     */
-
+
     int domainCount = 0;
     File sitesFolder = new File(outputFolder, "sites");
     
@@ -254 +257 @@
     // (urls with tab up front)
     try (
+        // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+        BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
         BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
         BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
     
@@ -274 +279 @@
         urlFilterWriter.write(regexed_domain + "\n");
 
-        // for every domain, we need sites/0000x/ folder containing its own
-        // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
+        // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
+        // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
         // We still have a global seedURLs.txt and regex-urlfilter.txt too.
         File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
     
@@ -283 +288 @@
          BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
          ) {
-        // only write urls and no domain into single global seedurls file
-        // But write domain and tabbed urls into individual sites/0000x.txt files
-        // and write regexed domain into it too
+
+        // write all sorted unique domains into global domains file
+        domainURLsWriter.write(domain + "\n");
+
+        // Only write urls and no domain into single global seedurls file
+        // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
+        // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
+        // If we ever run nutch on a single seedURLs listing containing
+        // all seed pages to crawl sites from, the above two files will work for that.
         siteURLsWriter.write(domain + "\n");
         siteRegexWriter.write(regexed_domain + "\n");
     
@@ -381 +392 @@
 
     /**
-     * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
-     * whether it is in the discard list.
+     * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
+     * whether it is in the mentioned black|grey|white list.
      * Filters don't represent actual regex, just ^ and $ as start and end terminators.
      * By not having this method deal with actual regex for filters, this has the advantage that
     
@@ -439 +450 @@
     //public void addToRecordCount(int count) { this.totalRecordCount += count; }
     public void setRecordCount(int count) { this.totalRecordCount = count; }
+
+    public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
+
+        // Will list all the warc.wet files in the input directory or else their gzipped versions
+        File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
+
+        int wetRecordCount = 0;
+        int wetFileCount = 0;
+
+        for(int i = 0; i < WETFiles.length; i++) {
+            File WETFile = WETFiles[i];
+            logger.debug("Processing WETfile: " + WETFile);
+
+            // Any .gz files listed means they haven't been unzipped yet. So unzip.
+            String WETFilename = WETFile.toString();
+            if(WETFilename.endsWith(".gz")) {
+                File GZippedWETFile = WETFile;
+                String WETGZippedFilename = WETFilename;
+                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
+
+                WETFile = new File(WETFilename);
+                Utility.unzipFile(GZippedWETFile, WETFile);
+            }
+            // hereafter all WETFiles should refer to the unzipped version
+            // Check the unzipped WETFile exists
+
+            if(!WETFile.exists() || !WETFile.isFile()) {
+                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
+                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
+                return;
+            }
+
+            // Finally, we can process this WETFile's records into the keep and discard pile
+            wetFileCount++;
+            logger.debug("Off to process " + WETFile);
+            String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
+            crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
+            WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
+            wetFileProcessor.processWETFile();
+            wetRecordCount += wetFileProcessor.getRecordCount();
+        }
+
+        // for information purposes
+        this.setWETFileCount(wetFileCount);
+        this.setRecordCount(wetRecordCount);
+    }
 
     public static void printUsage() {
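Utility.unzipFile() in the hunk above is an existing project helper, not shown in this changeset. A minimal stand-in, assuming it simply gunzips src into dest, might look like:

    import java.io.*;
    import java.util.zip.GZIPInputStream;

    class GunzipSketch { // hypothetical stand-in for the project's Utility class
        // gunzip src (e.g. x.warc.wet.gz) into dest (x.warc.wet)
        static void unzipFile(File src, File dest) throws IOException {
            try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(src));
                 OutputStream out = new FileOutputStream(dest)) {
                byte[] buf = new byte[8192];
                int n;
                while ((n = in.read(buf)) != -1) {
                    out.write(buf, 0, n);
                }
            }
        }
    }
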
     
@@ -475 +532 @@
         }
     }
-
+
+
+    private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
+
+        public boolean accept(File dir, String name) {
+            File f = new File(dir, name);
+            if(f.isDirectory()) {
+                if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
+                    return true;
+                }
+            }
+            else {
+                System.err.println("File " + f + " is not a directory");
+            }
+            return false;
+        }
+    }
 
     public static void main(String[] args) {
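As a quick illustrative check (crawl IDs hypothetical), the pattern above accepts only complete folder names of the CC-MAIN-YYYY-##-wet-files form:

    boolean ok  = "CC-MAIN-2019-30-wet-files".matches("CC-MAIN-\\d{4}-\\d{2}-wet-files"); // true
    boolean bad = "CC-MAIN-2019-30".matches("CC-MAIN-\\d{4}-\\d{2}-wet-files");           // false: no -wet-files suffix, folder skipped
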
     
@@ -483 +556 @@
     }
 
-
-    File WETFileDir = new File(args[0]);
-    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
+    File commoncrawlDir = new File(args[0]);
+    if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
         System.out.println("Error: " + args[0] + " does not exist or is not a directory");
         return;
     
@@ -497 +569 @@
 
     try {
-        CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
-
-        //ccWETFilesProcessor.processAllWETFiles();
-
-        // Will list all the warc.wet files in the input directory or else their gzipped versions
-        File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
-
-        int wetRecordCount = 0;
-        int wetFileCount = 0;
-
-        for(int i = 0; i < WETFiles.length; i++) {
-            File WETFile = WETFiles[i];
-            logger.debug("Processing WETfile: " + WETFile);
-
-            // Any .gz files listed means they haven't been unzipped yet. So unzip.
-            String WETFilename = WETFile.toString();
-            if(WETFilename.endsWith(".gz")) {
-                File GZippedWETFile = WETFile;
-                String WETGZippedFilename = WETFilename;
-                WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
-
-                WETFile = new File(WETFilename);
-                Utility.unzipFile(GZippedWETFile, WETFile);
-            }
-            // hereafter all WETFiles should refer to the unzipped version
-            // Check the unzipped WETFile exists
-
-            if(!WETFile.exists() || !WETFile.isFile()) {
-                System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
-                logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
-                return;
-            }
-
-            // Finally, we can process this WETFile's records into the keep and discard pile
-            wetFileCount++;
-            logger.debug("Off to process " + WETFile);
-            WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
-            wetFileProcessor.processWETFile();
-            wetRecordCount += wetFileProcessor.getRecordCount();
-        }
-
-        // for information purposes
-        ccWETFilesProcessor.setWETFileCount(wetFileCount);
-        ccWETFilesProcessor.setRecordCount(wetRecordCount);
-
+        CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
+
+        File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
+
+        for(int i = 0; i < ccrawlFolders.length; i++) {
+            File ccrawlFolder = ccrawlFolders[i];
+            System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
+            ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
+        }
+
+        // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+        // The former is the only unique one. seedURLs and regex-urlfilters are
+        // repeated on a per site/domain basis too, stored in the sites folder
         File seedURLsFile = new File(outFolder, "seedURLs.txt");
         File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
-        ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
+        File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
+        ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);
 
         System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
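Pieced together from the file and folder names in the diff above (the 00001 site number is only an example), a run should leave the output folder looking roughly like:

    outFolder/
        all-domain-urls.txt          global list of all unique domains
        seedURLs.txt                 global seed URL list for nutch
        regex-urlfilter.txt          global nutch regex-urlfilter file
        sites/
            00001/
                seedURLs.txt         per-domain seed URLs
                regex-urlfilter.txt  per-domain regex-urlfilter
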
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33517 → r33552

@@ -4 +4 @@
 import java.io.*;
 import java.util.Properties;
-import java.util.zip.GZIPInputStream;
 import java.util.Iterator;
 import java.util.Set;
     
@@ -38 +37 @@
     static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
     static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
-
-    private final String WETFileID;
+
+    private final String crawlID;
+    private final int WETFileID;
     private final File inFile;
 
     
@@ -55 +55 @@
      * out to a uniquely named file in either the keep or discard folder depending on the WET
      * record's content length and number of lines of actual content (excluding WARC headers).
+     * @param inFile the warc.wet file whose WET records are to be processed
+     * @param crawlID is the ID of the commoncrawl containing this warc.wet file
+     * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
+     * as prefix to create unique filenames when storing each individual record)
      */
-    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
+    public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
         this.batchProcessor = batchProcessor;
 
         this.inFile = inFile;
-        // We just want a unique recordID prefix, which we get from the wet file name suffix:
+        this.crawlID = crawlID;
+
+        // We just want a unique recordID prefix, which we get from concatenating
+        // the commoncrawl ID with the wet file name suffix and record count within the file:
         // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
         // the prefix will be everything after the last hyphen and without file extension,
-        // so "000000" in our example. Then suffix the recordCount (keeping track of the current
-        // WET record) to get a unique filename to store each WET record into.
+        // so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
+        // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
+        // of the current WET record to get a unique filename to store each WET record into.
+        // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
+        // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)
 
         String fileID = inFile.getName();
-        fileID = fileID.substring(fileID.lastIndexOf("-")+1);
-        fileID = fileID.substring(0, fileID.indexOf("."));
-        this.WETFileID = fileID;
+        //System.err.println("*** Processing wetfile: " + fileID);
+        fileID = fileID.substring(fileID.lastIndexOf("0")+1);
+        if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
+            this.WETFileID = 0;
+        } else {
+            fileID = fileID.substring(0, fileID.indexOf("."));
+            this.WETFileID = Integer.parseInt(fileID);
+        }
     }
 
+    /**
+     * Processes all the WET records of a single warc.wet file
+     */
     public int processWETFile() {
         File keepURLsFile = this.batchProcessor.keepURLsFile;
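A worked example of the new WETFileID parsing (the first filename comes from the comment in the hunk above; the second is a hypothetical non-zero case):

    // "MAORI-CC-2019-30-20190902100139-000000.warc.wet"
    //     substring after the last '0' is ".warc.wet" -> starts with '.' -> WETFileID = 0
    // "MAORI-CC-2019-30-20190902100139-000015.warc.wet"
    //     substring after the last '0' is "15.warc.wet" -> cut at '.' -> "15" -> WETFileID = 15
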
     
@@ -177 +195 @@
                   String recordURI, String record)
     {
-        System.err.println("WET #" + this.WETFileID + " record #" + recordID
+        System.err.println("CrawlID: CC-MAIN-" + this.crawlID
+                   + " WET #" + this.WETFileID
+                   + " record #" + recordID
                    + " - contentLength: " + contentLength
                    + " - lineCount: " + lineCount);
     
@@ -184 +204 @@
     //System.err.println("--------------------------");
 
-    String paddedFileName = String.format("%04d.txt", recordID);
-
     File parentFolder = null;
-
 
     if(batchProcessor.isBlacklisted(recordURI)) {
     
@@ -289 +306 @@
 
     System.err.println("--------------------------");
-
-    File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
+
+    // outFilename will look something like YYYY-##-####
+    String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
+        //= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
+    File outFile = new File(parentFolder, outFilename);
 
     try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
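An illustrative check of the format string above, using the example from the constructor comment (record 15 of the *-000000.warc.wet file of crawl CC-MAIN-2019-30):

    String outFilename = String.format("%s-%02d-%04d", "2019-30", 0, 15);
    // outFilename == "2019-30-00-0015"; note the new names carry no .txt
    // extension, unlike the old "%04d.txt" naming this changeset replaces
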