Changeset 33552

Timestamp:
04.10.2019 22:00:46
Author:
ak19
Message:

1. Code now processes the ccrawldata folder, which contains each individual common crawl folder (CC-MAIN-YYYY-##) of warc.wet(.gz) files. 2. Added a global file containing all the domains we're going to crawl. 3. WET records we're keeping, which are stored in individual files, now have better filenames.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified (no prefix)
Added (+)
Removed (-)
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33519 → r33552

    …
        // File paths shared across WETProcessor instances
    -   public final File WETFilesDir;
    +   public final File commoncrawlDir;
        public final File outputFolder;
        public final File discardFolder;
    …
        public CCWETProcessor(File inFolder, File outFolder) throws Exception {
    -       this.WETFilesDir = inFolder;
    +       this.commoncrawlDir = inFolder;
            this.outputFolder = outFolder;
    …
        /**
    -    * Using the keepURLs.txt file generated by running WETProcessor instances, produces
    +    * Using the keepURLs.txt file generated by running WETProcessor instances, this produces
         * as output the URL seed list and regex-urlfilter text files required by nutch, see
         * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
         */
    -   public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile) {
    +   public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
            // Maintain Sets of unique domains and urls
            // TreeSet: by default, "the elements are ordered using their natural ordering"
    …
            domainsToURLsMap = new TreeMap<String, Set<String>>();

    -       final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
    +       final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt

            try (
    …
            }

    +       // We'd have pruned out duplicates by now and have a sorted list of domains,
    +       // each of which maps to seed URLs in the commoncrawl for that domain
    +
            /*
            try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
    …
            }
            */

            int domainCount = 0;
            File sitesFolder = new File(outputFolder, "sites");
    …
            // (urls with tab up front)
            try (
    +            // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
    +            BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
                 BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
                 BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
    …
                urlFilterWriter.write(regexed_domain + "\n");

    -           // for every domain, we need sites/0000x/ folder containing its own
    -           // INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
    +           // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
    +           // its own INDIVIDUAL seedURLs.txt and regex-urlfilter.txt
                // We still have a global seedURLs.txt and regex-urlfilter.txt too.
                File siteSeedsFile = new File(domainFolder, "seedURLs.txt"); // e.g. sites/00001/seedURLs.txt
    …
                     BufferedWriter siteRegexWriter = new BufferedWriter(new FileWriter(siteRegexFile));
                     ) {
    -               // only write urls and no domain into single global seedurls file
    -               // But write domain and tabbed urls into individual sites/0000x.txt files
    -               // and write regexed domain into it too
    +
    +               // write all sorted unique domains into global domains file
    +               domainURLsWriter.write(domain + "\n");
    +
    +               // Only write urls and no domain into single global seedurls file
    +               // But write domain and tabbed urls into individual sites/0000#/seedURLs.txt
    +               // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt)
    +               // If we ever run nutch on a single seedURLs listing containing
    +               // all seed pages to crawl sites from, the above two files will work for that.
                    siteURLsWriter.write(domain + "\n");
                    siteRegexWriter.write(regexed_domain + "\n");
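A sketch, not part of the changeset: the hunk above maintains a global all-domain-urls.txt, seedURLs.txt and regex-urlfilter.txt while also writing per-domain copies under sites/0000x/. The following minimal, self-contained Java approximation of that write pattern assumes a pre-built domainsToURLsMap; the class name and sample data are hypothetical, while the file names, tab-fronted URL layout and 5-digit site folder numbering are taken from the diff:

    import java.io.*;
    import java.util.*;

    // Sketch: one global domains file and global seed list, plus a per-domain
    // sites/0000x/seedURLs.txt, mirroring the write pattern in the hunk above.
    public class SeedFileLayoutSketch {
        public static void main(String[] args) throws IOException {
            // hypothetical stand-in for the real domainsToURLsMap built from WET records
            TreeMap<String, Set<String>> domainsToURLsMap = new TreeMap<>();
            domainsToURLsMap.put("www.example.nz", new TreeSet<>(
                Arrays.asList("https://www.example.nz/a", "https://www.example.nz/b")));

            File outFolder = new File("to_crawl");
            File sitesFolder = new File(outFolder, "sites");
            sitesFolder.mkdirs(); // also creates outFolder

            int domainCount = 0;
            try (BufferedWriter domainURLsWriter = new BufferedWriter(
                     new FileWriter(new File(outFolder, "all-domain-urls.txt")));
                 BufferedWriter seedURLsWriter = new BufferedWriter(
                     new FileWriter(new File(outFolder, "seedURLs.txt")))) {
                for (Map.Entry<String, Set<String>> entry : domainsToURLsMap.entrySet()) {
                    String domain = entry.getKey();
                    domainCount++;
                    // each domain gets its own zero-padded folder, e.g. sites/00001
                    File domainFolder = new File(sitesFolder, String.format("%05d", domainCount));
                    domainFolder.mkdirs();

                    domainURLsWriter.write(domain + "\n"); // global: sorted unique domains only

                    try (BufferedWriter siteURLsWriter = new BufferedWriter(
                             new FileWriter(new File(domainFolder, "seedURLs.txt")))) {
                        siteURLsWriter.write(domain + "\n"); // per-site file leads with the domain
                        for (String url : entry.getValue()) {
                            seedURLsWriter.write(url + "\n");        // global seed list: urls only
                            siteURLsWriter.write("\t" + url + "\n"); // per-site: tab-fronted urls
                        }
                    }
                }
            }
        }
    }

Keeping the two global writers open across the whole domain loop while opening the per-site writer once per domain mirrors the try-with-resources nesting in the hunk above.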
     
    …
        /**
    -    * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
    -    * whether it is in the discard list.
    +    * Checks URL parameter against each line ("filter") of conf/url-black|grey|whitelist-filter.txt to decide
    +    * whether it is in the mentioned black|grey|white list.
         * Filters don't represent actual regex, just ^ and $ as start and end terminators.
         * By not having this method deal with actual regex for filters, this has the advantage that
    …
        //public void addToRecordCount(int count) { this.totalRecordCount += count; }
        public void setRecordCount(int count) { this.totalRecordCount = count; }
    +
    +   public void processAllWETFilesOfCrawl(File ccrawlWETFileDir) {
    +
    +       // Will list all the warc.wet files in the input directory or else their gzipped versions
    +       File[] WETFiles = ccrawlWETFileDir.listFiles(new WETFilenameFilter());
    +
    +       int wetRecordCount = 0;
    +       int wetFileCount = 0;
    +
    +       for(int i = 0; i < WETFiles.length; i++) {
    +           File WETFile = WETFiles[i];
    +           logger.debug("Processing WETfile: " + WETFile);
    +
    +           // Any .gz files listed means they haven't been unzipped yet. So unzip.
    +           String WETFilename = WETFile.toString();
    +           if(WETFilename.endsWith(".gz")) {
    +               File GZippedWETFile = WETFile;
    +               String WETGZippedFilename = WETFilename;
    +               WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
    +
    +               WETFile = new File(WETFilename);
    +               Utility.unzipFile(GZippedWETFile, WETFile);
    +           }
    +           // hereafter all WETFiles should refer to the unzipped version
    +           // Check the unzipped WETFile exists
    +
    +           if(!WETFile.exists() || !WETFile.isFile()) {
    +               System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    +               logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    +               return;
    +           }
    +
    +           // Finally, we can process this WETFile's records into the keep and discard pile
    +           wetFileCount++;
    +           logger.debug("Off to process " + WETFile);
    +           String crawlID = ccrawlWETFileDir.getName(); // something like CC-MAIN-YYYY-##-wet-files
    +           crawlID = crawlID.substring("CC-MAIN-".length(), crawlID.indexOf("-wet-files")); // YYYY-##
    +           WETProcessor wetFileProcessor = new WETProcessor(WETFile, crawlID, this);
    +           wetFileProcessor.processWETFile();
    +           wetRecordCount += wetFileProcessor.getRecordCount();
    +       }
    +
    +       // for information purposes
    +       this.setWETFileCount(wetFileCount);
    +       this.setRecordCount(wetRecordCount);
    +   }

        public static void printUsage() {
    …
            }
        }
    +
    +
    +   private static class CCrawlWETFolderFilenameFilter implements FilenameFilter {
    +
    +       public boolean accept(File dir, String name) {
    +           File f = new File(dir, name);
    +           if(f.isDirectory()) {
    +               if(name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files")) {
    +                   return true;
    +               }
    +           }
    +           else {
    +               System.err.println("File " + f + " is not a directory");
    +           }
    +           return false;
    +       }
    +   }

        public static void main(String[] args) {
    …
            }

    -       File WETFileDir = new File(args[0]);
    -       if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
    +       File commoncrawlDir = new File(args[0]);
    +       if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
                System.out.println("Error: " + args[0] + " does not exist or is not a directory");
                return;
    …

            try {
    -           CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
    -
    -           //ccWETFilesProcessor.processAllWETFiles();
    -
    -           // Will list all the warc.wet files in the input directory or else their gzipped versions
    -           File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
    -
    -           int wetRecordCount = 0;
    -           int wetFileCount = 0;
    -
    -           for(int i = 0; i < WETFiles.length; i++) {
    -               File WETFile = WETFiles[i];
    -               logger.debug("Processing WETfile: " + WETFile);
    -
    -               // Any .gz files listed means they haven't been unzipped yet. So unzip.
    -               String WETFilename = WETFile.toString();
    -               if(WETFilename.endsWith(".gz")) {
    -                   File GZippedWETFile = WETFile;
    -                   String WETGZippedFilename = WETFilename;
    -                   WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
    -
    -                   WETFile = new File(WETFilename);
    -                   Utility.unzipFile(GZippedWETFile, WETFile);
    -               }
    -               // hereafter all WETFiles should refer to the unzipped version
    -               // Check the unzipped WETFile exists
    -
    -               if(!WETFile.exists() || !WETFile.isFile()) {
    -                   System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    -                   logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    -                   return;
    -               }
    -
    -               // Finally, we can process this WETFile's records into the keep and discard pile
    -               wetFileCount++;
    -               logger.debug("Off to process " + WETFile);
    -               WETProcessor wetFileProcessor = new WETProcessor(WETFile, ccWETFilesProcessor);
    -               wetFileProcessor.processWETFile();
    -               wetRecordCount += wetFileProcessor.getRecordCount();
    -           }
    -
    -           // for information purposes
    -           ccWETFilesProcessor.setWETFileCount(wetFileCount);
    -           ccWETFilesProcessor.setRecordCount(wetRecordCount);
    -
    +           CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(commoncrawlDir, outFolder);
    +
    +           File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
    +
    +           for(int i = 0; i < ccrawlFolders.length; i++) {
    +               File ccrawlFolder = ccrawlFolders[i];
    +               System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
    +               ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
    +           }
    +
    +           // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
    +           // The former is the only unique one. seedURLs and regex-urlfilters are
    +           // repeated on a per site/domain basis too, stored in the sites folder
                File seedURLsFile = new File(outFolder, "seedURLs.txt");
                File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    -           ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
    +           File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
    +           ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);

                System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
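For reference, the new top-level flow reduces to: accept only subfolders of the ccrawldata directory named like CC-MAIN-YYYY-##-wet-files, hand each to processAllWETFilesOfCrawl(), and derive the short crawlID by stripping the CC-MAIN- prefix and -wet-files suffix. A standalone sketch of just that filtering and naming logic follows; the folder pattern and substring arithmetic come from the diff, while the class name and the lambda form of the filter are illustrative:

    import java.io.File;
    import java.io.FilenameFilter;

    // Sketch: pick out CC-MAIN-YYYY-##-wet-files folders and derive "YYYY-##"
    public class CrawlFolderSketch {
        static String crawlIDOf(File ccrawlWETFileDir) {
            String crawlID = ccrawlWETFileDir.getName();          // e.g. CC-MAIN-2019-30-wet-files
            return crawlID.substring("CC-MAIN-".length(),
                                     crawlID.indexOf("-wet-files")); // -> "2019-30"
        }

        public static void main(String[] args) {
            FilenameFilter ccrawlFolderFilter = (dir, name) ->
                new File(dir, name).isDirectory()
                && name.matches("CC-MAIN-\\d{4}-\\d{2}-wet-files");

            File commoncrawlDir = new File(args.length > 0 ? args[0] : "ccrawldata");
            File[] ccrawlFolders = commoncrawlDir.listFiles(ccrawlFolderFilter);
            if (ccrawlFolders == null) {
                System.err.println(commoncrawlDir + " is not a readable directory");
                return;
            }
            for (File ccrawlFolder : ccrawlFolders) {
                System.out.println(ccrawlFolder.getName() + " -> crawlID " + crawlIDOf(ccrawlFolder));
            }
        }
    }

Run against a ccrawldata folder holding CC-MAIN-2019-30-wet-files, this prints "CC-MAIN-2019-30-wet-files -> crawlID 2019-30".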
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33517 → r33552

        import java.io.*;
        import java.util.Properties;
    -   import java.util.zip.GZIPInputStream;
        import java.util.Iterator;
        import java.util.Set;
    …
        static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
        static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    -   private final String WETFileID;
    +   private final String crawlID;
    +   private final int WETFileID;
        private final File inFile;
    …
         * out to a uniquely named file in either the keep or discard folder depending on the WET
         * record's content length and number of lines of actual content (excluding WARC headers).
    +    * @param inFile the warc.wet file whose WET records are to be processed
    +    * @param crawlID is the ID of the commoncrawl containing this warc.wet file
    +    * and is of the format YYYY-## (of full crawlID CC-MAIN-YYYY-##) which will be used
    +    * as prefix to create unique filenames when storing each individual record).
         */
    -   public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
    +   public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
            this.batchProcessor = batchProcessor;

            this.inFile = inFile;
    -       // We just want a unique recordID prefix, which we get from the wet file name suffix:
    +       this.crawlID = crawlID;
    +
    +       // We just want a unique recordID prefix, which we get from concatenating
    +       // the commoncrawl ID with the wet file name suffix and record count within the file:
            // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
            // the prefix will be everything after the last hyphen and without file extension,
    -       // so "000000" in our example. Then suffix the recordCount (keeping track of the current
    -       // WET record) to get a unique filename to store each WET record into.
    +       // so "000000" in our example. Then converted into a number and padded to 2, e.g. 00.
    +       // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
    +       // of the current WET record to get a unique filename to store each WET record into.
    +       // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
    +       // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)

            String fileID = inFile.getName();
    -       fileID = fileID.substring(fileID.lastIndexOf("-")+1);
    -       fileID = fileID.substring(0, fileID.indexOf("."));
    -       this.WETFileID = fileID;
    +       //System.err.println("*** Processing wetfile: " + fileID);
    +       fileID = fileID.substring(fileID.lastIndexOf("0")+1);
    +       if(fileID.startsWith(".")) { // took off too many zeroes, as happens with *000000.warc.wet
    +           this.WETFileID = 0;
    +       } else {
    +           fileID = fileID.substring(0, fileID.indexOf("."));
    +           this.WETFileID = Integer.parseInt(fileID);
    +       }
        }

    +   /**
    +    * Processes all the WET records of a single warc.wet file
    +    */
        public int processWETFile() {
            File keepURLsFile = this.batchProcessor.keepURLsFile;
    …
                      String recordURI, String record)
        {
    -       System.err.println("WET #" + this.WETFileID + " record #" + recordID
    +       System.err.println("CrawlID: CC-MAIN-" + this.crawlID
    +                  + " WET #" + this.WETFileID
    +                  + " record #" + recordID
                       + " - contentLength: " + contentLength
                       + " - lineCount: " + lineCount);
    …
            //System.err.println("--------------------------");

    -       String paddedFileName = String.format("%04d.txt", recordID);
    -
            File parentFolder = null;

            if(batchProcessor.isBlacklisted(recordURI)) {
    …

            System.err.println("--------------------------");

    -       File outFile = new File(parentFolder, this.WETFileID + "-" + paddedFileName);
    +       // outFilename will look something like YYYY-##-####
    +       String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
    +           //= this.crawlID + "-" + String.format("%02d", this.WETFileID) + "-" + String.format("%04d.txt", recordID);
    +       File outFile = new File(parentFolder, outFilename);

            try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
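The "better filenames" from the commit message combine three parts: the short crawlID (YYYY-##), a 2-digit WET file number parsed from the warc.wet name's numeric suffix, and a 4-digit record count, giving names like 2019-30-00-0015. A standalone sketch of that naming arithmetic, reusing the example filename from the diff's comments (the class and method names are hypothetical):

    // Sketch: build the unique per-record filename crawlID-WETFileID-recordID
    public class RecordNameSketch {
        static int wetFileIDOf(String wetFileName) {
            // e.g. MAORI-CC-2019-30-20190902100139-000000.warc.wet
            String fileID = wetFileName.substring(wetFileName.lastIndexOf("0") + 1);
            if (fileID.startsWith(".")) {
                return 0; // took off too many zeroes, as with *000000.warc.wet
            }
            return Integer.parseInt(fileID.substring(0, fileID.indexOf(".")));
        }

        public static void main(String[] args) {
            String crawlID = "2019-30"; // short form of CC-MAIN-2019-30
            String wetFileName = "MAORI-CC-2019-30-20190902100139-000000.warc.wet";
            int recordID = 15;          // 15th WET record within the file
            String outFilename = String.format("%s-%02d-%04d",
                                               crawlID, wetFileIDOf(wetFileName), recordID);
            System.out.println(outFilename); // prints 2019-30-00-0015
        }
    }

One caveat worth noting: because the parse strips everything up to the last literal "0" in the name, a suffix such as -000010 also yields WETFileID 0, so the scheme appears to assume WET file numbers don't end in a zero.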