package org.greenstone.atea;

import java.io.*;
import java.util.Properties;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;

/**
 * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
 * A WETProcessor takes a warc.wet file and goes through all its WET records,
 * writing each WET record into its own file. Each file is put into the keep, discard or
 * greylisted folder and its URL is appended to the matching keep, discard or greylisted
 * text file, based on:
 *
 * 1. whether the URL is whitelisted, else greylisted, else blacklisted;
 * 2. and, if it is explicitly whitelisted or else neither greylisted nor blacklisted,
 *    whether there is enough content. Formerly, content-length and number of lines were
 *    used to determine if the content was sufficient. Now it is just the word count, where
 *    a MAXIMUM (not minimum) number of characters determines whether a string is a word.
 *
 * Explicit whitelisting has precedence over greylisting, which in turn takes precedence
 * over blacklisting.
 * However, even explicitly whitelisted URLs still need to have sufficient content to end
 * up in keepURLs.txt.
 *
 * See CCWETProcessor.java for compile instructions and how to run.
 */
public class WETProcessor {
    private static final Logger logger =
        Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());

    // WARC WET header lines and header line prefixes of interest
    static final String WARC_RECORD_START = "WARC/1.0";
    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";

    // Fully qualified so that the file's import list does not need to change.
    /** All reading and writing is done as UTF-8 rather than the platform default charset. */
    private static final java.nio.charset.Charset UTF_8 =
        java.nio.charset.StandardCharsets.UTF_8;

    /** Matches a word containing a lowercase letter immediately followed by an uppercase one. */
    private static final java.util.regex.Pattern CAMEL_CASE =
        java.util.regex.Pattern.compile(".*[a-z][A-Z].*");

    private final String crawlID;
    private final int WETFileID;
    private final File inFile;

    private int recordCount = 0;

    /**
     * Handle to a CCWETProcessor that processes a set of WET files,
     * whereas a WETProcessor instance only processes a single WET file
     * containing multiple WET records.
     */
    private CCWETProcessor batchProcessor;

    /**
     * Creates a processor for a single warc.wet file containing multiple WET records.
     * Each individual WET record is later written out to a uniquely named file in the
     * keep, greylisted or discard folder by {@link #processWETFile()}.
     *
     * @param inFile the warc.wet file whose WET records are to be processed
     * @param crawlID the ID of the commoncrawl containing this warc.wet file. It is of the
     *        format YYYY-## (of full crawlID CC-MAIN-YYYY-##) and is used as prefix to
     *        create unique filenames when storing each individual record.
     * @param batchProcessor the CCWETProcessor supplying the output folders, URL list files,
     *        black/grey/white lists and word-count thresholds
     * @throws NumberFormatException if no numeric file ID can be extracted from the name
     *         of inFile
     */
    public WETProcessor(File inFile, String crawlID, CCWETProcessor batchProcessor) {
        this.batchProcessor = batchProcessor;
        this.inFile = inFile;
        this.crawlID = crawlID;

        // We just want a unique recordID prefix, which we get from concatenating
        // the commoncrawl ID with the wet file name suffix and record count within the file:
        // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
        // The file ID is everything after the last hyphen and without file extension,
        // so "000000" in our example, converted into a number (and later padded to 2, e.g. 00).
        // Then prefix the crawlID and suffix the 4-digit padded recordCount keeping track
        // of the current WET record to get a unique filename to store each WET record into.
        // e.g. 2019-30-00-0015 is the 15th WET record in the *00.warc.wet file of the
        // common crawl CC-MAIN-2019-30 (15th WET record of CC-MAIN-2019-30-*-000000.warc.wet.gz)
        this.WETFileID = parseWETFileID(inFile.getName());
    }

    /**
     * Extracts the numeric WET file ID from a warc.wet file name: the trailing run of
     * digits in the part of the name that follows the last hyphen and precedes the first dot.
     * For example "MAORI-CC-2019-30-20190902100139-000010.warc.wet" yields 10.
     *
     * @param fileName the name (not path) of the warc.wet file
     * @return the file ID as a number, so "000000" yields 0
     * @throws NumberFormatException if there are no such digits or they overflow an int
     */
    private static int parseWETFileID(String fileName) {
        int dot = fileName.indexOf('.');
        String stem = (dot < 0) ? fileName : fileName.substring(0, dot);
        String suffix = stem.substring(stem.lastIndexOf('-') + 1);

        // walk back over the trailing ASCII digits
        int start = suffix.length();
        while (start > 0 && suffix.charAt(start - 1) >= '0' && suffix.charAt(start - 1) <= '9') {
            start--;
        }
        String digits = suffix.substring(start);
        if (digits.isEmpty()) {
            throw new NumberFormatException(
                "No numeric WET file ID found in file name: " + fileName);
        }
        return Integer.parseInt(digits);
    }

    /** Opens the given file for buffered reading as UTF-8. */
    private static BufferedReader openReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF_8));
    }

    /** Opens the given file for buffered writing as UTF-8, appending if requested. */
    private static BufferedWriter openWriter(File file, boolean append) throws IOException {
        return new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(file, append), UTF_8));
    }

    /**
     * Processes all the WET records of a single warc.wet file. Each record other than a
     * warcinfo record is classified and written out, and its URL is appended to the keep,
     * discard or greylisted URL list file of the batch processor.
     *
     * Known limitation: a body line that itself starts with "WARC/1.0" is treated as the
     * start of a new record.
     *
     * I/O errors are logged, not thrown.
     *
     * @return the number of non-warcinfo records seen so far by this instance
     */
    public int processWETFile() {
        File keepURLsFile = this.batchProcessor.keepURLsFile;
        File discardURLsFile = this.batchProcessor.discardURLsFile;
        File greyListedFile = this.batchProcessor.greyListedFile;

        StringBuilder record = null;
        String line = null;
        boolean readingRecord = false;
        // true from a record's "WARC/1.0" line until the blank line that ends its headers
        boolean inHeaders = false;

        String WARCtargetURI = "";

        //int recordCount = 0;

        int contentLength = -1; // of record, as declared by its Content-Length header
        int lineCount = -1; // number of non-empty lines in record body (excludes WET/WARC headers)

        // read from WETfile
        try (
            BufferedReader reader = openReader(this.inFile);
            BufferedWriter keepURLsWriter = openWriter(keepURLsFile, true); // true to append
            BufferedWriter discardURLsWriter = openWriter(discardURLsFile, true);
            BufferedWriter greyListedURLsWriter = openWriter(greyListedFile, true);
        ) {
            while ((line = reader.readLine()) != null) { // readLine removes newline separator

                if (line.startsWith(WARC_RECORD_START)) { // "WARC/1.0" means finished previous WET record
                    // process any previous record
                    if (record != null) {
                        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                         recordCount, contentLength, lineCount,
                                         WARCtargetURI, record.toString());
                    }

                    // get ready to start a new record, with no state carried over
                    recordCount++;
                    record = new StringBuilder();
                    readingRecord = true;
                    inHeaders = true;
                    WARCtargetURI = "";
                    contentLength = -1;
                    lineCount = -1;
                }

                if (!readingRecord) {
                    continue; // e.g. remaining lines of a dropped warcinfo record
                }

                record.append(line).append('\n'); // add back (unix style) line ending

                if (inHeaders) {
                    // Header lines are only interpreted inside the header block, so that
                    // body text which happens to look like a header can't alter the record.
                    if (line.startsWith(WARC_INFO_HEADER)) { // "WARC-Type: warcinfo"
                        // drop this record, which is just an info record not actual web page's text
                        readingRecord = false;
                        record = null;
                        recordCount--;
                    }
                    else if (line.startsWith(WARC_TARGET_URI_HEADER_PREFIX)) { // "WARC-Target-URI:"
                        WARCtargetURI = line.substring(WARC_TARGET_URI_HEADER_PREFIX.length()).trim();
                    }
                    else if (line.startsWith(WARC_CONTENT_LENGTH_HEADER_PREFIX)) { // "Content-Length:"
                        String contentLengthStr
                            = line.substring(WARC_CONTENT_LENGTH_HEADER_PREFIX.length()).trim();
                        try {
                            contentLength = Integer.parseInt(contentLengthStr);
                        } catch (NumberFormatException nfe) {
                            // contentLength is informational only, so don't abort the file
                            logger.warn("Unparsable Content-Length header: " + line, nfe);
                        }
                    }
                    else if (line.trim().isEmpty()) {
                        // blank line ends the headers: start counting body lines from here
                        inHeaders = false;
                        lineCount = 0;
                    }
                }
                else if (!line.trim().isEmpty()) {
                    lineCount++;
                }
            }

            // flush the last record. If it was a warcinfo record, record would be null here
            if (record != null) {
                processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
                                 recordCount, contentLength, lineCount,
                                 WARCtargetURI, record.toString());
                record = null;
            }

        } catch (IOException ioe) {
            logger.error("Error processing WET file " + this.inFile, ioe);
        }

        return recordCount;
    }

    /** @return the number of non-warcinfo WET records processed so far by this instance */
    public int getRecordCount() {
        return this.recordCount;
    }

    /**
     * Determines if a WET record belongs in the keep, greylisted or discard pile, then
     * writes out the WET record to a uniquely named file in that folder and writes the
     * record's URL to the corresponding URL list writer. Write errors are logged, not thrown.
     *
     * @param keepURLsWriter writer onto the list of URLs being kept
     * @param discardURLsWriter writer onto the list of URLs being discarded
     * @param greyListedURLsWriter writer onto the list of greylisted URLs
     * @param recordID number of this record within the WET file, used in the output filename
     * @param contentLength value of the record's Content-Length header (logged only)
     * @param lineCount number of non-empty lines in the record body (logged only)
     * @param recordURI value of the record's WARC-Target-URI header
     * @param record the full text of the WET record, headers included
     */
    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
                                  BufferedWriter greyListedURLsWriter,
                                  int recordID, int contentLength, int lineCount,
                                  String recordURI, String record)
    {
        logger.info("CrawlID: CC-MAIN-" + this.crawlID
                    + " WET #" + this.WETFileID
                    + " record #" + recordID
                    + " - contentLength: " + contentLength
                    + " - lineCount: " + lineCount);
        logger.info("URI: " + recordURI);
        //logger.debug(record);
        //logger.info("--------------------------");

        File parentFolder = chooseFolder(recordURI, record);

        // The folder fields are compared by reference, as they are the very objects handed out
        // by the batch processor.
        BufferedWriter urlWriter;
        if (parentFolder == batchProcessor.keepFolder) {
            urlWriter = keepURLsWriter;
        } else if (parentFolder == batchProcessor.greyListedFolder) {
            urlWriter = greyListedURLsWriter;
        } else {
            urlWriter = discardURLsWriter;
        }
        try {
            urlWriter.write(recordURI + "\n");
        } catch (IOException e) {
            logger.error("Unable to write URL " + recordURI, e);
        }

        logger.debug("--------------------------");

        // outFilename will look something like YYYY-##-##-####
        String outFilename = String.format("%s-%02d-%04d", this.crawlID, this.WETFileID, recordID);
        File outFile = new File(parentFolder, outFilename);

        // try-with-resources closes the writer, so no explicit close() is needed
        try (BufferedWriter writer = openWriter(outFile, false)) {
            writer.write(record);
        } catch (IOException ioe) {
            logger.error("@@@@@@@@@ Error writing to file " + outFile, ioe);
        }
    }

    /**
     * Decides which folder a WET record goes into.
     * Explicit whitelisting overrides greylisting, which overrides blacklisting. A URL that
     * is whitelisted, or on none of the lists, is kept only if its record has a sufficient
     * number of valid words; otherwise it is discarded.
     *
     * @param recordURI the URL of the WET record
     * @param record the full text of the WET record
     * @return the keep, greylisted or discard folder of the batch processor
     */
    private File chooseFolder(String recordURI, String record) {
        if (batchProcessor.isBlacklisted(recordURI)) {
            // explicit whitelisting overrides blacklisting,
            // and if not whitelisted, then greylisting still overrides blacklisting
            if (!batchProcessor.isWhitelisted(recordURI)) {
                if (batchProcessor.isGreylisted(recordURI)) {
                    logger.debug("@@@GREYLISTED");
                    return batchProcessor.greyListedFolder;
                }
                // url was only blacklisted
                logger.debug("@@@DISCARDING - blacklisted");
                return batchProcessor.discardFolder;
            }
        }
        else if (batchProcessor.isGreylisted(recordURI) // e.g. products sites
                 && !batchProcessor.isWhitelisted(recordURI)) { // whitelisting overrides greylisting
            logger.debug("@@@GREYLISTED");
            return batchProcessor.greyListedFolder;
        }

        // The URL was not blacklisted/greylisted, or was explicitly whitelisted. Either way it
        // is only a candidate for the keep list: it still needs sufficient content for
        // language analysis. We don't care about the combination of number of lines and
        // content-length, we just care about the number of "valid words" as defined by us.
        // Content with many camelcased words is not discounted as such; camelcased words are
        // merely left out of the count of valid words.
        if (countValidWords(record) >= batchProcessor.MIN_NUM_WORDS) {
            logger.debug("@@@KEEPING");
            return batchProcessor.keepFolder;
        }

        // insufficient content, so discarded even if the URL was whitelisted
        logger.debug("@@@DISCARDING");
        return batchProcessor.discardFolder;
    }

    /**
     * Counts the "valid words" in a WET record. If a web page's WET record contains a certain
     * minimum number of these, we think it's a meaningful web page with sufficient content
     * for text analysis to have been successful.
     * A word is a run of 1 up to the batch processor's MAX_WORD_LENGTH characters demarcated
     * by whitespace. Any more characters may point to words having been glued together,
     * which is used by irrelevant sites and moreover can't be analysed for language.
     * Camelcased words of 5 or more characters are another case of words glued together and
     * are not counted either. In Maori, a word length of 1 is not uncommon.
     *
     * @param record the full text of the WET record
     * @return the number of valid words in the record
     */
    private int countValidWords(String record) {
        // String.split(regex) splits on general whitespace; runs of whitespace produce
        // empty strings, which the minimum length of 1 below excludes.
        String[] allWords = record.split("\\s");

        int validWordCount = 0;
        for (String word : allWords) {
            int length = word.length();
            if (length >= 5 && CAMEL_CASE.matcher(word).matches()) {
                continue; // skip camelcased words when counting valid words
            }
            if (length >= 1 && length <= batchProcessor.MAX_WORD_LENGTH) {
                validWordCount++;
            }
        }
        return validWordCount;
    }
}