Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33501

Timestamp:

2019-09-23T21:28:06+12:00 (5 years ago)

Author:

ak19

Message:

Refactored the code into two classes: the existing WETProcessor, which processes a single WET file that can contain a large number of WET records, and the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. The refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.

Location:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea

Files:

: 1 added
: 1 edited

CCWETProcessor.java (added)
WETProcessor.java (modified) (10 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

-              r33497
+              r33501
 */
 public class WETProcessor {
+    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
+    private static Properties configProperties = new Properties();
+    // In Java, static final variables can be initialized inside a static block.
+    // But the unavoidable try/catch in this static block prevents initialization of
+    // the static final int variables (seen further below) inside the block itself,
+    // so they need to be declared and initialized after the block instead.
+    static {
+    // load up the properties from the config file
+    try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
+        configProperties = new Properties();
+        configProperties.load(infile);
+        //infile.close();
+    } catch(Exception e) {
+        System.err.println("Exception attempting to read properties from config.properties.");
+        logger.error("Exception attempting to read properties from config.properties.");
+        e.printStackTrace();
+    }
+    }
+    // Providing fall-back cutoff values if config.properties doesn't load
+    // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
+    /*
+    private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
+    private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
+    private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
+    private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
+    */
+    private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
+    private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
+    private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
+    // File paths shared across WETProcessor instances
+    private static File discardFolder;
+    private static File keepFolder;
+    private static File keepURLsFile;
+    private static File discardURLsFile;
+    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
     // WARC WET header lines and header line prefixes of interest
 …
     static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
     static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
+    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
+    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
+    private final String WETFileID;
+    private final File inFile;
+    private int recordCount = 0;
+    /** Handle to a CCWETProcessor that processes a set of WET files
+     * Whereas a WETProcessor instance only processes a single WET file
+     * containing multiple WET records.
+     */
+    private CCWETProcessor batchProcessor;
-    // Keep a count of all the records that all WETProcessors instantiated
-    // by our main method combined have processed
-    //private static int recordCount = 0;
-    private final File outputFolder;
-    private final String WETFileID;
     /**
      * WET processor processes a single warc.wet file containing multiple WET records
 …
      * record's content length and number of lines of actual content (excluding WARC headers).
      */
+    public WETProcessor(File inFile, File outFolder) {
+    this.outputFolder = outFolder;
+    StringBuilder record = null;
+    String line = null;
+    boolean readingRecord = false;
+    String WARCtargetURI = "";
+    int recordCount = 0;
+    int contentLength = -1; // of record
+    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
+    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
+    this.batchProcessor = batchProcessor;
+    this.inFile = inFile;
     // We just want a unique recordID prefix, which we get from the wet file name suffix:
     // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
 …
     fileID = fileID.substring(0, fileID.indexOf("."));
     this.WETFileID = fileID;
+    }
+    public int processWETFile() {
+    File keepURLsFile = this.batchProcessor.keepURLsFile;
+    File discardURLsFile = this.batchProcessor.discardURLsFile;
+    StringBuilder record = null;
+    String line = null;
+    boolean readingRecord = false;
+    String WARCtargetURI = "";
+    //int recordCount = 0;
+    int contentLength = -1; // of record
+    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
     // read from WETfile
     try (
          BufferedReader reader = new BufferedReader(new FileReader(inFile));
+         BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
          BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
          BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
 …
         ioe.printStackTrace();
+    }
+    return recordCount;
+    }
+    public int getRecordCount() { return this.recordCount; }
     /**
 …
         // don't want a "translated" product site/online store
         // These curiously often tend to have "product(s)" in the URL
         parentFolder = WETProcessor.discardFolder;
+        parentFolder = batchProcessor.discardFolder;
+    }
     else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
         parentFolder = WETProcessor.keepFolder;
+        parentFolder = batchProcessor.keepFolder;
         System.err.println("@@@KEEPING");
     } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
 …
         // So we have at least 500 chars (possibly on a single wrapped line)
         // containing at least 10 spaces. Such a record is also worth keeping.
         parentFolder = WETProcessor.keepFolder;
+        parentFolder = batchProcessor.keepFolder;
+        }
+    }
     */
     if(isInDiscardFilter(recordURI)) {
         parentFolder = WETProcessor.discardFolder;
+    }
     else if(isInCheckFilter(recordURI)) { // products sites
         parentFolder = WETProcessor.discardFolder; // TODO: checkfolder
+    if(batchProcessor.isBlacklisted(recordURI)) {
+        parentFolder = batchProcessor.discardFolder;
+    }
+    else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
+        parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
     } else {
         // If a web page's WET record contains a certain minimum number of words,
 …
         // In Maori, word length of 1 is not uncommon
         // but let's skip camelcased words when counting valid words
         else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
+        else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++;
+        }
         // dump if too many camelcase words (ideally keep none of that kind?)
         if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
         parentFolder = WETProcessor.discardFolder;
+        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
+        parentFolder = batchProcessor.discardFolder;
         System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
+        }
         else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
         parentFolder = WETProcessor.keepFolder;
+        else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
+        parentFolder = batchProcessor.keepFolder;
         System.err.println("@@@KEEPING");
+        }
 …
     // if parentFolder still not set, set to discard pile folder
     if(parentFolder == null) {
         parentFolder = WETProcessor.discardFolder;
+        parentFolder = batchProcessor.discardFolder;
         System.err.println("@@@DISCARDING");
+    }
     try {
         if (parentFolder == WETProcessor.keepFolder) {
+        if (parentFolder == batchProcessor.keepFolder) {
         keepURLsWriter.write(recordURI + "\n");
         } else {
 …
+    }
+    }
-    /**
-     * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
-     * As output produces the URL seed list and regex-urlfilter text files required by nutch,
-     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
-     */
-    public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
-    // Maintain Sets of unique domains and urls
-    // TreeSet: by default, "the elements are ordered using their natural ordering"
-    // (or by a Comparator provided at set creation time).
-    // Whereas HashSet doesn't guarantee ordering.
-    // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
-    Set<String> domainsSet = new TreeSet<String>();
-    Set<String> urlsSet = new TreeSet<String>();
-    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
-    try (
-         BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
-         ) {
-        // read a URL at a time from urlsFile
-        String url = null;
-        String domain = null;
-        while((url = reader.readLine()) != null) { // readLine removes newline separator
-        // work out domain. This retains any www. or subdomain prefix:
-        int startIndex = url.indexOf("//"); // http:// or https:// prefix
-        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
-        domain = url.substring(startIndex);
-        int endIndex = domain.indexOf("/");
-        if(endIndex == -1) endIndex = domain.length();
-        domain = domain.substring(0, endIndex);
-        //if(!domainsMap.containsKey(domain)) {
-        urlsSet.add(url);
-        domainsSet.add(domain);
-        //}
+        }
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
+    }
-    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
-        Iterator<String> i = urlsSet.iterator();
-        while(i.hasNext()) {
-        String url = i.next();
-        seedURLsWriter.write(url + "\n");
+        }
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
+    }
-    try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
-        Iterator<String> i = domainsSet.iterator();
-        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-        while(i.hasNext()) {
-        String domain = i.next();
-        domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
-        urlFilterWriter.write(domain + "\n");
+        }
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
+    }
+    }
-    /**
-     * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
-     * whether it is in the discard list.
-     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
-     * By not having this method deal with actual regex for filters, this has the advantage that
-     * we don't have to remember to escape or double escape each filter to turn it into a regex.
-     */
-    public boolean isInDiscardFilter(String url) {
-    String discardFilterFile = "url-discard-filter.txt"; // in conf folder
-    try (
-         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
-         ) {
-        String filter = null;
-        while((filter = reader.readLine()) != null) {
-        if(filter.trim().equals("")) {
-            continue;
+        }
-        //System.err.println("Got filter: " + filter);
-        if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
-            System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
+        }
-        else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
-            System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
-            return true;
+        }
-        else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
-            System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
-            return true;
+        }
-        else if(url.contains(filter)) {
-            System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
-            return true;
+        }
+        }
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
+    }
-    return false;
+    }
-    // TODO
-    public boolean isInCheckFilter(String url) {
-    //System.err.println("isInCheckFilter(url) is not yet implemented");
-    return false;
+    }
-    //public static int getRecordCount() { return recordCount; }
-    public static void printUsage() {
-    System.err.println("Run this program as:");
-    System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
+    }
-    /** Filename filter to only list warc.wet files or else warc.wet.gz files
-     * for which unzipped warc.wet equivalents don't yet exist.
-     */
-    private static class WETFilenameFilter implements FilenameFilter {
-    public boolean accept(File dir, String name) {
-        if(name.endsWith(".warc.wet")) {
-        logger.debug("Will include " + name + " for processing.");
-        return true;
+        }
-        if(name.endsWith(".warc.wet.gz")) {
-        String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
-        File unzippedVersion = new File(dir, nameWithoutGZext);
-        if(unzippedVersion.exists()) {
-            logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
-            logger.debug("Skipping " + name);
-            return false; // don't count gzipped version if unzipped version exists.
+        }
-        else {
-            logger.debug("Only zipped version " + name + " exists.");
-            return true; // No unzipped version, so have to work with gzipped version
+        }
+        }
-        // we're not even interested in any other file extensions
-        logger.debug("Not a WET file. Skipping " + name);
-        return false;
+    }
+    }
-    public static void main(String[] args) {
-    if(args.length != 2) {
-        printUsage();
-        return;
+    }
-    File WETFileDir = new File(args[0]);
-    if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
-        System.out.println("Error: " + args[0] + " does not exist or is not a directory");
-        return;
+    }
-    File outFolder = new File(args[1]);
-    if(!outFolder.exists() || !outFolder.isDirectory()) {
-        System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
-        return;
+    }
-    // static folders and files to be shared across all WETProcessor instances
-    WETProcessor.discardFolder = new File(outFolder, "discard");
-    if(!WETProcessor.discardFolder.exists()) {
-        WETProcessor.discardFolder.mkdir();
+    }
-    WETProcessor.keepFolder = new File(outFolder, "keep");
-    if(!WETProcessor.keepFolder.exists()) {
-        WETProcessor.keepFolder.mkdir();
+    }
-    WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");
-    if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
-        System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
-        return;
+    }
-    WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
-    if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
-        System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
-        return;
+    }
-    // Will list all the warc.wet files in the input directory or else their gzipped versions
-    File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
-    for(int i = 0; i < WETFiles.length; i++) {
-        File WETFile = WETFiles[i];
-        logger.debug("Processing WETfile: " + WETFile);
-        // Any .gz files listed means they haven't been unzipped yet. So unzip.
-        String WETFilename = WETFile.toString();
-        if(WETFilename.endsWith(".gz")) {
-        File GZippedWETFile = WETFile;
-        String WETGZippedFilename = WETFilename;
-        WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));
-        WETFile = new File(WETFilename);
-        Utility.unzipFile(GZippedWETFile, WETFile);
+        }
-        // hereafter all WETFiles should refer to the unzipped version
-        // Check the unzipped WETFile exists
-        if(!WETFile.exists() || !WETFile.isFile()) {
-        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
-        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
-        return;
+        }
-        // Finally, we can process this WETFile's records into the keep and discard pile
-        logger.debug("Off to process " + WETFile);
-        WETProcessor processor = new WETProcessor(WETFile, outFolder);
+    }
-    File seedURLsFile = new File(outFolder, "seedURLs.txt");
-    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
-    WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
-    return;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33501

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

Download in other formats: