Changeset 33501 for gs3-extensions


Ignore:
Timestamp:
2019-09-23T21:28:06+12:00 (5 years ago)
Author:
ak19
Message:

Refactored code into 2 classes: the existing WETProcessor, which processes a single WET file that can contain a large number of WET records, and the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. The refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33497 r33501  
    2929*/
    3030public class WETProcessor {
    31     private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());   
    32     private static Properties configProperties = new Properties();
    33 
    34     // In Java, can initialize static final variables inside a static block
    35     // But the unavoidable try/catch in this static block prevents initialization of
    36     // the static final int variables (seen further below) inside the block itself,
    37     // that therefore need to be declared and initialized thereafter.
    38     static {   
    39     // load up the properties from the config file
    40     try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
    41         configProperties = new Properties();
    42         configProperties.load(infile);     
    43         //infile.close();
    44        
    45     } catch(Exception e) {
    46         System.err.println("Exception attempting to read properties from config.properties.");
    47         logger.error("Exception attempting to read properties from config.properties.");
    48         e.printStackTrace();
    49     }
    50     }
    51    
    52     // Providing fall-back cutoff values if config.properties doesn't load
    53     // or doesn't have the named props. But what happens when Integer.parseInt throws an exception?
    54     /*
    55     private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100"));
    56     private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2"));
    57     private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500"));
    58     private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10"));
    59     */
    60     private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces)
    61     private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20"));
    62     private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10"));
    63    
    64     // File paths shared across WETProcessor instances
    65     private static File discardFolder;
    66     private static File keepFolder;
    67     private static File keepURLsFile;
    68     private static File discardURLsFile;
     31    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());
    6932
    7033    // WARC WET header lines and header line prefixes of interest
     
    7235    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo";
    7336    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:";
    74     static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";
     37    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";   
     38 
     39    private final String WETFileID;
     40    private final File inFile;
     41
     42    private int recordCount = 0;
     43
     44    /** Handle to a CCWETProcessor that processes a set of WET files
     45     * Whereas a WETProcessor instance only processes a single WET file
     46     * containing multiple WET records.
     47     */
     48    private CCWETProcessor batchProcessor;
    7549   
    76     // Keep a count of all the records that all WETProcessors instantiated
    77     // by our main method combined have processed
    78     //private static int recordCount = 0;
    79 
    80     private final File outputFolder;
    81     private final String WETFileID;
    82 
    83 
    8450    /**
    8551     * WET processor processes a single warc.wet file containing multiple WET records
     
    8854     * record's content length and number of lines of actual content (excluding WARC headers).
    8955     */
    90     public WETProcessor(File inFile, File outFolder) {
    91     this.outputFolder = outFolder;
    92    
    93     StringBuilder record = null;
    94     String line = null;
    95     boolean readingRecord = false;
    96 
    97     String WARCtargetURI = "";
    98    
    99     int recordCount = 0;
    100    
    101     int contentLength = -1; // of record
    102     int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
    103 
     56    public WETProcessor(File inFile, CCWETProcessor batchProcessor) {
     57    this.batchProcessor = batchProcessor;
     58   
     59    this.inFile = inFile;
    10460    // We just want a unique recordID prefix, which we get from the wet file name suffix:
    10561    // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet
     
    11268    fileID = fileID.substring(0, fileID.indexOf("."));
    11369    this.WETFileID = fileID;
    114 
     70    }
     71
     72    public int processWETFile() {
     73    File keepURLsFile = this.batchProcessor.keepURLsFile;
     74    File discardURLsFile = this.batchProcessor.discardURLsFile;
     75   
     76    StringBuilder record = null;
     77    String line = null;
     78    boolean readingRecord = false;
     79
     80    String WARCtargetURI = "";
     81   
     82    //int recordCount = 0;
     83   
     84    int contentLength = -1; // of record
     85    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers)
    11586       
    11687    // read from WETfile
    11788    try (
    118          BufferedReader reader = new BufferedReader(new FileReader(inFile));
     89         BufferedReader reader = new BufferedReader(new FileReader(this.inFile));
    11990         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
    12091         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
     
    185156        ioe.printStackTrace();
    186157    }
     158
     159    return recordCount;
    187160    }
     161
     162    public int getRecordCount() { return this.recordCount; }
    188163
    189164    /**
     
    217192        // don't want a "translated" product site/online store
    218193        // These curiously often tend to have "product(s)" in the URL
    219         parentFolder = WETProcessor.discardFolder;
     194        parentFolder = batchProcessor.discardFolder;
    220195    }
    221196
    222197    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
    223         parentFolder = WETProcessor.keepFolder;
     198        parentFolder = batchProcessor.keepFolder;
    224199        System.err.println("@@@KEEPING");
    225200    } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
     
    231206        // So we have at least 500 chars (possibly on a single wrapped line)
    232207        // containing at least 10 spaces. Such a record is also worth keeping.
    233         parentFolder = WETProcessor.keepFolder;
     208        parentFolder = batchProcessor.keepFolder;
    234209        }
    235210    }
    236211    */
    237212
    238     if(isInDiscardFilter(recordURI)) {
    239         parentFolder = WETProcessor.discardFolder;
    240     }
    241     else if(isInCheckFilter(recordURI)) { // products sites
    242         parentFolder = WETProcessor.discardFolder; // TODO: checkfolder
     213    if(batchProcessor.isBlacklisted(recordURI)) {
     214        parentFolder = batchProcessor.discardFolder;
     215    }
     216    else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
     217        parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
    243218    } else {
    244219        // If a web page's WET record contains a certain minimum number of words,
     
    266241        // In Maori, word length of 1 is not uncommon
    267242        // but let's skip camelcased words when counting valid words
    268         else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++;
     243        else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++;
    269244        }
    270245
    271246        // dump if too many camelcase words (ideally keep none of that kind?)
    272         if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) {
    273         parentFolder = WETProcessor.discardFolder;
     247        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
     248        parentFolder = batchProcessor.discardFolder;
    274249        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
    275250        }
    276         else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
    277         parentFolder = WETProcessor.keepFolder;
     251        else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
     252        parentFolder = batchProcessor.keepFolder;
    278253        System.err.println("@@@KEEPING");
    279254        }
     
    281256    // if parentFolder still not set, set to discard pile folder
    282257    if(parentFolder == null) {
    283         parentFolder = WETProcessor.discardFolder;
     258        parentFolder = batchProcessor.discardFolder;
    284259        System.err.println("@@@DISCARDING");
    285260    }
    286261
    287262    try {
    288         if (parentFolder == WETProcessor.keepFolder) {
     263        if (parentFolder == batchProcessor.keepFolder) {
    289264        keepURLsWriter.write(recordURI + "\n");
    290265        } else {
     
    310285    }
    311286    }
    312 
    313 
    314     /**
    315      * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
    316      * As output produces the URL seed list and regex-urlfilter text files required by nutch,
    317      * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
    318      */
    319     public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
    320     // Maintain Sets of unique domains and urls
    321     // TreeSet: by default, "the elements are ordered using their natural ordering"
    322     // (or by a Comparator provided at set creation time).
    323     // Whereas HashSet doesn't guarantee ordering.
    324     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
    325 
    326     Set<String> domainsSet = new TreeSet<String>();
    327     Set<String> urlsSet = new TreeSet<String>();
    328 
    329     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
    330    
    331     try (
    332          BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
    333          ) {
    334        
    335         // read a URL at a time from urlsFile
    336         String url = null;
    337         String domain = null;
    338         while((url = reader.readLine()) != null) { // readLine removes newline separator
    339        
    340         // work out domain. This retains any www. or subdomain prefix:
    341         int startIndex = url.indexOf("//"); // http:// or https:// prefix
    342         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
    343         domain = url.substring(startIndex);
    344         int endIndex = domain.indexOf("/");
    345         if(endIndex == -1) endIndex = domain.length();
    346         domain = domain.substring(0, endIndex);
    347 
    348         //if(!domainsMap.containsKey(domain)) {
    349         urlsSet.add(url);
    350         domainsSet.add(domain);
    351         //}
    352         }
    353     } catch (IOException ioe) {
    354         ioe.printStackTrace();
    355         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
    356     }
    357    
    358     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
    359         Iterator<String> i = urlsSet.iterator();
    360         while(i.hasNext()) {
    361         String url = i.next();
    362         seedURLsWriter.write(url + "\n");
    363         }
    364        
    365     } catch (IOException ioe) {
    366         ioe.printStackTrace();
    367         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
    368     }
    369 
    370     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
    371         Iterator<String> i = domainsSet.iterator();
    372         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
    373         while(i.hasNext()) {
    374         String domain = i.next();
    375         domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";       
    376         urlFilterWriter.write(domain + "\n");
    377         }
    378        
    379     } catch (IOException ioe) {
    380         ioe.printStackTrace();
    381         System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
    382     }
    383     }
    384 
    385     /**
    386      * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
    387      * whether it is in the discard list.
    388      * Filters don't represent actual regex, just ^ and $ as start and end terminators.
    389      * By not having this method deal with actual regex for filters, this has the advantage that
    390      * we don't have to remember to escape or double escape each filter to turn it into a regex.
    391      */
    392     public boolean isInDiscardFilter(String url) {
    393 
    394     String discardFilterFile = "url-discard-filter.txt"; // in conf folder
    395 
    396     try (
    397          BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
    398          ) {
    399         String filter = null;
    400         while((filter = reader.readLine()) != null) {
    401         if(filter.trim().equals("")) {
    402             continue;
    403         }
    404         //System.err.println("Got filter: " + filter);
    405         if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
    406             System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
    407         }
    408         else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
    409             System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
    410             return true;
    411         }
    412         else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
    413             System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
    414             return true;
    415         }
    416         else if(url.contains(filter)) {
    417             System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
    418             return true;
    419         }
    420                
    421         }
    422        
    423     } catch (IOException ioe) {
    424         ioe.printStackTrace();
    425         System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
    426     }
    427 
    428     return false;
    429     }
    430 
    431     // TODO
    432     public boolean isInCheckFilter(String url) {
    433     //System.err.println("isInCheckFilter(url) is not yet implemented");   
    434     return false;
    435     }
    436    
    437     //public static int getRecordCount() { return recordCount; }
    438    
    439     public static void printUsage() {
    440     System.err.println("Run this program as:");
    441     System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");   
    442     }
    443 
    444     /** Filename filter to only list warc.wet files or else warc.wet.gz files
    445      * for which unzipped warc.wet equivalents don't yet exist.
    446      */
    447     private static class WETFilenameFilter implements FilenameFilter {
    448    
    449     public boolean accept(File dir, String name) {
    450         if(name.endsWith(".warc.wet")) {
    451         logger.debug("Will include " + name + " for processing.");
    452         return true;
    453         }
    454 
    455         if(name.endsWith(".warc.wet.gz")) {
    456         String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz"));
    457         File unzippedVersion = new File(dir, nameWithoutGZext);
    458         if(unzippedVersion.exists()) {
    459             logger.debug("--- Unzipped version " + unzippedVersion + " exists.");
    460             logger.debug("Skipping " + name);
    461             return false; // don't count gzipped version if unzipped version exists.
    462         }
    463         else {
    464             logger.debug("Only zipped version " + name + " exists.");
    465             return true; // No unzipped version, so have to work with gzipped version
    466         }
    467         }
    468 
    469         // we're not even interested in any other file extensions
    470         logger.debug("Not a WET file. Skipping " + name);
    471         return false;
    472     }
    473     }
    474    
    475     public static void main(String[] args) {
    476     if(args.length != 2) {
    477         printUsage();
    478         return;
    479     }
    480 
    481    
    482     File WETFileDir = new File(args[0]);
    483     if(!WETFileDir.exists() || !WETFileDir.isDirectory()) {
    484         System.out.println("Error: " + args[0] + " does not exist or is not a directory");
    485         return;
    486     }
    487    
    488     File outFolder = new File(args[1]);
    489     if(!outFolder.exists() || !outFolder.isDirectory()) {
    490         System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
    491         return;
    492     }   
    493 
    494     // static folders and files to be shared across all WETProcessor instances
    495     WETProcessor.discardFolder = new File(outFolder, "discard");
    496     if(!WETProcessor.discardFolder.exists()) {
    497         WETProcessor.discardFolder.mkdir();
    498     }   
    499     WETProcessor.keepFolder = new File(outFolder, "keep");
    500     if(!WETProcessor.keepFolder.exists()) {
    501         WETProcessor.keepFolder.mkdir();
    502     }
    503 
    504     WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");   
    505     if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) {
    506         System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed.");
    507         return;
    508     }
    509     WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt");
    510     if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) {
    511         System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed.");
    512         return;
    513     }
    514    
    515     // Will list all the warc.wet files in the input directory or else their gzipped versions
    516     File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter());
    517 
    518     for(int i = 0; i < WETFiles.length; i++) {
    519         File WETFile = WETFiles[i];     
    520         logger.debug("Processing WETfile: " + WETFile);
    521 
    522         // Any .gz files listed means they haven't been unzipped yet. So unzip.
    523         String WETFilename = WETFile.toString();
    524         if(WETFilename.endsWith(".gz")) {
    525         File GZippedWETFile = WETFile;
    526         String WETGZippedFilename = WETFilename;
    527         WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));     
    528 
    529         WETFile = new File(WETFilename);
    530         Utility.unzipFile(GZippedWETFile, WETFile);
    531         }
    532         // hereafter all WETFiles should refer to the unzipped version
    533         // Check the unzipped WETFile exists       
    534 
    535         if(!WETFile.exists() || !WETFile.isFile()) {
    536         System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    537         logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
    538         return;
    539         }
    540 
    541         // Finally, we can process this WETFile's records into the keep and discard pile
    542         logger.debug("Off to process " + WETFile);
    543         WETProcessor processor = new WETProcessor(WETFile, outFolder);     
    544        
    545     }
    546 
    547     File seedURLsFile = new File(outFolder, "seedURLs.txt");
    548     File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    549     WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
    550    
    551     return;
    552     }
    553287}
Note: See TracChangeset for help on using the changeset viewer.