Changeset 33501

Show
Ignore:
Timestamp:
23.09.2019 21:28:06 (3 weeks ago)
Author:
ak19
Message:

Refactored code into 2 classes: the existing WETProcessor, which processes a single WET file that can contain a large number of WET records, and the new CCWETProcessor, which stores configuration info for processing all the WET files belonging to a common-crawl. This refactoring will make it easier to prepare the blacklist and greylist and share them across WETProcessor instances.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33497 r33501  
    2929*/ 
    3030public class WETProcessor { 
    31     private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName());     
    32     private static Properties configProperties = new Properties(); 
    33  
    34     // In Java, can initialize static final variables inside a static block 
    35     // But the unavoidable try/catch in this static block prevents initialization of 
    36     // the static final int variables (seen further below) inside the block itself, 
    37     // that therefore need to be declared and initialized thereafter. 
    38     static {     
    39     // load up the properties from the config file 
    40     try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { 
    41         configProperties = new Properties(); 
    42         configProperties.load(infile);       
    43         //infile.close(); 
    44          
    45     } catch(Exception e) { 
    46         System.err.println("Exception attempting to read properties from config.properties."); 
    47         logger.error("Exception attempting to read properties from config.properties."); 
    48         e.printStackTrace(); 
    49     } 
    50     } 
    51      
    52     // Providing fall-back cutoff values if config.properties doesn't load 
    53     // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 
    54     /* 
    55     private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 
    56     private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 
    57     private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 
    58     private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 
    59     */ 
    60     private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces) 
    61     private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20")); 
    62     private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10")); 
    63      
    64     // File paths shared across WETProcessor instances 
    65     private static File discardFolder; 
    66     private static File keepFolder; 
    67     private static File keepURLsFile; 
    68     private static File discardURLsFile; 
     31    private static Logger logger = Logger.getLogger(org.greenstone.atea.WETProcessor.class.getName()); 
    6932 
    7033    // WARC WET header lines and header line prefixes of interest 
     
    7235    static final String WARC_INFO_HEADER = "WARC-Type: warcinfo"; 
    7336    static final String WARC_TARGET_URI_HEADER_PREFIX = "WARC-Target-URI:"; 
    74     static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:"; 
     37    static final String WARC_CONTENT_LENGTH_HEADER_PREFIX = "Content-Length:";     
     38   
     39    private final String WETFileID; 
     40    private final File inFile; 
     41 
     42    private int recordCount = 0; 
     43 
     44    /** Handle to a CCWETProcessor that processes a set of WET files 
     45     * Whereas a WETProcessor instance only processes a single WET file 
     46     * containing multiple WET records. 
     47     */ 
     48    private CCWETProcessor batchProcessor; 
    7549     
    76     // Keep a count of all the records that all WETProcessors instantiated 
    77     // by our main method combined have processed 
    78     //private static int recordCount = 0; 
    79  
    80     private final File outputFolder; 
    81     private final String WETFileID; 
    82  
    83  
    8450    /** 
    8551     * WET processor processes a single warc.wet file containing multiple WET records 
     
    8854     * record's content length and number of lines of actual content (excluding WARC headers). 
    8955     */ 
    90     public WETProcessor(File inFile, File outFolder) { 
    91     this.outputFolder = outFolder; 
    92      
    93     StringBuilder record = null; 
    94     String line = null;  
    95     boolean readingRecord = false; 
    96  
    97     String WARCtargetURI = ""; 
    98      
    99     int recordCount = 0; 
    100      
    101     int contentLength = -1; // of record 
    102     int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers) 
    103  
     56    public WETProcessor(File inFile, CCWETProcessor batchProcessor) { 
     57    this.batchProcessor = batchProcessor; 
     58     
     59    this.inFile = inFile; 
    10460    // We just want a unique recordID prefix, which we get from the wet file name suffix: 
    10561    // inFile name looks something like MAORI-CC-2019-30-20190902100139-000000.warc.wet 
     
    11268    fileID = fileID.substring(0, fileID.indexOf(".")); 
    11369    this.WETFileID = fileID; 
    114  
     70    } 
     71 
     72    public int processWETFile() { 
     73    File keepURLsFile = this.batchProcessor.keepURLsFile; 
     74    File discardURLsFile = this.batchProcessor.discardURLsFile;  
     75     
     76    StringBuilder record = null; 
     77    String line = null;  
     78    boolean readingRecord = false; 
     79 
     80    String WARCtargetURI = ""; 
     81     
     82    //int recordCount = 0; 
     83     
     84    int contentLength = -1; // of record 
     85    int lineCount = -1; // actual number of non-empty lines in record body (i.e. excludes WET/WARC headers) 
    11586         
    11687    // read from WETfile 
    11788    try ( 
    118          BufferedReader reader = new BufferedReader(new FileReader(inFile)); 
     89         BufferedReader reader = new BufferedReader(new FileReader(this.inFile)); 
    11990         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true)); 
    12091         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append 
     
    185156        ioe.printStackTrace(); 
    186157    } 
     158 
     159    return recordCount; 
    187160    } 
     161 
     162    public int getRecordCount() { return this.recordCount; } 
    188163 
    189164    /** 
     
    217192        // don't want a "translated" product site/online store 
    218193        // These curiously often tend to have "product(s)" in the URL 
    219         parentFolder = WETProcessor.discardFolder; 
     194        parentFolder = batchProcessor.discardFolder; 
    220195    } 
    221196 
    222197    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 
    223         parentFolder = WETProcessor.keepFolder; 
     198        parentFolder = batchProcessor.keepFolder; 
    224199        System.err.println("@@@KEEPING"); 
    225200    } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { 
     
    231206        // So we have at least 500 chars (possibly on a single wrapped line) 
    232207        // containing at least 10 spaces. Such a record is also worth keeping. 
    233         parentFolder = WETProcessor.keepFolder; 
     208        parentFolder = batchProcessor.keepFolder; 
    234209        } 
    235210    } 
    236211    */ 
    237212 
    238     if(isInDiscardFilter(recordURI)) { 
    239         parentFolder = WETProcessor.discardFolder; 
    240     } 
    241     else if(isInCheckFilter(recordURI)) { // products sites 
    242         parentFolder = WETProcessor.discardFolder; // TODO: checkfolder 
     213    if(batchProcessor.isBlacklisted(recordURI)) { 
     214        parentFolder = batchProcessor.discardFolder; 
     215    } 
     216    else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites 
     217        parentFolder = batchProcessor.discardFolder; // TODO: checkfolder 
    243218    } else { 
    244219        // If a web page's WET record contains a certain minimum number of words, 
     
    266241        // In Maori, word length of 1 is not uncommon 
    267242        // but let's skip camelcased words when counting valid words 
    268         else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++; 
     243        else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++; 
    269244        } 
    270245 
    271246        // dump if too many camelcase words (ideally keep none of that kind?) 
    272         if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) { 
    273         parentFolder = WETProcessor.discardFolder; 
     247        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 
     248        parentFolder = batchProcessor.discardFolder; 
    274249        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 
    275250        } 
    276         else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
    277         parentFolder = WETProcessor.keepFolder; 
     251        else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
     252        parentFolder = batchProcessor.keepFolder; 
    278253        System.err.println("@@@KEEPING"); 
    279254        } 
     
    281256    // if parentFolder still not set, set to discard pile folder 
    282257    if(parentFolder == null) { 
    283         parentFolder = WETProcessor.discardFolder; 
     258        parentFolder = batchProcessor.discardFolder; 
    284259        System.err.println("@@@DISCARDING"); 
    285260    } 
    286261 
    287262    try { 
    288         if (parentFolder == WETProcessor.keepFolder) { 
     263        if (parentFolder == batchProcessor.keepFolder) { 
    289264        keepURLsWriter.write(recordURI + "\n"); 
    290265        } else { 
     
    310285    } 
    311286    } 
    312  
    313  
    314     /** 
    315      * Takes as input the keepURLs.txt file generated by running WETProcessor instances. 
    316      * As output produces the URL seed list and regex-urlfilter text files required by nutch, 
    317      * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 
    318      */ 
    319     public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) { 
    320     // Maintain Sets of unique domains and urls 
    321     // TreeSet: by default, "the elements are ordered using their natural ordering" 
    322     // (or by a Comparator provided at set creation time). 
    323     // Whereas HashSet doesn't guarantee ordering. 
    324     // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. 
    325  
    326     Set<String> domainsSet = new TreeSet<String>(); 
    327     Set<String> urlsSet = new TreeSet<String>(); 
    328  
    329     final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* 
    330      
    331     try ( 
    332          BufferedReader reader = new BufferedReader(new FileReader(urlsFile)); 
    333          ) { 
    334          
    335         // read a URL at a time from urlsFile 
    336         String url = null; 
    337         String domain = null; 
    338         while((url = reader.readLine()) != null) { // readLine removes newline separator 
    339          
    340         // work out domain. This retains any www. or subdomain prefix: 
    341         int startIndex = url.indexOf("//"); // http:// or https:// prefix 
    342         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
    343         domain = url.substring(startIndex); 
    344         int endIndex = domain.indexOf("/"); 
    345         if(endIndex == -1) endIndex = domain.length(); 
    346         domain = domain.substring(0, endIndex); 
    347  
    348         //if(!domainsMap.containsKey(domain)) { 
    349         urlsSet.add(url); 
    350         domainsSet.add(domain); 
    351         //} 
    352         } 
    353     } catch (IOException ioe) { 
    354         ioe.printStackTrace(); 
    355         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile); 
    356     } 
    357      
    358     try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 
    359         Iterator<String> i = urlsSet.iterator(); 
    360         while(i.hasNext()) { 
    361         String url = i.next(); 
    362         seedURLsWriter.write(url + "\n"); 
    363         } 
    364          
    365     } catch (IOException ioe) { 
    366         ioe.printStackTrace(); 
    367         System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile); 
    368     } 
    369  
    370     try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) { 
    371         Iterator<String> i = domainsSet.iterator(); 
    372         // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
    373         while(i.hasNext()) { 
    374         String domain = i.next(); 
    375         domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";         
    376         urlFilterWriter.write(domain + "\n"); 
    377         } 
    378          
    379     } catch (IOException ioe) { 
    380         ioe.printStackTrace(); 
    381         System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 
    382     } 
    383     } 
    384  
    385     /** 
    386      * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide 
    387      * whether it is in the discard list. 
    388      * Filters don't represent actual regex, just ^ and $ as start and end terminators. 
    389      * By not having this method deal with actual regex for filters, this has the advantage that 
    390      * we don't have to remember to escape or double escape each filter to turn it into a regex. 
    391      */ 
    392     public boolean isInDiscardFilter(String url) { 
    393  
    394     String discardFilterFile = "url-discard-filter.txt"; // in conf folder 
    395  
    396     try ( 
    397          BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8")); 
    398          ) { 
    399         String filter = null; 
    400         while((filter = reader.readLine()) != null) { 
    401         if(filter.trim().equals("")) { 
    402             continue; 
    403         } 
    404         //System.err.println("Got filter: " + filter); 
    405         if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) { 
    406             System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter); 
    407         } 
    408         else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) { 
    409             System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter); 
    410             return true; 
    411         } 
    412         else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) { 
    413             System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter); 
    414             return true; 
    415         } 
    416         else if(url.contains(filter)) { 
    417             System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter); 
    418             return true; 
    419         } 
    420                  
    421         } 
    422          
    423     } catch (IOException ioe) { 
    424         ioe.printStackTrace(); 
    425         System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile); 
    426     } 
    427  
    428     return false; 
    429     } 
    430  
    431     // TODO 
    432     public boolean isInCheckFilter(String url) { 
    433     //System.err.println("isInCheckFilter(url) is not yet implemented");     
    434     return false; 
    435     } 
    436      
    437     //public static int getRecordCount() { return recordCount; } 
    438      
    439     public static void printUsage() { 
    440     System.err.println("Run this program as:"); 
    441     System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");    
    442     } 
    443  
    444     /** Filename filter to only list warc.wet files or else warc.wet.gz files 
    445      * for which unzipped warc.wet equivalents don't yet exist. 
    446      */ 
    447     private static class WETFilenameFilter implements FilenameFilter { 
    448      
    449     public boolean accept(File dir, String name) { 
    450         if(name.endsWith(".warc.wet")) { 
    451         logger.debug("Will include " + name + " for processing."); 
    452         return true; 
    453         } 
    454  
    455         if(name.endsWith(".warc.wet.gz")) { 
    456         String nameWithoutGZext = name.substring(0, name.lastIndexOf(".gz")); 
    457         File unzippedVersion = new File(dir, nameWithoutGZext); 
    458         if(unzippedVersion.exists()) { 
    459             logger.debug("--- Unzipped version " + unzippedVersion + " exists."); 
    460             logger.debug("Skipping " + name); 
    461             return false; // don't count gzipped version if unzipped version exists. 
    462         } 
    463         else { 
    464             logger.debug("Only zipped version " + name + " exists."); 
    465             return true; // No unzipped version, so have to work with gzipped version 
    466         } 
    467         } 
    468  
    469         // we're not even interested in any other file extensions 
    470         logger.debug("Not a WET file. Skipping " + name); 
    471         return false; 
    472     } 
    473     } 
    474      
    475     public static void main(String[] args) { 
    476     if(args.length != 2) { 
    477         printUsage(); 
    478         return; 
    479     } 
    480  
    481      
    482     File WETFileDir = new File(args[0]); 
    483     if(!WETFileDir.exists() || !WETFileDir.isDirectory()) { 
    484         System.out.println("Error: " + args[0] + " does not exist or is not a directory"); 
    485         return; 
    486     } 
    487      
    488     File outFolder = new File(args[1]); 
    489     if(!outFolder.exists() || !outFolder.isDirectory()) { 
    490         System.out.println("Error: " + args[1] + " does not exist or is not a directory."); 
    491         return; 
    492     }    
    493  
    494     // static folders and files to be shared across all WETProcessor instances 
    495     WETProcessor.discardFolder = new File(outFolder, "discard"); 
    496     if(!WETProcessor.discardFolder.exists()) { 
    497         WETProcessor.discardFolder.mkdir(); 
    498     }    
    499     WETProcessor.keepFolder = new File(outFolder, "keep"); 
    500     if(!WETProcessor.keepFolder.exists()) { 
    501         WETProcessor.keepFolder.mkdir(); 
    502     } 
    503  
    504     WETProcessor.keepURLsFile = new File(outFolder, "keepURLs.txt");     
    505     if(WETProcessor.keepURLsFile.exists() && !WETProcessor.keepURLsFile.delete()) { 
    506         System.err.println("Unable to delete " + WETProcessor.keepURLsFile + ". Unable to proceed."); 
    507         return; 
    508     } 
    509     WETProcessor.discardURLsFile = new File(outFolder, "discardURLs.txt"); 
    510     if(WETProcessor.discardURLsFile.exists() && !WETProcessor.discardURLsFile.delete()) { 
    511         System.err.println("Unable to delete " + WETProcessor.discardURLsFile + ". Unable to proceed."); 
    512         return; 
    513     } 
    514      
    515     // Will list all the warc.wet files in the input directory or else their gzipped versions 
    516     File[] WETFiles = WETFileDir.listFiles(new WETFilenameFilter()); 
    517  
    518     for(int i = 0; i < WETFiles.length; i++) { 
    519         File WETFile = WETFiles[i];      
    520         logger.debug("Processing WETfile: " + WETFile); 
    521  
    522         // Any .gz files listed means they haven't been unzipped yet. So unzip. 
    523         String WETFilename = WETFile.toString(); 
    524         if(WETFilename.endsWith(".gz")) { 
    525         File GZippedWETFile = WETFile; 
    526         String WETGZippedFilename = WETFilename; 
    527         WETFilename = WETFilename.substring(0, WETFilename.lastIndexOf(".gz"));      
    528  
    529         WETFile = new File(WETFilename); 
    530         Utility.unzipFile(GZippedWETFile, WETFile); 
    531         } 
    532         // hereafter all WETFiles should refer to the unzipped version 
    533         // Check the unzipped WETFile exists         
    534  
    535         if(!WETFile.exists() || !WETFile.isFile()) { 
    536         System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)"); 
    537         logger.error("Error: " + WETFile + " does not exist (failure to unzip?)"); 
    538         return; 
    539         } 
    540  
    541         // Finally, we can process this WETFile's records into the keep and discard pile 
    542         logger.debug("Off to process " + WETFile); 
    543         WETProcessor processor = new WETProcessor(WETFile, outFolder);       
    544          
    545     } 
    546  
    547     File seedURLsFile = new File(outFolder, "seedURLs.txt"); 
    548     File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 
    549     WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile); 
    550      
    551     return;  
    552     } 
    553287}