Changeset 33503 for gs3-extensions


Ignore:
Timestamp:
2019-09-23T23:16:28+12:00 (5 years ago)
Author:
ak19
Message:

More efficient blacklisting/greylisting/whitelisting now by reading in the lists only once and then comparing each URL to each list. Explicit whitelisting has precedence over greylisting, which in turn takes precedence over blacklisting. Then any remaining urls are checked for having sufficient content. The code that checks for sufficient content still needs some more adjusting.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33501 r33503  
    66import java.util.zip.GZIPInputStream;
    77import java.util.Iterator;
     8import java.util.HashMap;
     9import java.util.Map;
    810import java.util.Set;
    911import java.util.TreeSet;
     
    1416 * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
    1517 * the WET records in each, putting each WET record into a file. Each file is put into a
    16  * keep or discard folder, based on content-length and number of lines.
    17  * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
     18 * keep or discard or greyListed folder, and its url is written into a keep, discard
     19 * or greylisted text file, based on
     20 *
     21 * 1. whether it's whitelisted, else greylisted else blacklisted
     22 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
     23 * enough content. Formerly, content-length and number of lines were used to determine if
     24 * the content was sufficient. Now it's just word count and number of MAX characters
     25 * (not MINIMUM characters) that determine a string is a word. These settings can be adjusted
     26 * in conf/config.properties.
     27 *
     28 * Put a url-blacklist-filter.txt and/or url-greylist-filter.txt and/or url-whitelist-filter.txt
     29 * into the conf folder to control any url patterns that are explicitly included or excluded or
     30 * set aside for inspecting later. These filter text files don't use regexes, instead their
     31 * format is:
     32 * - precede URL by ^ to match urls that start with the given prefix
     33 * - follow URL by $ to match urls that end with the given suffix
     34 * - ^url$ will match only urls that are exactly equal to the given url
     35 * - Without either ^ or $ symbol, any url containing the given url will match
     36 *
     37 * WETProcessor.java's current implementation is that explicit whitelisting has precedence
     38 * over greylisting, which in turn takes precedence over blacklisting. However, even
     39 * explicitly whitelisted urls still need to have sufficient content to end up in keepURLs.txt
     40 * and in the seedURLs.txt file used for nutch, along with its domain in regex-urlfilter.txt
     41 * also for nutch.
     42 *
     43 * A CCWETProcessor instance can be configured to process all the .warc.wet(.gz) files
     44 * in the given input folder. Then use a single instance of the WETProcessor class to process
     45 * each single unzipped warc.wet file.
    1846 *
    1947 * To compile, including the jars in lib/ for compiling.
     
    4472    public final File discardFolder;
    4573    public final File keepFolder;
     74    public final File greyListedFolder;
    4675    public final File keepURLsFile;
    4776    public final File discardURLsFile;
     77    public final File greyListedFile;
     78
     79    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
     80    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
     81    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
     82    private final Integer LIST_ENTRY_MATCHES = new Integer(3);
     83   
     84    private HashMap<String, Integer> blackList;
     85    private HashMap<String, Integer> greyList;
     86    private HashMap<String, Integer> whiteList;
    4887
    4988    // Keep a count of all the records that all WETProcessors instantiated
     
    5392    private int wetFileCount = 0;
    5493   
    55     public CCWETProcessor(File inFolder, File outFolder) {
     94    public CCWETProcessor(File inFolder, File outFolder) throws Exception {
    5695    this.WETFilesDir = inFolder;
    5796    this.outputFolder = outFolder;
    5897
    5998    // load up the properties from the config file
    60     try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
     99    try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
    61100        configProperties = new Properties();
    62101        configProperties.load(infile);     
    63         //infile.close();
     102        //infile.close(); // not explicitly called in examples of try-with-resources
    64103       
    65104    } catch(Exception e) {
     
    86125        keepFolder.mkdir();
    87126    }
    88 
     127   
     128    this.greyListedFolder = new File(outFolder, "greylisted");
     129    if(!greyListedFolder.exists()) {
     130        greyListedFolder.mkdir();
     131    }
     132   
    89133    this.keepURLsFile = new File(outFolder, "keepURLs.txt");   
    90134    if(keepURLsFile.exists() && !keepURLsFile.delete()) {
    91         System.err.println("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
    92         //return;
     135        throw new Exception("Warning: Unable to delete " + this.keepURLsFile + ". Unable to proceed.");
    93136    }
    94137    this.discardURLsFile = new File(outFolder, "discardURLs.txt");
    95138    if(discardURLsFile.exists() && !discardURLsFile.delete()) {
    96         System.err.println("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
    97         //return;
    98     }
    99    
     139        throw new Exception ("Warning Unable to delete " + discardURLsFile + ". Unable to proceed.");
     140    }
     141    this.greyListedFile = new File(outFolder, "greyListed.txt");
     142    if(greyListedFile.exists() && !greyListedFile.delete()) {
     143        throw new Exception ("Warning Unable to delete " + greyListedFile + ". Unable to proceed.");
     144    }
     145
     146    System.err.println("Loading blacklist.");
     147    blackList = new HashMap<String, Integer>();
     148    initURLFilterList(blackList, "url-blacklist-filter.txt");
     149    System.err.println("Loading greylist.");
     150    greyList = new HashMap<String, Integer>();
     151    initURLFilterList(greyList, "url-greylist-filter.txt");
     152    System.err.println("Loading whitelist.");
     153    whiteList = new HashMap<String, Integer>();
     154    initURLFilterList(whiteList, "url-whitelist-filter.txt");
     155
     156    //System.err.println("Prematurely terminating for testing purposes.");
     157    //System.exit(-1);
    100158    }
    101159   
     
    171229    }
    172230
    173     /*
     231    private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
     232    Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
     233    Iterator<Map.Entry<String, Integer>> i = entries.iterator();
     234    while(i.hasNext()) {
     235        Map.Entry<String, Integer> entry = i.next();
     236        String urlPattern = entry.getKey();
     237        Integer matchRule = entry.getValue();
     238
     239        if(matchRule == LIST_ENTRY_CONTAINS && url.contains(urlPattern)) {
     240        return true;
     241        }
     242        else if(matchRule == LIST_ENTRY_STARTSWITH && url.startsWith(urlPattern)) {
     243        return true;
     244        }
     245        else if(matchRule == LIST_ENTRY_ENDSWITH && url.endsWith(urlPattern)) {
     246        return true;
     247        }
     248        else if(matchRule == LIST_ENTRY_MATCHES && url.equals(urlPattern)) {
     249        return true;
     250        }
     251        // else check the rest of the filter list against this url
     252        // before returning false to be certain it's not been listed in the filter list
     253    }
     254   
     255    return false;
     256    }
     257
     258    /**
     259     * Returns true if the url or pattern is found in the blacklist file.
     260     * Note that if eventually the same url pattern is found in the greylist or whitelist too,
     261     * it won't get blacklisted after all. But that's not implemented here.
     262     */
    174263    public boolean isBlacklisted(String url) {
    175     return false;
    176     }
    177     */
    178 
     264    return isListedInFilterList(blackList, url);
     265    }
     266   
     267    /**
     268     * Returns true if the url or pattern is explicitly mentioned in the greylist file.
     269     * Will eventually take precedence over the blacklist if the same URL pattern was also mentioned there.
     270     * Will eventually be pre-empted into the whitelist if mentioned in the whitelist.
     271     */
    179272    public boolean isGreylisted(String url) {
    180     // alexa top sites and auto-translated product sites
    181     return false;
     273    // TODO: alexa top sites and auto-translated product sites
     274    return isListedInFilterList(greyList, url);
     275    }
     276
     277    /**
     278     * Returns true if the url or pattern is explicitly mentioned in the whitelist file
     279     * Its mention in a whitelist moreover overrides any mention in the blacklist and greylist.
     280     */
     281    public boolean isWhitelisted(String url) {
     282    return isListedInFilterList(whiteList, url);
    182283    }
    183284   
     
    189290     * we don't have to remember to escape or double escape each filter to turn it into a regex.
    190291     */
    191     //public boolean isInDiscardFilter(String url) {
    192 
    193     public boolean isBlacklisted(String url) { 
     292    public void initURLFilterList(Map<String, Integer> list, String filterListFilename) {
     293   
     294    // if filterListFilename does not exist in the conf folder, just return
     295    if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
     296        System.err.println(filterListFilename + " does not exist");
     297        return;     
     298    }
     299
     300    try (
     301         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
     302         ) {
     303        String filter = null;
     304        while((filter = reader.readLine()) != null) {
     305        // skip comments and empty lines
     306        filter = filter.trim();
     307        if(filter.equals("") || filter.startsWith("#")) {
     308            continue;
     309        }
     310       
     311        if(filter.startsWith("^") && filter.endsWith("$")) {
     312            filter = filter.substring(1, filter.length()-1);
     313            list.put(filter, LIST_ENTRY_MATCHES);
     314        }
     315        else if(filter.startsWith("^")) {
     316            filter = filter.substring(1);
     317            list.put(filter, LIST_ENTRY_STARTSWITH);
     318            System.err.println("Match filter startswith: " + filter);
     319        }
     320        else if(filter.endsWith("$")) {
     321            filter = filter.substring(0, filter.length()-1);
     322            list.put(filter, LIST_ENTRY_ENDSWITH);
     323        }
     324        else {
     325            list.put(filter, LIST_ENTRY_CONTAINS);
     326        }
     327        //System.err.println("Got filter: " + filter);
     328        }
     329       
     330    } catch (IOException ioe) {
     331        ioe.printStackTrace();
     332        System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
     333    }
     334   
     335    }
     336    /*
     337    public boolean isInDiscardFilter(String url) {
    194338    String discardFilterFile = "url-discard-filter.txt"; // in conf folder
    195339
    196340    try (
    197          BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
     341         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
    198342         ) {
    199343        String filter = null;
     
    227371
    228372    return false;
    229     }
     373    }*/
    230374
    231375    /** Maintain a count of all WET files processed. */
     
    293437    }   
    294438
     439    try {
    295440    CCWETProcessor ccWETFilesProcessor = new CCWETProcessor(WETFileDir, outFolder);
    296441
     
    341486    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    342487    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
     488    } catch(Exception e) {
     489    // can get an exception when instantiating CCWETProcessor instance
     490    e.printStackTrace();
     491    System.err.println(e.getMessage());
     492    }
    343493   
    344494    return;
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33501 r33503  
    1212
    1313/**
    14  * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
    15  * the WET records in each, putting each WET record into a file. Each file is put into a
    16  * keep or discard folder, based on content-length and number of lines.
    17  * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
     14 * A single instance of the WETProcessor class can process a single unzipped warc.wet file.
     15 * A WETProcessor takes a warc.wet file and goes through all its WET records,
     16 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
     17 * and its url is written into a keep, discard or greylisted text file, based on:
    1818 *
    19  * To compile, including the jars in lib/ for compiling.
    20  *      maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
     19 * 1. whether it's whitelisted, else greylisted else blacklisted
     20 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
     21 * enough content. Formerly, content-length and number of lines were used to determine if
     22 * the content was sufficient. Now it's just word count and number of MAX characters
     23 * (not MINIMUM characters) that determine a string is a word.
     24 * Explicit whitelisting has precedence over greylisting, which in turn takes precedence
     25 * over blacklisting.
     26 * However, even explicitly whitelisted urls still need to have sufficient content to end
     27 * up in keepURLs.txt.
    2128 *
    22  * To run, passing the log4j and other properties files in conf/ folder:
    23  *      maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor &lt;folder containing warc.wet(.gz) files&gt; &lt;outputFolder&gt;
    24  *
    25  * e.g.
    26  *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
    27  *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2&gt;&amp;1 | less
     29 * See CCWETProcessor.java for compile instructions and how to run.
    2830 *
    2931*/
     
    7274    public int processWETFile() {
    7375    File keepURLsFile = this.batchProcessor.keepURLsFile;
    74     File discardURLsFile = this.batchProcessor.discardURLsFile;
     76    File discardURLsFile = this.batchProcessor.discardURLsFile;
     77    File greyListedFile = this.batchProcessor.greyListedFile;
    7578   
    7679    StringBuilder record = null;
     
    9093         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
    9194         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
     95         BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
    9296         ) {
    9397       
     
    104108            // process any previous record
    105109            if(record != null) {
    106             processWETrecord(keepURLsWriter, discardURLsWriter,
     110            processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
    107111                     recordCount, contentLength, lineCount,
    108112                     WARCtargetURI, record.toString());
     
    147151        // flush the last record. If it was a warcinfo record, record would be null here
    148152        if(record != null) {
    149         processWETrecord(keepURLsWriter, discardURLsWriter,
     153        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
    150154                 recordCount, contentLength, lineCount,
    151155                 WARCtargetURI, record.toString());
     
    169173     */
    170174    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
     175                  BufferedWriter greyListedURLsWriter,
    171176                  int recordID, int contentLength, int lineCount,
    172177                  String recordURI, String record)
     
    210215    }
    211216    */
    212 
     217   
    213218    if(batchProcessor.isBlacklisted(recordURI)) {
    214         parentFolder = batchProcessor.discardFolder;
     219
     220       
     221        // explicit whitelisting overrides blacklisting
     222        if(batchProcessor.isWhitelisted(recordURI)) {
     223        parentFolder = batchProcessor.keepFolder; //tentative
     224        }
     225        // if not whitelisted, then greylisting overrides blacklisting
     226        else if(batchProcessor.isGreylisted(recordURI)) {
     227        parentFolder = batchProcessor.greyListedFolder;
     228        System.err.println("@@@GREYLISTED");
     229        }
     230        else { // only blacklisted
     231        parentFolder = batchProcessor.discardFolder;
     232        System.err.println("@@@DISCARDING - blacklisted");
     233        }
    215234    }
    216235    else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
    217         parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
    218     } else {
     236        // explicit whitelisting overrides greylisting
     237        if(batchProcessor.isWhitelisted(recordURI)) {
     238        parentFolder = batchProcessor.keepFolder; // tentative
     239        }
     240        else {
     241        parentFolder = batchProcessor.greyListedFolder;
     242        System.err.println("@@@GREYLISTED");
     243        }
     244    }
     245
     246    // If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
     247    // it still can't be in the keep list as it needs further inspection:
     248    // it needs sufficient content for language analysis.
     249    if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null
     250       
    219251        // If a web page's WET record contains a certain minimum number of words,
    220252        // we will think it's a meaningful web page and has sufficient content for text analysis
     
    254286        }
    255287    }
    256     // if parentFolder still not set, set to discard pile folder
     288    // if parentFolder still not set, it means that the content length/num words or lines
     289    // were insufficient, so meant to be discarded
    257290    if(parentFolder == null) {
    258291        parentFolder = batchProcessor.discardFolder;
     
    263296        if (parentFolder == batchProcessor.keepFolder) {
    264297        keepURLsWriter.write(recordURI + "\n");
     298        } else if (parentFolder == batchProcessor.greyListedFolder) {
     299        greyListedURLsWriter.write(recordURI + "\n");
    265300        } else {
    266301        discardURLsWriter.write(recordURI + "\n");
Note: See TracChangeset for help on using the changeset viewer.