Ignore:
Timestamp:
2019-09-23T23:16:28+12:00 (5 years ago)
Author:
ak19
Message:

More efficient blacklisting/greylisting/whitelisting now by reading in the lists only once and then comparing each URL to each list. Explicit whitelisting has precedence over greylisting, which in turn takes precedence over blacklisting. Then any remaining URLs are checked for having sufficient content. The code that checks for sufficient content still needs some more adjusting.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33501 r33503  
    1212
    1313/**
    14  * The main() method of this class takes a folder of warc.wet(.gz) files and goes through
    15  * the WET records in each, putting each WET record into a file. Each file is put into a
    16  * keep or discard folder, based on content-length and number of lines.
    17  * A single instance of the WETProcessor class processes a single unzipped warc.wet file.
     14 * A single instance of the WETProcessor class can process a single unzipped warc.wet file.
     15 * A WETProcessor takes a warc.wet file and goes through all its WET records,
     16 * putting each WET record into a file. Each file is put into a keep, discard or greylisted folder
     17 * and its url is written into a keep, discard or greylisted text file, based on:
    1818 *
    19  * To compile, including the jars in lib/ for compiling.
    20  *      maori-lang-detection/src$ javac -cp ".:../lib/*" org/greenstone/atea/WETProcessor.java
     19 * 1. whether it's whitelisted, else greylisted else blacklisted
     20 * 2. and if explicitly whitelisted or else not greylisted or blacklisted and there's
     21 * enough content. Formerly, content-length and number of lines were used to determine if
     22 * the content was sufficient. Now it's just word count and number of MAX characters
     23 * (not MINIMUM characters) that determine a string is a word.
     24 * Explicit whitelisting has precedence over greylisting, which takes precedence
     25 * over blacklisting in turn.
     26 * However, even explicitly whitelisted urls still need to have sufficient content to end
     27 * up in keepURLs.txt.
    2128 *
    22  * To run, passing the log4j and other properties files in conf/ folder:
    23  *      maori-lang-detection/src$ java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor <folder containing warc.wet(.gz) files> <outputFolder>
    24  *
    25  * e.g.
    26  *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET
    27  *    - java -cp ".:../conf:../lib/*" org.greenstone.atea.WETProcessor ../tmp/processWET /Scratch/ak19/gs3-extensions/maori-lang-detection/tmp/processedWET 2>&1 | less
     29 * See CCWETProcessor.java for compile instructions and how to run.
    2830 *
    2931*/
     
    7274    public int processWETFile() {
    7375    File keepURLsFile = this.batchProcessor.keepURLsFile;
    74     File discardURLsFile = this.batchProcessor.discardURLsFile;
     76    File discardURLsFile = this.batchProcessor.discardURLsFile;
     77    File greyListedFile = this.batchProcessor.greyListedFile;
    7578   
    7679    StringBuilder record = null;
     
    9093         BufferedWriter keepURLsWriter = new BufferedWriter(new FileWriter(keepURLsFile, true));
    9194         BufferedWriter discardURLsWriter = new BufferedWriter(new FileWriter(discardURLsFile, true)); // true to append
     95         BufferedWriter greyListedURLsWriter = new BufferedWriter(new FileWriter(greyListedFile, true)); // true to append
    9296         ) {
    9397       
     
    104108            // process any previous record
    105109            if(record != null) {
    106             processWETrecord(keepURLsWriter, discardURLsWriter,
     110            processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
    107111                     recordCount, contentLength, lineCount,
    108112                     WARCtargetURI, record.toString());
     
    147151        // flush the last record. If it was a warcinfo record, record would be null here
    148152        if(record != null) {
    149         processWETrecord(keepURLsWriter, discardURLsWriter,
     153        processWETrecord(keepURLsWriter, discardURLsWriter, greyListedURLsWriter,
    150154                 recordCount, contentLength, lineCount,
    151155                 WARCtargetURI, record.toString());
     
    169173     */
    170174    private void processWETrecord(BufferedWriter keepURLsWriter, BufferedWriter discardURLsWriter,
     175                  BufferedWriter greyListedURLsWriter,
    171176                  int recordID, int contentLength, int lineCount,
    172177                  String recordURI, String record)
     
    210215    }
    211216    */
    212 
     217   
    213218    if(batchProcessor.isBlacklisted(recordURI)) {
    214         parentFolder = batchProcessor.discardFolder;
     219
     220       
     221        // explicit whitelisting overrides blacklisting
     222        if(batchProcessor.isWhitelisted(recordURI)) {
     223        parentFolder = batchProcessor.keepFolder; //tentative
     224        }
     225        // if not whitelisted, then greylisting overrides blacklisting
     226        else if(batchProcessor.isGreylisted(recordURI)) {
     227        parentFolder = batchProcessor.greyListedFolder;
     228        System.err.println("@@@GREYLISTED");
     229        }
     230        else { // only blacklisted
     231        parentFolder = batchProcessor.discardFolder;
     232        System.err.println("@@@DISCARDING - blacklisted");
     233        }
    215234    }
    216235    else if(batchProcessor.isGreylisted(recordURI)) { // e.g. products sites
    217         parentFolder = batchProcessor.discardFolder; // TODO: checkfolder
    218     } else {
     236        // explicit whitelisting overrides greylisting
     237        if(batchProcessor.isWhitelisted(recordURI)) {
     238        parentFolder = batchProcessor.keepFolder; // tentative
     239        }
     240        else {
     241        parentFolder = batchProcessor.greyListedFolder;
     242        System.err.println("@@@GREYLISTED");
     243        }
     244    }
     245
     246    // If URL was not blacklisted/greylisted, or was even explicitly whitelisted,
     247    // it still can't be in the keep list as it needs further inspection:
     248    // it needs sufficient content for language analysis.
     249    if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null
     250       
    219251        // If a web page's WET record contains a certain minimum number of words,
    220252        // we will think it's a meaningful web page and has sufficient content for text analysis
     
    254286        }
    255287    }
    256     // if parentFolder still not set, set to discard pile folder
     288    // if parentFolder still not set, it means that the content length/num words or lines
     289    // were insufficient, so meant to be discarded
    257290    if(parentFolder == null) {
    258291        parentFolder = batchProcessor.discardFolder;
     
    263296        if (parentFolder == batchProcessor.keepFolder) {
    264297        keepURLsWriter.write(recordURI + "\n");
     298        } else if (parentFolder == batchProcessor.greyListedFolder) {
     299        greyListedURLsWriter.write(recordURI + "\n");
    265300        } else {
    266301        discardURLsWriter.write(recordURI + "\n");
Note: See TracChangeset for help on using the changeset viewer.