Changeset 33497


Ignore:
Timestamp:
2019-09-22T21:17:48+12:00 (5 years ago)
Author:
ak19
Message:

First version of discard url filter file. Inefficient implementation. Better to read the file once, adjust the filters as required and keep in memory to do the comparisons with each URL.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33488 r33497  
    212212   
    213213
     214    /*
    214215    if(recordURI.contains("product") && !recordURI.contains("production")) {
    215216
     
    218219        parentFolder = WETProcessor.discardFolder;
    219220    }
    220     /*
     221
    221222    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
    222223        parentFolder = WETProcessor.keepFolder;
     
    235236    */
    236237
    237     else {
     238    if(isInDiscardFilter(recordURI)) {
     239        parentFolder = WETProcessor.discardFolder;
     240    }
     241    else if(isInCheckFilter(recordURI)) { // products sites
     242        parentFolder = WETProcessor.discardFolder; // TODO: checkfolder
     243    } else {
    238244        // If a web page's WET record contains a certain minimum number of words,
    239245        // we will think it's a meaningful web page and has sufficient content for text analysis
     
    359365    } catch (IOException ioe) {
    360366        ioe.printStackTrace();
    361         System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile);
     367        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
    362368    }
    363369
     
    373379    } catch (IOException ioe) {
    374380        ioe.printStackTrace();
    375         System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile);
    376     }
    377     }
    378 
     381        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
     382    }
     383    }
     384
     385    /**
     386     * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide
     387     * whether it is in the discard list.
     388     * Filters don't represent actual regex, just ^ and $ as start and end terminators.
     389     * By not having this method deal with actual regex for filters, this has the advantage that
     390     * we don't have to remember to escape or double escape each filter to turn it into a regex.
     391     */
     392    public boolean isInDiscardFilter(String url) {
     393
     394    String discardFilterFile = "url-discard-filter.txt"; // in conf folder
     395
     396    try (
     397         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8"));
     398         ) {
     399        String filter = null;
     400        while((filter = reader.readLine()) != null) {
     401        if(filter.trim().equals("")) {
     402            continue;
     403        }
     404        //System.err.println("Got filter: " + filter);
     405        if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) {
     406            System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter);
     407        }
     408        else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) {
     409            System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter);
     410            return true;
     411        }
     412        else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) {
     413            System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter);
     414            return true;
     415        }
     416        else if(url.contains(filter)) {
     417            System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter);
     418            return true;
     419        }
     420               
     421        }
     422       
     423    } catch (IOException ioe) {
     424        ioe.printStackTrace();
     425        System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile);
     426    }
     427
     428    return false;
     429    }
     430
     431    // TODO
     432    public boolean isInCheckFilter(String url) {
     433    //System.err.println("isInCheckFilter(url) is not yet implemented");   
     434    return false;
     435    }
    379436   
    380437    //public static int getRecordCount() { return recordCount; }
Note: See TracChangeset for help on using the changeset viewer.