Changeset 33497

Show
Ignore:
Timestamp:
22.09.2019 21:17:48 (4 weeks ago)
Author:
ak19
Message:

First version of the discard URL filter file. The implementation is inefficient: it re-reads the filter file for every URL. It would be better to read the file once, adjust the filters as required, and keep them in memory for comparison against each URL.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33488 r33497  
    212212     
    213213 
     214    /* 
    214215    if(recordURI.contains("product") && !recordURI.contains("production")) { 
    215216 
     
    218219        parentFolder = WETProcessor.discardFolder; 
    219220    } 
    220     /* 
     221 
    221222    else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 
    222223        parentFolder = WETProcessor.keepFolder; 
     
    235236    */ 
    236237 
    237     else { 
     238    if(isInDiscardFilter(recordURI)) { 
     239        parentFolder = WETProcessor.discardFolder; 
     240    } 
     241    else if(isInCheckFilter(recordURI)) { // products sites 
     242        parentFolder = WETProcessor.discardFolder; // TODO: checkfolder 
     243    } else { 
    238244        // If a web page's WET record contains a certain minimum number of words, 
    239245        // we will think it's a meaningful web page and has sufficient content for text analysis 
     
    359365    } catch (IOException ioe) { 
    360366        ioe.printStackTrace(); 
    361         System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile); 
     367        System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile); 
    362368    } 
    363369 
     
    373379    } catch (IOException ioe) { 
    374380        ioe.printStackTrace(); 
    375         System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile); 
    376     } 
    377     } 
    378  
     381        System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 
     382    } 
     383    } 
     384 
     385    /** 
     386     * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide 
     387     * whether it is in the discard list. 
     388     * Filters don't represent actual regex, just ^ and $ as start and end terminators. 
     389     * By not having this method deal with actual regex for filters, this has the advantage that 
     390     * we don't have to remember to escape or double escape each filter to turn it into a regex. 
     391     */ 
     392    public boolean isInDiscardFilter(String url) { 
     393 
     394    String discardFilterFile = "url-discard-filter.txt"; // in conf folder 
     395 
     396    try ( 
     397         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8")); 
     398         ) { 
     399        String filter = null; 
     400        while((filter = reader.readLine()) != null) { 
     401        if(filter.trim().equals("")) { 
     402            continue; 
     403        } 
     404        //System.err.println("Got filter: " + filter); 
     405        if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) { 
     406            System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter); 
     407        } 
     408        else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) { 
     409            System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter); 
     410            return true; 
     411        } 
     412        else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) { 
     413            System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter); 
     414            return true; 
     415        } 
     416        else if(url.contains(filter)) { 
     417            System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter); 
     418            return true; 
     419        } 
     420                 
     421        } 
     422         
     423    } catch (IOException ioe) { 
     424        ioe.printStackTrace(); 
     425        System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile); 
     426    } 
     427 
     428    return false; 
     429    } 
     430 
     431    // TODO 
     432    public boolean isInCheckFilter(String url) { 
     433    //System.err.println("isInCheckFilter(url) is not yet implemented");     
     434    return false; 
     435    } 
    379436     
    380437    //public static int getRecordCount() { return recordCount; }