Changeset 33517


Ignore:
Timestamp:
2019-09-24T20:30:40+12:00 (5 years ago)
Author:
ak19
Message:
  1. Now that blacklists have been introduced, too many instances of camelcased words need no longer disqualify WET records from inclusion in the keep pile. Camelcasing of words is still checked, since such words are not counted as valid words in the valid word count that determines whether there is sufficient content in a WET record. 2. Some more commenting.
Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33515 r33517  
    7777    public final File greyListedFile;
    7878
     79    /** Possible values stored in the blackList/whiteList/greyList Maps */
    7980    private final Integer LIST_ENTRY_CONTAINS = new Integer(0);
    8081    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1);
    8182    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2);
    8283    private final Integer LIST_ENTRY_MATCHES = new Integer(3);
    83    
     84
     85    /**
     86     * Store url patterns as keys and values indicating whether a url should
     87     * match it exactly, start/end with it, or contain it
     88     */
    8489    private HashMap<String, Integer> blackList;
    8590    private HashMap<String, Integer> greyList;
    8691    private HashMap<String, Integer> whiteList;
    8792
     93    /** Map of domains we keep and the full urls we're keeping that are of that domain.
     94     * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys,
     95     * since a HashMap has no notion of ordering.
     96     */
     97    private TreeMap<String, TreeSet<String>> domainsToURLsMap;
     98   
    8899    // Keep a count of all the records that all WETProcessors instantiated
    89100    // by our main method combined have processed
     
    144155    }
    145156
     157    // prepare our blacklist, greylist (for inspection) and whitelist
    146158    System.err.println("Loading blacklist.");
    147159    blackList = new HashMap<String, Integer>();
    148160    initURLFilterList(blackList, "url-blacklist-filter.txt");
     161   
    149162    System.err.println("Loading greylist.");
    150163    greyList = new HashMap<String, Integer>();
    151164    initURLFilterList(greyList, "url-greylist-filter.txt");
     165   
    152166    System.err.println("Loading whitelist.");
    153167    whiteList = new HashMap<String, Integer>();
     
    159173   
    160174    /**
    161      * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
    162      * As output produces the URL seed list and regex-urlfilter text files required by nutch,
     175     * Using the keepURLs.txt file generated by running WETProcessor instances, produces
     176     * as output the URL seed list and regex-urlfilter text files required by nutch, see
    163177     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
    164178     */
     
    448462    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    449463    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile);
     464
     465    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
     466   
    450467    } catch(Exception e) {
    451468    // can get an exception when instantiating CCWETProcessor instance
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33503 r33517  
    188188    File parentFolder = null;
    189189
    190     // want to match "product(s)" but not "production"
    191     //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) {
    192    
    193 
    194     /*
    195     if(recordURI.contains("product") && !recordURI.contains("production")) {
    196 
    197         // don't want a "translated" product site/online store
    198         // These curiously often tend to have "product(s)" in the URL
    199         parentFolder = batchProcessor.discardFolder;
    200     }
    201 
    202     else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) {
    203         parentFolder = batchProcessor.keepFolder;
    204         System.err.println("@@@KEEPING");
    205     } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) {
    206         int countSpaces = 0;
    207         for(int i = 0; i < record.length(); i++) {
    208         if(record.charAt(i) == ' ') countSpaces++;
    209         }
    210         if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) {
    211         // So we have at least 500 chars (possibly on a single wrapped line)
    212         // containing at least 10 spaces. Such a record is also worth keeping.
    213         parentFolder = batchProcessor.keepFolder;
    214         }
    215     }
    216     */
    217    
     190
    218191    if(batchProcessor.isBlacklisted(recordURI)) {
    219 
    220192       
    221193        // explicit whitelisting overrides blacklisting
     
    223195        parentFolder = batchProcessor.keepFolder; //tentative
    224196        }
    225         // if not whitelisted, then greylisting overrides blacklisting
     197        // if not whitelisted, then greylisting still overrides blacklisting
    226198        else if(batchProcessor.isGreylisted(recordURI)) {
    227199        parentFolder = batchProcessor.greyListedFolder;
    228200        System.err.println("@@@GREYLISTED");
    229201        }
    230         else { // only blacklisted
     202        else { // url was only blacklisted
    231203        parentFolder = batchProcessor.discardFolder;
    232204        System.err.println("@@@DISCARDING - blacklisted");
     
    247219    // it still can't be in the keep list as it needs further inspection:
    248220    // it needs sufficient content for language analysis.
     221    // We don't care about the combination of number of lines and content-length,
     222    // we just care about the number of "valid words" as defined by us.
    249223    if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null
    250224       
     
    267241
    268242        // throw away if n words contain camelcase, which is another case of words glued together
     243        // For now, we'll only skip camelcased words in our count of valid words
    269244        if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) {
    270245            numCamelCaseWords++;           
    271         }
    272        
     246        }       
    273247        // In Maori, word length of 1 is not uncommon
    274248        // but let's skip camelcased words when counting valid words
    275         else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++;
    276         }
    277 
    278         // dump if too many camelcase words (ideally keep none of that kind?)
     249        else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) {
     250            validWordCount++;
     251        }
     252        }
     253
     254       
     255        /*
     256        // dump if too many camelcase words (ideally keep no WET record of that kind?)
    279257        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) {
    280258        parentFolder = batchProcessor.discardFolder;
    281259        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS");
    282260        }
    283         else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
     261        else*/
     262        // For now, don't discount content with too many camelcased words
     263        // Just focus on whether there are a sufficient number of valid words
     264        // (camelcased words are however still ignored in our count of valid words)
     265        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words
    284266        parentFolder = batchProcessor.keepFolder;
    285267        System.err.println("@@@KEEPING");
Note: See TracChangeset for help on using the changeset viewer.