Changeset 33517

Show
Ignore:
Timestamp:
24.09.2019 20:30:40 (3 weeks ago)
Author:
ak19
Message:

1. Now that blacklists have been introduced, too many instances of camelcased words need no longer disqualify WET records from inclusion in the keep pile. Camelcasing of words is still checked, however, because camelcased words are not counted as valid words in the valid word count that determines whether there's sufficient content in a WET record. 2. Some more commenting.

Location:
gs3-extensions/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33515 r33517  
    7777    public final File greyListedFile; 
    7878 
     79    /** Possible values stored in the blackList/whiteList/greyList Maps */ 
    7980    private final Integer LIST_ENTRY_CONTAINS = new Integer(0); 
    8081    private final Integer LIST_ENTRY_STARTSWITH = new Integer(1); 
    8182    private final Integer LIST_ENTRY_ENDSWITH = new Integer(2); 
    8283    private final Integer LIST_ENTRY_MATCHES = new Integer(3); 
    83      
     84 
     85    /** 
     86     * Store url patterns as keys and values indicating whether a url should 
     87     * match it exactly, start/end with it, or contain it 
     88     */ 
    8489    private HashMap<String, Integer> blackList; 
    8590    private HashMap<String, Integer> greyList; 
    8691    private HashMap<String, Integer> whiteList; 
    8792 
     93    /** Map of domains we keep and the full urls we're keeping that are of that domain.  
     94     * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys, 
     95     * since a HashMap has no notion of ordering. 
     96     */ 
     97    private TreeMap<String, TreeSet<String>> domainsToURLsMap; 
     98     
    8899    // Keep a count of all the records that all WETProcessors instantiated 
    89100    // by our main method combined have processed 
     
    144155    } 
    145156 
     157    // prepare our blacklist, greylist (for inspection) and whitelist 
    146158    System.err.println("Loading blacklist."); 
    147159    blackList = new HashMap<String, Integer>(); 
    148160    initURLFilterList(blackList, "url-blacklist-filter.txt"); 
     161     
    149162    System.err.println("Loading greylist."); 
    150163    greyList = new HashMap<String, Integer>(); 
    151164    initURLFilterList(greyList, "url-greylist-filter.txt"); 
     165     
    152166    System.err.println("Loading whitelist."); 
    153167    whiteList = new HashMap<String, Integer>(); 
     
    159173     
    160174    /** 
    161      * Takes as input the keepURLs.txt file generated by running WETProcessor instances. 
    162      * As output produces the URL seed list and regex-urlfilter text files required by nutch, 
     175     * Using the keepURLs.txt file generated by running WETProcessor instances, produces 
     176     * as output the URL seed list and regex-urlfilter text files required by nutch, see 
    163177     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 
    164178     */ 
     
    448462    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 
    449463    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 
     464 
     465    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
     466     
    450467    } catch(Exception e) { 
    451468    // can get an exception when instantiating CCWETProcessor instance 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33503 r33517  
    188188    File parentFolder = null; 
    189189 
    190     // want to match "product(s)" but not "production" 
    191     //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) { 
    192      
    193  
    194     /* 
    195     if(recordURI.contains("product") && !recordURI.contains("production")) { 
    196  
    197         // don't want a "translated" product site/online store 
    198         // These curiously often tend to have "product(s)" in the URL 
    199         parentFolder = batchProcessor.discardFolder; 
    200     } 
    201  
    202     else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 
    203         parentFolder = batchProcessor.keepFolder; 
    204         System.err.println("@@@KEEPING"); 
    205     } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { 
    206         int countSpaces = 0; 
    207         for(int i = 0; i < record.length(); i++) { 
    208         if(record.charAt(i) == ' ') countSpaces++; 
    209         } 
    210         if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) { 
    211         // So we have at least 500 chars (possibly on a single wrapped line) 
    212         // containing at least 10 spaces. Such a record is also worth keeping. 
    213         parentFolder = batchProcessor.keepFolder; 
    214         } 
    215     } 
    216     */ 
    217      
     190 
    218191    if(batchProcessor.isBlacklisted(recordURI)) { 
    219  
    220192         
    221193        // explicit whitelisting overrides blacklisting 
     
    223195        parentFolder = batchProcessor.keepFolder; //tentative 
    224196        } 
    225         // if not whitelisted, then greylisting overrides blacklisting 
     197        // if not whitelisted, then greylisting still overrides blacklisting 
    226198        else if(batchProcessor.isGreylisted(recordURI)) { 
    227199        parentFolder = batchProcessor.greyListedFolder; 
    228200        System.err.println("@@@GREYLISTED"); 
    229201        } 
    230         else { // only blacklisted 
     202        else { // url was only blacklisted 
    231203        parentFolder = batchProcessor.discardFolder; 
    232204        System.err.println("@@@DISCARDING - blacklisted"); 
     
    247219    // it still can't be in the keep list as it needs further inspection: 
    248220    // it needs sufficient content for language analysis. 
     221    // We don't care about the combination of number of lines and content-length, 
     222    // we just care about the number of "valid words" as defined by us. 
    249223    if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null 
    250224         
     
    267241 
    268242        // throw away if n words contain camelcase, which is another case of words glued together 
     243        // For now, we'll only skip camelcased words in our count of valid words 
    269244        if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) { 
    270245            numCamelCaseWords++;             
    271         } 
    272          
     246        }        
    273247        // In Maori, word length of 1 is not uncommon 
    274248        // but let's skip camelcased words when counting valid words 
    275         else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++; 
    276         } 
    277  
    278         // dump if too many camelcase words (ideally keep none of that kind?) 
     249        else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) { 
     250            validWordCount++; 
     251        } 
     252        } 
     253 
     254         
     255        /* 
     256        // dump if too many camelcase words (ideally keep no WET record of that kind?) 
    279257        if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 
    280258        parentFolder = batchProcessor.discardFolder; 
    281259        System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 
    282260        } 
    283         else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
     261        else*/ 
     262        // For now, don't discount content with too many camelcased words 
     263        // Just focus on whether there are a sufficient number of valid words 
     264        // (camelcased words are however still ignored in our count of valid words) 
     265        if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 
    284266        parentFolder = batchProcessor.keepFolder; 
    285267        System.err.println("@@@KEEPING");