Changeset 33517 for gs3-extensions
- Timestamp:
- 2019-09-24T20:30:40+12:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src/org/greenstone/atea
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33515 r33517 77 77 public final File greyListedFile; 78 78 79 /** Possible values stored in the blackList/whiteList/greyList Maps */ 79 80 private final Integer LIST_ENTRY_CONTAINS = new Integer(0); 80 81 private final Integer LIST_ENTRY_STARTSWITH = new Integer(1); 81 82 private final Integer LIST_ENTRY_ENDSWITH = new Integer(2); 82 83 private final Integer LIST_ENTRY_MATCHES = new Integer(3); 83 84 85 /** 86 * Store url patterns as keys and values indicated whether a url should 87 * match it exactly, start/end with it, or contain it 88 */ 84 89 private HashMap<String, Integer> blackList; 85 90 private HashMap<String, Integer> greyList; 86 91 private HashMap<String, Integer> whiteList; 87 92 93 /** Map of domains we keep and the full urls we're keeping that are of that domain. 94 * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys, 95 * since a HashMap has no notion of ordering. 96 */ 97 private TreeMap<String, TreeSet<String>> domainsToURLsMap; 98 88 99 // Keep a count of all the records that all WETProcessors instantiated 89 100 // by our main method combined have processed … … 144 155 } 145 156 157 // prepare our blacklist, greylist (for inspection) and whitelist 146 158 System.err.println("Loading blacklist."); 147 159 blackList = new HashMap<String, Integer>(); 148 160 initURLFilterList(blackList, "url-blacklist-filter.txt"); 161 149 162 System.err.println("Loading greylist."); 150 163 greyList = new HashMap<String, Integer>(); 151 164 initURLFilterList(greyList, "url-greylist-filter.txt"); 165 152 166 System.err.println("Loading whitelist."); 153 167 whiteList = new HashMap<String, Integer>(); … … 159 173 160 174 /** 161 * Takes as input the keepURLs.txt file generated by running WETProcessor instances.162 * As output produces the URL seed list and regex-urlfilter text files required by nutch,175 * Using the keepURLs.txt file generated by running WETProcessor instances, produces 176 * as output the URL seed list and regex-urlfilter text 
files required by nutch, see 163 177 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 164 178 */ … … 448 462 File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 449 463 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile); 464 465 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 466 450 467 } catch(Exception e) { 451 468 // can get an exception when instantiating CCWETProcessor instance -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33503 r33517 188 188 File parentFolder = null; 189 189 190 // want to match "product(s)" but not "production" 191 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) { 192 193 194 /* 195 if(recordURI.contains("product") && !recordURI.contains("production")) { 196 197 // don't want a "translated" product site/online store 198 // These curiously often tend to have "product(s)" in the URL 199 parentFolder = batchProcessor.discardFolder; 200 } 201 202 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 203 parentFolder = batchProcessor.keepFolder; 204 System.err.println("@@@KEEPING"); 205 } else if(contentLength >= MIN_CONTENT_LENGTH_WRAPPED_LINE) { 206 int countSpaces = 0; 207 for(int i = 0; i < record.length(); i++) { 208 if(record.charAt(i) == ' ') countSpaces++; 209 } 210 if(countSpaces >= MIN_SPACES_IN_A_WRAPPED_LINE) { 211 // So we have at least 500 chars (possibly on a single wrapped line) 212 // containing at least 10 spaces. Such a record is also worth keeping. 213 parentFolder = batchProcessor.keepFolder; 214 } 215 } 216 */ 217 190 218 191 if(batchProcessor.isBlacklisted(recordURI)) { 219 220 192 221 193 // explicit whitelisting overrides blacklisting … … 223 195 parentFolder = batchProcessor.keepFolder; //tentative 224 196 } 225 // if not whitelisted, then greylisting overrides blacklisting197 // if not whitelisted, then greylisting still overrides blacklisting 226 198 else if(batchProcessor.isGreylisted(recordURI)) { 227 199 parentFolder = batchProcessor.greyListedFolder; 228 200 System.err.println("@@@GREYLISTED"); 229 201 } 230 else { // only blacklisted202 else { // url was only blacklisted 231 203 parentFolder = batchProcessor.discardFolder; 232 204 System.err.println("@@@DISCARDING - blacklisted"); … … 247 219 // it still can't be in the keep list as it needs further inspection: 248 220 // it needs sufficient content for language analysis. 
221 // We don't care about the combination of number of lines and content-length, 222 // we just care about the number of "valid words" as defined by us. 249 223 if(parentFolder != batchProcessor.greyListedFolder && parentFolder != batchProcessor.discardFolder) { // i.e. parentFolder == keepFolder if whiteListed || parentFolder == null 250 224 … … 267 241 268 242 // throw away if n words contain camelcase, which is another case of words glued together 243 // For now, we'll only skip camelcased words in our count of valid words 269 244 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) { 270 245 numCamelCaseWords++; 271 } 272 246 } 273 247 // In Maori, word length of 1 is not uncommon 274 248 // but let's skip camelcased words when counting valid words 275 else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) validWordCount++; 276 } 277 278 // dump if too many camelcase words (ideally keep none of that kind?) 249 else if(word.length() >= 1 && word.length() <= batchProcessor.MAX_WORD_LENGTH) { 250 validWordCount++; 251 } 252 } 253 254 255 /* 256 // dump if too many camelcase words (ideally keep no WET record of that kind?) 279 257 if(numCamelCaseWords >= batchProcessor.MAX_WORDS_CAMELCASE) { 280 258 parentFolder = batchProcessor.discardFolder; 281 259 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 282 260 } 283 else if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 261 else*/ 262 // For now, don't discount content with too many camelcased words 263 // Just focus on whether there are a sufficient number of valid words 264 // (camelcased words are however still ignored in our count of valid words) 265 if(validWordCount >= batchProcessor.MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 284 266 parentFolder = batchProcessor.keepFolder; 285 267 System.err.println("@@@KEEPING");
Note: See TracChangeset for help on using the changeset viewer.