- Timestamp:
- 2019-09-16T19:45:01+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33471 r33480 31 31 // In Java, can initialize static final variables inside a static block 32 32 // But the unavoidable try/catch in this static block prevents initialization of 33 // the static final int variables further below that therefore need to be declared34 // and initialized thereafter.33 // the static final int variables (seen further below) inside the block itself, 34 // that therefore need to be declared and initialized thereafter. 35 35 static { 36 37 36 // load up the properties from the config file 38 37 try (InputStream infile = org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) { … … 50 49 // Providing fall-back cutoff values if config.properties doesn't load 51 50 // or doesn't have the named props. But what happens when Integer.parseInt throws an exception? 51 /* 52 52 private static final int MIN_CONTENT_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length", "100")); 53 53 private static final int MIN_LINE_COUNT= Integer.parseInt(configProperties.getProperty("WETprocessor.min.line.count", "2")); 54 54 private static final int MIN_CONTENT_LENGTH_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.content.length.wrapped.line", "500")); 55 55 private static final int MIN_SPACES_IN_A_WRAPPED_LINE = Integer.parseInt(configProperties.getProperty("WETprocessor.min.spaces.per.wrapped.line", "10")); 56 56 */ 57 private static final int MAX_WORD_LENGTH = Integer.parseInt(configProperties.getProperty("WETprocessor.max.word.length", "15")); // to identify and skip web pages where content consists of words glued together (with no spaces) 58 private static final int MIN_NUM_WORDS = Integer.parseInt(configProperties.getProperty("WETprocessor.min.num.words", "20")); 59 private static final int MAX_WORDS_CAMELCASE = Integer.parseInt(configProperties.getProperty("WETprocessor.max.words.camelcase", "10")); 60 57 61 // File paths shared across WETProcessor 
instances 58 62 private static File discardFolder; … … 202 206 203 207 // want to match "product(s)" but not "production" 204 205 208 //if(recordURI.matches(".*/?product[^a-rt-z].*")) {//if(recordURI.matches(".*/?products?/?.*")) { 209 210 206 211 if(recordURI.contains("product") && !recordURI.contains("production")) { 207 212 … … 209 214 // These curiously often tend to have "product(s)" in the URL 210 215 parentFolder = WETProcessor.discardFolder; 211 } 216 } 217 /* 212 218 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 213 219 parentFolder = WETProcessor.keepFolder; … … 224 230 } 225 231 } 226 232 */ 233 234 else { 235 // If a web page's WET record contains a certain minimum number of words, 236 // we will think it's a meaningful web page and has sufficient content for text analysis 237 // to have been successful. Cut off values at present are: 238 // - a minimum of 20 words 239 // - a word consists of 1 to 15 chars demarcated by spaces. Any more chars may point 240 // to words having been glued together. This is used by irrelevant sites and moreover 241 // can't be analysed for language, so may not be actually MRI. 
242 243 // Though StringTokenizer still in use, as seen in discussion at 244 // https://stackoverflow.com/questions/6983856/why-is-stringtokenizer-deprecated 245 // String.split(regex) seems better for splitting on general whitespace 246 String[] allWords = record.split("\\s"); 247 int validWordCount = 0; 248 int numCamelCaseWords = 0; 249 for(int i = 0; i < allWords.length; i++) { 250 String word = allWords[i]; 251 252 // throw away if n words contain camelcase, which is another case of words glued together 253 if(word.matches(".*[a-z][A-Z].*") && word.length() >= 5) { 254 numCamelCaseWords++; 255 } 256 257 // In Maori, word length of 1 is not uncommon 258 // but let's skip camelcased words when counting valid words 259 else if(word.length() >= 1 && word.length() <= MAX_WORD_LENGTH) validWordCount++; 260 } 261 262 // dump if too many camelcase words (ideally keep none of that kind?) 263 if(numCamelCaseWords >= MAX_WORDS_CAMELCASE) { 264 parentFolder = WETProcessor.discardFolder; 265 System.err.println("@@@DISCARDING - CAMELCASED CONTENTS"); 266 } 267 else if(validWordCount >= MIN_NUM_WORDS) { // otherwise, keep anything with a sufficient number of valid words 268 parentFolder = WETProcessor.keepFolder; 269 System.err.println("@@@KEEPING"); 270 } 271 } 227 272 // if parentFolder still not set, set to discard pile folder 228 273 if(parentFolder == null) {
Note:
See TracChangeset
for help on using the changeset viewer.