- Timestamp:
- 2019-09-22T21:17:48+12:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33488 r33497 212 212 213 213 214 /* 214 215 if(recordURI.contains("product") && !recordURI.contains("production")) { 215 216 … … 218 219 parentFolder = WETProcessor.discardFolder; 219 220 } 220 /* 221 221 222 else if(lineCount >= MIN_LINE_COUNT && contentLength >= MIN_CONTENT_LENGTH) { 222 223 parentFolder = WETProcessor.keepFolder; … … 235 236 */ 236 237 237 else { 238 if(isInDiscardFilter(recordURI)) { 239 parentFolder = WETProcessor.discardFolder; 240 } 241 else if(isInCheckFilter(recordURI)) { // products sites 242 parentFolder = WETProcessor.discardFolder; // TODO: checkfolder 243 } else { 238 244 // If a web page's WET record contains a certain minimum number of words, 239 245 // we will think it's a meaningful web page and has sufficient content for text analysis … … 359 365 } catch (IOException ioe) { 360 366 ioe.printStackTrace(); 361 System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile);367 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile); 362 368 } 363 369 … … 373 379 } catch (IOException ioe) { 374 380 ioe.printStackTrace(); 375 System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile); 376 } 377 } 378 381 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 382 } 383 } 384 385 /** 386 * Checks URL parameter against each line ("filter") of conf/url-discard-filter.txt to decide 387 * whether it is in the discard list. 388 * Filters don't represent actual regex, just ^ and $ as start and end terminators. 389 * By not having this method deal with actual regex for filters, this has the advantage that 390 * we don't have to remember to escape or double escape each filter to turn it into a regex. 391 */ 392 public boolean isInDiscardFilter(String url) { 393 394 String discardFilterFile = "url-discard-filter.txt"; // in conf folder 395 396 try ( 397 BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.WETProcessor.class.getClassLoader().getResourceAsStream(discardFilterFile), "UTF-8")); 398 ) { 399 String filter = null; 400 while((filter = reader.readLine()) != null) { 401 if(filter.trim().equals("")) { 402 continue; 403 } 404 //System.err.println("Got filter: " + filter); 405 if(filter.startsWith("^") && filter.endsWith("$") && url.equals(filter.substring(1, filter.length()-1))) { 406 System.err.println("*** Discarding url " + url + "\n\tas it MATCHES filter " + filter); 407 } 408 else if(filter.startsWith("^") && url.startsWith(filter.substring(1))) { 409 System.err.println("*** Discarding url " + url + "\n\tas it STARTS WITH filter " + filter); 410 return true; 411 } 412 else if(filter.endsWith("$") && url.endsWith(filter.substring(0, filter.length()-1))) { 413 System.err.println("*** Discarding url " + url + "\n\tas it ENDS WITH filter " + filter); 414 return true; 415 } 416 else if(url.contains(filter)) { 417 System.err.println("*** Discarding url " + url + "\n\tas it CONTAINS filter " + filter); 418 return true; 419 } 420 421 } 422 423 } catch (IOException ioe) { 424 ioe.printStackTrace(); 425 System.err.println("\n@@@@@@@@@ Error reading from " + discardFilterFile); 426 } 427 428 return false; 429 } 430 431 // TODO 432 public boolean isInCheckFilter(String url) { 433 //System.err.println("isInCheckFilter(url) is not yet implemented"); 434 return false; 435 } 379 436 380 437 //public static int getRecordCount() { return recordCount; }
Note:
See TracChangeset
for help on using the changeset viewer.