- Timestamp:
- 2019-10-11T20:49:05+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33560 r33561 69 69 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY"; 70 70 public final String SINGLEPAGE = "SINGLEPAGE"; 71 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE"; 71 72 72 73 /** … … 74 75 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions 75 76 * https://www.regular-expressions.info/refcharacters.html 76 */77 //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];78 // put the \\ at start so we don't the escape character for chars escaped earlier77 * Put the \\ (escape char) at start so we don't double-escape chars already escaped, 78 * as would happen for any chars appearing earlier in this list than \\ 79 */ 79 80 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|"; 81 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"]; 80 82 81 83 private Properties configProperties = new Properties(); … … 212 214 } 213 215 214 int tabindex = str.indexOf("\t"); 215 if(tabindex == -1) { 216 // comma separated list of values 217 int splitindex = str.indexOf(","); 218 if(splitindex == -1) { 216 219 topSitesMap.put(str, ""); 217 220 } else { 218 String topsite = str.substring(0, tabindex).trim();219 String allowed_url_pattern = str.substring( tabindex+1).trim();221 String topsite = str.substring(0, splitindex).trim(); 222 String allowed_url_pattern = str.substring(splitindex+1).trim(); 220 223 topSitesMap.put(topsite, allowed_url_pattern); 221 224 } … … 352 355 while(domainIterator.hasNext()) { 353 356 String domainWithProtocol = domainIterator.next(); 357 // Also get domain without protocol prefix 354 358 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 355 359 startIndex = (startIndex == -1) ? 
0 : (startIndex+2); // skip past the protocol's // portion 356 360 String domain = domainWithProtocol.substring(startIndex); 357 358 System.err.println("domain with protocol: " + domainWithProtocol); 359 System.err.println("domain: " + domain); 361 362 /*if(domain.contains("docs.google.com")) { 363 System.err.println("domain with protocol: " + domainWithProtocol); 364 System.err.println("domain: " + domain); 365 }*/ 360 366 361 367 String allowedURLPatternRegex = isURLinTopSitesMap(domain); … … 372 378 373 379 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 374 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 375 while(urlIterator.hasNext()) { 376 String url = urlIterator.next(); 380 for(String url : urlsForDomainSet) { 377 381 topSiteMatchesWriter.write("\t" + url + "\n"); 378 382 } 379 383 380 384 continue; // done with this domain 381 385 } … … 451 455 siteRegexWriter.write(regexed_url + "\n"); 452 456 } 457 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) { 458 459 // DON'T write out domain into siteURLs file, 460 // BUT DO write it into urlFilter file 461 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/"; 462 463 urlFilterWriter.write(regexed_domain + "\n"); 464 siteRegexWriter.write(regexed_domain + "\n"); 453 465 } else { // allowedURLPatternRegex is a url-form - convert to regex 454 466 if(!allowedURLPatternRegex.endsWith("/")) { … … 467 479 // also write into the global seeds file (with a tab prefixed to each?) 468 480 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 469 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 470 while(urlIterator.hasNext()) { 471 String url = urlIterator.next(); 481 for(String url : urlsForDomainSet) { 472 482 seedURLsWriter.write(url + "\n"); // global seedURLs file 473 483 siteURLsWriter.write(url + "\n"); 474 484 } 485 475 486 } catch (IOException ioe) { 476 487 ioe.printStackTrace();
Note: See TracChangeset for help on using the changeset viewer.