- Timestamp:
- 2019-10-10T23:49:58+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33557 r33560 65 65 public final int MIN_NUM_WORDS; 66 66 public final int MAX_WORDS_CAMELCASE; 67 68 // constants for the possible fixed values in sites-too-big-to-exhaustively-crawl.txt file 69 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY"; 70 public final String SINGLEPAGE = "SINGLEPAGE"; 71 72 /** 73 * Characters that need escaping if used as a string literal in a regex 74 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions 75 * https://www.regular-expressions.info/refcharacters.html 76 */ 77 //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"]; 78 // put the \\ at start so we don't re-escape the escape character added for chars escaped earlier 79 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|"; 67 80 68 81 private Properties configProperties = new Properties(); … … 220 233 * This retains any www. or subdomain prefix. 221 234 */ 222 private String getDomainForURL(String url ) {223 int startIndex = url.indexOf("//"); //http:// or https:// prefix235 private String getDomainForURL(String url, boolean withProtocol) { 236 int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix 224 237 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 238 // then keep the protocol prefix of the URL around in case param withProtocol=true 239 String protocol = (startIndex == -1) ? 
"" : url.substring(0, startIndex); 240 225 241 String domain = url.substring(startIndex); 226 242 int endIndex = domain.indexOf("/"); 227 243 if(endIndex == -1) endIndex = domain.length(); 228 244 domain = domain.substring(0, endIndex); 245 246 if(withProtocol) { 247 // now that we have the domain (everything to the first / when there is no protocol) 248 // can glue the protocol back on 249 domain = protocol + domain; 250 } 229 251 230 252 return domain; 253 } 254 255 /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */ 256 private String escapeStringForRegex(String str) { 257 for(int i = 0; i < ESCAPE_CHARS_FOR_RE.length(); i++) { 258 char c = ESCAPE_CHARS_FOR_RE.charAt(i); 259 str = str.replace(Character.toString(c), "\\"+c); 260 } 261 return str; 231 262 } 232 263 … … 238 269 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 239 270 File domainURLsFile, File topSiteMatchesFile) { 240 // Maintain Sets of unique domains and urls271 // Maintain a Map of unique domains mapped to seed urls at that domain 241 272 // TreeSet: by default, "the elements are ordered using their natural ordering" 242 273 // (or by a Comparator provided at set creation time). 243 274 // Whereas HashSet doesn't guarantee ordering. 244 275 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. 245 246 //Set<String> domainsSet = new TreeSet<String>(); 247 //Set<String> urlsSet = new TreeSet<String>(); 276 // Would be a similar distinction for Maps. 
248 277 domainsToURLsMap = new TreeMap<String, Set<String>>(); 249 250 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 278 279 final String PROTOCOL_REGEX_PREFIX = "+^https?://"; 280 final String FILTER_REGEX_PREFIX = PROTOCOL_REGEX_PREFIX + "([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* for nutch's regex-urlfilter.txt 251 281 252 282 try ( … … 256 286 // read a URL at a time from urlsFile 257 287 String url = null; 258 String domain = null;288 String domainWithProtocol = null; 259 289 while((url = reader.readLine()) != null) { // readLine removes newline separator 260 290 261 // work out domain. This retains any www. or subdomain prefix 262 domain = getDomainForURL(url); 263 264 //urlsSet.add(url); 265 //domainsSet.add(domain); 291 // work out domain. This retains any www. or subdomain prefix 292 // passing true to further also retain the http(s) protocol 293 domainWithProtocol = getDomainForURL(url, true); 294 266 295 Set<String> urlsSet; 267 if(!domainsToURLsMap.containsKey(domain )) {296 if(!domainsToURLsMap.containsKey(domainWithProtocol)) { 268 297 urlsSet = new TreeSet<String>(); 269 298 urlsSet.add(url); 270 domainsToURLsMap.put(domain , urlsSet);299 domainsToURLsMap.put(domainWithProtocol, urlsSet); 271 300 } else { 272 urlsSet = domainsToURLsMap.get(domain );301 urlsSet = domainsToURLsMap.get(domainWithProtocol); 273 302 urlsSet.add(url); 274 303 } … … 322 351 323 352 while(domainIterator.hasNext()) { 324 String domain = domainIterator.next(); 353 String domainWithProtocol = domainIterator.next(); 354 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 355 startIndex = (startIndex == -1) ? 
0 : (startIndex+2); // skip past the protocol's // portion 356 String domain = domainWithProtocol.substring(startIndex); 357 358 System.err.println("domain with protocol: " + domainWithProtocol); 359 System.err.println("domain: " + domain); 325 360 326 361 String allowedURLPatternRegex = isURLinTopSitesMap(domain); … … 330 365 // domain and seedURLs to the topSiteMatchesFile. 331 366 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) { 367 332 368 // topsite, but we don't (yet) know what portion can be crawled 333 369 // Append the top site and url to a global/toplevel file that … … 335 371 // won't go into any other file hereafter 336 372 337 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain );373 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 338 374 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 339 375 while(urlIterator.hasNext()) { … … 353 389 354 390 // write out the domain 355 //seedURLsWriter.write(domain + "\n");391 //seedURLsWriter.write(domainWithProtocol + "\n"); 356 392 357 393 … … 367 403 368 404 // write all sorted unique domains into global domains file 405 // Using the domain without protocol since the global domains file is for 406 // informational purposes 369 407 domainURLsWriter.write(domain + "\n"); 370 408 … … 373 411 // files (and write regexed domain into each sites/0000#/regex-urlfilter.txt) 374 412 // If we ever run nutch on a single seedURLs listing containing 375 // all seed pages to crawl sites from, the above two files will work for that. 376 413 // all seed pages to crawl sites from, the above two files will work for that. 377 414 378 415 if(allowedURLPatternRegex == null) { // entire site can be crawled 379 siteURLsWriter.write(domain + "\n");416 siteURLsWriter.write(domainWithProtocol + "\n"); 380 417 381 418 // Write out filter in the following form for a site, e.g. 
for nutch.apache.org: 382 419 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 383 String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 420 String regexed_domain = FILTER_REGEX_PREFIX + escapeStringForRegex(domain) + "/"; 421 //String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 384 422 urlFilterWriter.write(regexed_domain + "\n"); //global file 385 423 siteRegexWriter.write(regexed_domain + "\n"); // site file … … 387 425 else { // domain belongs to a top site where only portion of site can be crawled 388 426 389 if(allowedURLPatternRegex.equals( "COPY")) { // COPY existing domain as url-filter390 siteURLsWriter.write(domain + "\n");427 if(allowedURLPatternRegex.equals(SUBDOMAIN_COPY)) { // COPY existing domain as url-filter 428 siteURLsWriter.write(domainWithProtocol + "\n"); 391 429 // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com 392 430 // and not for all of blogspot.com 393 431 394 String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/"; 395 urlFilterWriter.write(regexed_domain + "\n"); 396 siteRegexWriter.write(regexed_domain + "\n"); 397 398 } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) { 432 String regexed_domain = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(domain) + "/"; 433 //String regexed_domain = PROTOCOL_REGEX_PREFIX+domain.replace(".", "\\.") + "/"; 434 urlFilterWriter.write(regexed_domain + "\n"); 435 siteRegexWriter.write(regexed_domain + "\n"); 436 437 } else if(allowedURLPatternRegex.equals(SINGLEPAGE)) { 399 438 // don't write out domain. 
We want individual pages 400 //DON'T DO : siteURLsWriter.write(domain+ "\n");439 //DON'T DO THIS HERE: siteURLsWriter.write(domainWithProtocol + "\n"); 401 440 402 // don't write out domain as a regex expression url filter 441 // don't write out domain as a regex expression url filter either, 403 442 // write out the individual seed urls for the domain instead 404 443 // since we will only be downloading the single page 405 444 406 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 407 for(String urlInDomain : urlsForDomainSet) { 408 String regexed_url = "+^"+urlInDomain.replace(".", "\\."); 445 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 446 for(String urlInDomain : urlsForDomainSet) { 447 // don't append slash to end this time 448 String regexed_url = "+^"+escapeStringForRegex(urlInDomain); 449 //String regexed_url = "+^"+urlInDomain.replace(".", "\\."); 409 450 urlFilterWriter.write(regexed_url + "\n"); 410 451 siteRegexWriter.write(regexed_url + "\n"); 411 452 } 412 453 } else { // allowedURLPatternRegex is a url-form - convert to regex 413 String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\."); 414 siteURLsWriter.write(domain + "\n"); 454 if(!allowedURLPatternRegex.endsWith("/")) { 455 allowedURLPatternRegex += "/"; 456 } 457 String regexed_pattern = PROTOCOL_REGEX_PREFIX+escapeStringForRegex(allowedURLPatternRegex); 458 //String regexed_pattern = PROTOCOL_REGEX_PREFIX+allowedURLPatternRegex.replace(".", "\\."); 459 siteURLsWriter.write(domainWithProtocol + "\n"); 415 460 urlFilterWriter.write(regexed_pattern + "\n"); 416 461 siteRegexWriter.write(regexed_pattern + "\n"); … … 420 465 421 466 // next write out the urls for the domain into the sites/0000x/seedURLs.txt file 422 // also write into the global seeds file 423 // (with a tab prefixed to each url?) 424 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 467 // also write into the global seeds file (with a tab prefixed to each?) 
468 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 425 469 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 426 470 while(urlIterator.hasNext()) { 427 471 String url = urlIterator.next(); 428 seedURLsWriter.write( "\t" +url + "\n"); // global seedURLs file429 siteURLsWriter.write( "\t" +url + "\n");472 seedURLsWriter.write(url + "\n"); // global seedURLs file 473 siteURLsWriter.write(url + "\n"); 430 474 } 431 475 } catch (IOException ioe) { 432 476 ioe.printStackTrace(); 433 System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);477 System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile); 434 478 } 435 479 … … 438 482 } catch (IOException ioe) { 439 483 ioe.printStackTrace(); 440 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile + " or " + urlFilterFile); 441 } 442 443 // write out domains as regular expressions into "regex-urlfilter.txt" file 444 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) { 445 Set<String> domainsSet = domainsToURLsMap.keySet(); 446 Iterator<String> i = domainsSet.iterator(); 447 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 448 while(i.hasNext()) { 449 String domain = i.next(); 450 domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/"; 451 urlFilterWriter.write(domain + "\n"); 452 } 453 454 } catch (IOException ioe) { 455 ioe.printStackTrace(); 456 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 457 } 484 System.err.println("\n@@@@@@@@@ Error writing to one of: "); 485 System.err.println("\t" + seedURLsFile); 486 System.err.println("\t" + urlFilterFile); 487 System.err.println("\t" + domainURLsFile); 488 System.err.println("\t" + topSiteMatchesFile); 489 } 458 490 459 491 /* … … 475 507 return url; 476 508 } 477 509 510 511 /** 512 * @return true when a seedURL's domain exactly matches a topsite such as 
blogspot.com, 513 * with or without www. prefix. This method tests for such a case as it would be dangerous 514 * to do a SUBDOMAIN-COPY on such a site and thereby crawl that entire domain. 515 */ 516 private boolean isExactDomainMatch(String seedURLDomain, String domain) { 517 // check for an exact match as-is 518 if(seedURLDomain.equals(domain)) { 519 return true; 520 } 521 522 // else check if with or without a www. prefix we have an exact match with domain 523 if(seedURLDomain.startsWith("www.")) { 524 if(seedURLDomain.substring(4).equals(domain)) { 525 return true; 526 } 527 } else { 528 if(domain.equals("www."+seedURLDomain)) { 529 return true; 530 } 531 } 532 533 return false; 534 } 535 536 478 537 /** 479 * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,480 * is in the list of top sites.538 * Check if the domain of the seedurl, either in its entirety or when stripped of 539 * www/subdomains, is in the list of top sites. 481 540 * If it is, and the given url matches the regex for that topsite, then add the url to the 482 541 * whitelist and a regex disallowing the rest of the topsite to the url regex filter file. 483 484 */ 485 private String isURLinTopSitesMap(String domain) { 542 * @param fullSeedDomain: domain of seedURL without the protocol. May include www. prefix. 543 * @return one of the following values: 544 * - This function returns null if the seedURL's domain does not match any of the topsites. 545 * - The empty String is returned if the seedURL's domain matched a topsite but no (allowed- 546 * url-pattern) value was defined for it. The empty String is also returned if the seedURL's 547 * domain exactly matched a topsite and had a value of SUBDOMAIN-COPY, because we still don't 548 * want to blindly crawl a topsite (as would happen with SUBDOMAIN-COPY). 549 * - A non-empty String is returned if the seedURL's domain matched a topsite and a value 550 * was defined for it. 
(The value will be one of "SUBDOMAIN-COPY", "SINGLEPAGE" or an allowed 551 * URL pattern.) 552 */ 553 private String isURLinTopSitesMap(String fullSeedDomain) { 486 554 boolean keepLooping = true; 487 555 556 String domain = fullSeedDomain; 557 488 558 // domain parameter will have retained www or subdomains, but is stripped of protocol 489 559 … … 492 562 // If no match at all, return null. 493 563 do { 494 if(domain.contains("pinterest.com")) {495 System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");496 }497 564 498 565 String allowed_url_pattern = topSitesMap.get(domain); 499 566 if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain); 500 567 // there's an entry for the URL in the topSitesMap 501 System.err.println("##### A top site matches URL domain " + domain); 502 return allowed_url_pattern; 568 System.err.println("##### A top site matches URL domain " + domain); 569 570 // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without 571 // www prefix, should not exactly match the topSitesMap domain 572 // e.g. we don't want to crawl a seed URL with domain www.blogspot.com 573 // despite it matching topsite blogspot.com with a value of SUBDOMAIN-COPY. 
574 575 if(allowed_url_pattern.equals(SUBDOMAIN_COPY) && isExactDomainMatch(fullSeedDomain, domain)) { 576 return ""; // means don't crawl site, write url into unprocessed-topsite-matches file 577 } 578 return allowed_url_pattern; 503 579 } 504 580 // else, no entry for the URL in the topSitesMap 505 // Not done: strip subDomain from URL and check it against topSitesMap581 // We're not done yet: strip subDomain from URL and check it against topSitesMap again 506 582 507 String newURL = stripSubDomain(domain); 508 if(domain.equals(newURL)) keepLooping = false; 509 else domain = newURL; 583 String newDomain = stripSubDomain(domain); 584 if(domain.equals(newDomain)) { 585 keepLooping = false; 586 } else { 587 domain = newDomain; 588 } 510 589 } while(keepLooping); 511 590 … … 674 753 this.setRecordCount(wetRecordCount); 675 754 } 676 755 756 757 // --------------- STATIC METHODS AND INNER CLASSES USED BY MAIN -------------- // 677 758 public static void printUsage() { 678 759 System.err.println("Run this program as:"); … … 770 851 System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 771 852 772 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");853 System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 773 854 774 855
Note:
See TracChangeset
for help on using the changeset viewer.