Changeset 33561 for gs3-extensions
- Timestamp: 2019-10-11T20:49:05+13:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt
r33559 r33561 12 12 13 13 # FORMAT OF THIS FILE'S CONTENTS: 14 # <topsite-base-url> <tabspace><value>14 # <topsite-base-url>,<value> 15 15 # where <value> can be empty or one of SUBDOMAIN-COPY, SINGLEPAGE, <url-form-without-protocol> 16 16 # … … 29 29 # However, if the seedurl's domain is an exact match on topsite-base-url, the seedurl will go 30 30 # into the file unprocessed-topsite-matches.txt and the site/page won't be crawled. 31 # - FOLLOW-LINKS-WITHIN-TOPSITE: if pages linked from the seedURL page can be followed and 32 # downloaded, as long as it's within the same subdomain matching the topsite-base-url. 33 # This is different from SUBDOMAIN-COPY, as that can download all of a specific subdomain but 34 # restricts against downloading the entire domain (e.g. all pinky.blogspot.com and not anything 35 # else within blogspot.com). FOLLOW-LINKS-WITHIN-TOPSITE can download all linked pages (at 36 # depth specified for the nutch crawl) as long as they're within the topsite-base-url. 37 # e.g. seedURLs on docs.google.com containing links will have those linked pages and any 38 # they link to etc. downloaded as long as they're on docs.google.com. 31 39 # - <url-form-without-protocol>: if a seedurl contains topsite-base-url, then the provided 32 40 # url-form-without-protocol will make up the urlfilter, again preventing leaking into a … … 38 46 # Remember to leave out any protocol <from url-form-without-protocol>. 39 47 40 41 42 docs.google.com SINGLEPAGE 43 drive.google.com SINGLEPAGE 44 forms.office.com SINGLEPAGE 45 player.vimeo.com SINGLEPAGE 46 static-promote.weebly.com SINGLEPAGE 48 # column 3: whether nutch should do fetch all or not 49 # column 4: number of crawl iterations 50 51 # docs.google.com is a special case: not all pages are public and any interlinking is likely to 52 # be intentional. 
But SUBDOMAIN-COPY does not work: as seedURL's domain becomes docs.google.com 53 # which, when combined with SUBDOMAIN-COPY, the Java code treats as a special case so that 54 # any seedURL on docs.google.com ends up pushed out into the "unprocessed....txt" text file. 55 #docs.google.com,SUBDOMAIN-COPY 56 docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE 57 58 drive.google.com,SINGLEPAGE 59 forms.office.com,SINGLEPAGE 60 player.vimeo.com,SINGLEPAGE 61 static-promote.weebly.com,SINGLEPAGE 47 62 48 63 # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos 49 64 # The page's containing folder is whitelisted in case the photos are there. 50 korora.econ.yale.edu 65 korora.econ.yale.edu,,SINGLEPAGE 51 66 52 67 000webhost.com … … 115 130 blackberry.com 116 131 blogger.com 117 blogspot.com 132 blogspot.com,SUBDOMAIN-COPY 118 133 bloomberg.com 119 134 booking.com … … 171 186 dreniq.com 172 187 dribbble.com 173 dropbox.com 188 dropbox.com,SINGLEPAGE 174 189 dropboxusercontent.com 175 190 dw.com … … 303 318 lonelyplanet.com 304 319 lycos.com 305 m.wikipedia.org 320 m.wikipedia.org,mi.m.wikipedia.org 306 321 mail.ru 307 322 marketwatch.com … … 315 330 merriam-webster.com 316 331 metro.co.uk 317 microsoft.com 332 microsoft.com,microsoft.com/mi-nz/ 318 333 microsoftonline.com 319 334 mirror.co.uk … … 382 397 photobucket.com 383 398 php.net 384 pinterest.com 399 pinterest.com,SINGLEPAGE 385 400 pixabay.com 386 401 playstation.com … … 456 471 stores.jp 457 472 storify.com 458 stuff.co.nz 473 stuff.co.nz,SINGLEPAGE 459 474 surveymonkey.com 460 475 symantec.com … … 534 549 wikihow.com 535 550 wikimedia.org 536 wikipedia.org 537 wiktionary.org 551 wikipedia.org,mi.wikipedia.org 552 wiktionary.org,mi.wiktionary.org 538 553 wiley.com 539 554 windowsphone.com 540 555 wired.com 541 556 wix.com 542 wordpress.org 557 wordpress.org,SUBDOMAIN-COPY 543 558 worldbank.org 544 559 wp.com -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33560 r33561 69 69 public final String SUBDOMAIN_COPY = "SUBDOMAIN-COPY"; 70 70 public final String SINGLEPAGE = "SINGLEPAGE"; 71 public final String FOLLOW_LINKS_WITHIN_TOPSITE = "FOLLOW-LINKS-WITHIN-TOPSITE"; 71 72 72 73 /** … … 74 75 * https://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions 75 76 * https://www.regular-expressions.info/refcharacters.html 76 */77 //public final String[] ESCAPE_CHARS_FOR_RE = [".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "\\", "|"];78 // put the \\ at start so we don't the escape character for chars escaped earlier77 * Put the \\ (escape char) at start so we don't double-escape chars already escaped, 78 * as would happen for any chars appearing earlier in this list than \\ 79 */ 79 80 public final String ESCAPE_CHARS_FOR_RE = "\\.^$*+?()[{|"; 81 //public final String[] ESCAPE_CHARS_FOR_RE = ["\\", ".", "^", "$", "*", "+", "?", "(", ")", "[", "{", "|"]; 80 82 81 83 private Properties configProperties = new Properties(); … … 212 214 } 213 215 214 int tabindex = str.indexOf("\t"); 215 if(tabindex == -1) { 216 // comma separated list of values 217 int splitindex = str.indexOf(","); 218 if(splitindex == -1) { 216 219 topSitesMap.put(str, ""); 217 220 } else { 218 String topsite = str.substring(0, tabindex).trim();219 String allowed_url_pattern = str.substring( tabindex+1).trim();221 String topsite = str.substring(0, splitindex).trim(); 222 String allowed_url_pattern = str.substring(splitindex+1).trim(); 220 223 topSitesMap.put(topsite, allowed_url_pattern); 221 224 } … … 352 355 while(domainIterator.hasNext()) { 353 356 String domainWithProtocol = domainIterator.next(); 357 // Also get domain without protocol prefix 354 358 int startIndex = domainWithProtocol.indexOf("//"); // http:// or https:// prefix 355 359 startIndex = (startIndex == -1) ? 
0 : (startIndex+2); // skip past the protocol's // portion 356 360 String domain = domainWithProtocol.substring(startIndex); 357 358 System.err.println("domain with protocol: " + domainWithProtocol); 359 System.err.println("domain: " + domain); 361 362 /*if(domain.contains("docs.google.com")) { 363 System.err.println("domain with protocol: " + domainWithProtocol); 364 System.err.println("domain: " + domain); 365 }*/ 360 366 361 367 String allowedURLPatternRegex = isURLinTopSitesMap(domain); … … 372 378 373 379 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 374 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 375 while(urlIterator.hasNext()) { 376 String url = urlIterator.next(); 380 for(String url : urlsForDomainSet) { 377 381 topSiteMatchesWriter.write("\t" + url + "\n"); 378 382 } 379 383 380 384 continue; // done with this domain 381 385 } … … 451 455 siteRegexWriter.write(regexed_url + "\n"); 452 456 } 457 } else if(allowedURLPatternRegex.equals(FOLLOW_LINKS_WITHIN_TOPSITE)) { 458 459 // DON'T write out domain into siteURLs file, 460 // BUT DO write it into urlFilter file 461 String regexed_domain = PROTOCOL_REGEX_PREFIX + escapeStringForRegex(domain) + "/"; 462 463 urlFilterWriter.write(regexed_domain + "\n"); 464 siteRegexWriter.write(regexed_domain + "\n"); 453 465 } else { // allowedURLPatternRegex is a url-form - convert to regex 454 466 if(!allowedURLPatternRegex.endsWith("/")) { … … 467 479 // also write into the global seeds file (with a tab prefixed to each?) 468 480 Set<String> urlsForDomainSet = domainsToURLsMap.get(domainWithProtocol); 469 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 470 while(urlIterator.hasNext()) { 471 String url = urlIterator.next(); 481 for(String url : urlsForDomainSet) { 472 482 seedURLsWriter.write(url + "\n"); // global seedURLs file 473 483 siteURLsWriter.write(url + "\n"); 474 484 } 485 475 486 } catch (IOException ioe) { 476 487 ioe.printStackTrace();
Note: See TracChangeset for help on using the changeset viewer.