Timestamp: 2019-10-09T23:10:06+13:00
File: 1 edited
Legend:
  (unmarked)  Unmodified
  +           Added
  -           Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33552 → r33557

      private HashMap<String, Integer> whiteList;
 
+     /** map of topsites with allowable regexes: sites too big to exhaustively crawl
+      * with optional regex defining allowed exceptions, like subdomains or url suffixes
+      * off that top site. For example, wikipedia.org is a topsite, but mi.wikipedia.org
+      * is relevant. Or blogspot.com is a top site, but someone's pages in Maori off blogspot
+      * would be relevant.
+      * The map would store top site domain suffix and an optional regex string for allowable
+      * url patterns.
+      */
+     private HashMap<String, String> topSitesMap;
+
      /** Map of domains we keep and the full urls we're keeping that are of that domain.
       * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys,
…
          initURLFilterList(whiteList, "url-whitelist-filter.txt");
 
+         // Create the map of topSites
+         System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
+         topSitesMap = new HashMap<String, String>();
+         //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
+
+         try (
+             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
+         ) {
+
+             String str = null;
+             while((str = reader.readLine()) != null) {
+                 str = str.trim();
+                 if(str.equals("") || str.startsWith("#")) {
+                     continue;
+                 }
+
+                 int tabindex = str.indexOf("\t");
+                 if(tabindex == -1) {
+                     topSitesMap.put(str, "");
+                 } else {
+                     String topsite = str.substring(0, tabindex).trim();
+                     String allowed_url_pattern = str.substring(tabindex+1).trim();
+                     topSitesMap.put(topsite, allowed_url_pattern);
+                 }
+             }
+         } catch (IOException ioe) {
+             ioe.printStackTrace();
+             System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
+         }
+
          //System.err.println("Prematurely terminating for testing purposes.");
          //System.exit(-1);
+     }
+
+     /** Work out the 'domain' for a given url.
+      * This retains any www. or subdomain prefix.
+      */
+     private String getDomainForURL(String url) {
+         int startIndex = url.indexOf("//"); // http:// or https:// prefix
+         startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
+         String domain = url.substring(startIndex);
+         int endIndex = domain.indexOf("/");
+         if(endIndex == -1) endIndex = domain.length();
+         domain = domain.substring(0, endIndex);
+
+         return domain;
      }
 
…
       * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
       */
-     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, File domainURLsFile) {
+     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
+                                     File domainURLsFile, File topSiteMatchesFile) {
          // Maintain Sets of unique domains and urls
          // TreeSet: by default, "the elements are ordered using their natural ordering"
…
              while((url = reader.readLine()) != null) { // readLine removes newline separator
 
-                 // work out domain. This retains any www. or subdomain prefix:
-                 int startIndex = url.indexOf("//"); // http:// or https:// prefix
-                 startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
-                 domain = url.substring(startIndex);
-                 int endIndex = domain.indexOf("/");
-                 if(endIndex == -1) endIndex = domain.length();
-                 domain = domain.substring(0, endIndex);
+                 // work out domain. This retains any www. or subdomain prefix
+                 domain = getDomainForURL(url);
 
                  //urlsSet.add(url);
…
          // We'd have pruned out duplicates by now and have a sorted list of domains,
          // each of which maps to seed URLs in the commoncrawl for that domain
-
-         /*
-         try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
-             Iterator<String> i = urlsSet.iterator();
-             while(i.hasNext()) {
-                 String url = i.next();
-                 seedURLsWriter.write(url + "\n");
-             }
-
-         } catch (IOException ioe) {
-             ioe.printStackTrace();
-             System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
-         }
-         */
-
+
          int domainCount = 0;
          File sitesFolder = new File(outputFolder, "sites");
…
          try (
              // global lists of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+             // Also a global file listing any urls that matched top sites that didn't specify
+             // allowed regex patterns
              BufferedWriter domainURLsWriter = new BufferedWriter(new FileWriter(domainURLsFile));
              BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile));
-             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))
+             BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile));
+             BufferedWriter topSiteMatchesWriter = new BufferedWriter(new FileWriter(topSiteMatchesFile))
          ) {
+
+             // initialise topSiteMatchesFile with some instructional text.
+             topSiteMatchesWriter.write("The following domain with seedURLs are on a major/top 500 site\n");
+             topSiteMatchesWriter.write("for which no allowed URL pattern regex has been specified.\n");
+             topSiteMatchesWriter.write("Specify one for this domain in the tab-spaced sites-too-big-to-exhaustively-crawl.txt file\n");
+
              //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet();
              Set<String> domainsSet = domainsToURLsMap.keySet();
              Iterator<String> domainIterator = domainsSet.iterator();
+
+             /*
+             // DEBUG
+             String value = topSitesMap.get("wikipedia.org");
+             if(value == null) {
+                 System.err.println("### wikipedia.org had null value");
+             } else {
+                 System.err.println("### wikipedia.org had value: " + value);
+             } // DEBUG
+             */
 
              while(domainIterator.hasNext()) {
-                 domainCount++;
+                 String domain = domainIterator.next();
+
+                 String allowedURLPatternRegex = isURLinTopSitesMap(domain);
+                 // If the domain is of a topsite for which no allowed URL pattern has been provided
+                 // in sites-too-big-to-exhaustively-crawl.txt,
+                 // then we don't know how to crawl the site. Warn the user by writing the affected
+                 // domain and seedURLs to the topSiteMatchesFile.
+                 if(allowedURLPatternRegex != null && allowedURLPatternRegex.equals("")) {
+                     // topsite, but we don't (yet) know what portion can be crawled
+                     // Append the top site and url to a global/toplevel file that
+                     // the user needs to check later and we're done with this domain as it
+                     // won't go into any other file hereafter
+
+                     Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+                     Iterator<String> urlIterator = urlsForDomainSet.iterator();
+                     while(urlIterator.hasNext()) {
+                         String url = urlIterator.next();
+                         topSiteMatchesWriter.write("\t" + url + "\n");
+                     }
+
+                     continue; // done with this domain
+                 }
+
+                 // start counting the domains we're actually going to process
+                 domainCount++;
+
                  String siteID = String.format(FORMATSTR, domainCount);
                  File domainFolder = new File(sitesFolder, siteID);
                  domainFolder.mkdir();
 
-                 // write out the domain
-                 String domain = domainIterator.next();
+                 // write out the domain
                  //seedURLsWriter.write(domain + "\n");
-                 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
-                 String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
-                 urlFilterWriter.write(regexed_domain + "\n");
+
 
                  // for every domain, we need a sites/0000x/ folder, where x is domain#, containing
…
                  // If we ever run nutch on a single seedURLs listing containing
                  // all seed pages to crawl sites from, the above two files will work for that.
-                 siteURLsWriter.write(domain + "\n");
-                 siteRegexWriter.write(regexed_domain + "\n");
 
-                 // next write out the urls for the domain with a tab prefixed to each
-                 // into the sites/0000x/seedURLs.txt file - also write into the global seeds file
+
+                 if(allowedURLPatternRegex == null) { // entire site can be crawled
+                     siteURLsWriter.write(domain + "\n");
+
+                     // Write out filter in the following form for a site, e.g. for nutch.apache.org:
+                     // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
+                     String regexed_domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
+                     urlFilterWriter.write(regexed_domain + "\n"); //global file
+                     siteRegexWriter.write(regexed_domain + "\n"); // site file
+                 }
+                 else { // domain belongs to a top site where only portion of site can be crawled
+
+                     if(allowedURLPatternRegex.equals("COPY")) { // COPY existing domain as url-filter
+                         siteURLsWriter.write(domain + "\n");
+                         // e.g. pinky.blogspot.com will add a filter for pinky.blogspot.com
+                         // and not for all of blogspot.com
+
+                         String regexed_domain = "+https?://"+domain.replace(".", "\\.") + "/";
+                         urlFilterWriter.write(regexed_domain + "\n");
+                         siteRegexWriter.write(regexed_domain + "\n");
+
+                     } else if(allowedURLPatternRegex.equals("SINGLEPAGE")) {
+                         // don't write out domain. We want individual pages
+                         //DON'T DO: siteURLsWriter.write(domain + "\n");
+
+                         // don't write out domain as a regex expression url filter
+                         // write out the individual seed urls for the domain instead
+                         // since we will only be downloading the single page
+
+                         Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
+                         for(String urlInDomain : urlsForDomainSet) {
+                             String regexed_url = "+^"+urlInDomain.replace(".", "\\.");
+                             urlFilterWriter.write(regexed_url + "\n");
+                             siteRegexWriter.write(regexed_url + "\n");
+                         }
+                     } else { // allowedURLPatternRegex is a url-form - convert to regex
+                         String regexed_pattern = "+^https?://"+allowedURLPatternRegex.replace(".", "\\.");
+                         siteURLsWriter.write(domain + "\n");
+                         urlFilterWriter.write(regexed_pattern + "\n");
+                         siteRegexWriter.write(regexed_pattern + "\n");
+
+                     }
+                 }
+
+                 // next write out the urls for the domain into the sites/0000x/seedURLs.txt file
+                 // also write into the global seeds file
+                 // (with a tab prefixed to each url?)
                  Set<String> urlsForDomainSet = domainsToURLsMap.get(domain);
                  Iterator<String> urlIterator = urlsForDomainSet.iterator();
                  while(urlIterator.hasNext()) {
                      String url = urlIterator.next();
-                     seedURLsWriter.write( url + "\n"); // global seedURLs file
-                     siteURLsWriter.write("\t" + url + "\n");
+                     seedURLsWriter.write("\t" + url + "\n"); // global seedURLs file
+                     siteURLsWriter.write("\t" + url + "\n");
                  }
              } catch (IOException ioe) {
…
                  System.err.println("\n@@@@@@@@@ Error writing to " + siteSeedsFile + " or " + siteRegexFile);
              }
-         }
-
+
+         }
+
          } catch (IOException ioe) {
              ioe.printStackTrace();
…
              System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
          }
+
+         /*
+         // BEGIN DEBUG
+         System.err.println("@@@@ TopSitesMap contains: ");
+         for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
+             String topSite = entry.getKey();
+             String urlPattern = entry.getValue();
+             System.err.println(topSite + " - " + urlPattern);
+         } // END DEBUG
+         */
+     }
+
+     private String stripSubDomain(String url) {
+         int index = url.indexOf(".");
+         if(index != -1) {
+             url = url.substring(index+1);
+         }
+         return url;
+     }
+
+     /**
+      * Check if the domain of the url, either in its entirety or when stripped of www/subdomains,
+      * is in the list of top sites.
+      * If it is, and the given url matches the regex for that topsite, then add the url to the
+      * whitelist and a regex disallowing the rest of the topsite to the url regex filter file.
+
+      */
+     private String isURLinTopSitesMap(String domain) {
+         boolean keepLooping = true;
+
+         // domain aprameter will have retained www or subdomains, but is stripped of protocol
+
+         // keep looping, stripping subdomains from url and checking if it matches a topsite domain
+         // if it does, return the value for that topsite domain in the topSitesMap
+         // If no match at all, return null.
+         do {
+             if(domain.contains("pinterest.com")) {
+                 System.err.println("@@@@@@@@@ Checking for url " + domain + " in the top sites map");
+             }
+
+             String allowed_url_pattern = topSitesMap.get(domain);
+             if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
+                 // there's an entry for the URL in the topSitesMap
+                 System.err.println("##### A top site matches URL domain " + domain);
+                 return allowed_url_pattern;
+             }
+             // else, no entry for the URL in the topSitesMap
+             // Not done: strip subDomain from URL and check it against topSitesMap
+
+             String newURL = stripSubDomain(domain);
+             if(domain.equals(newURL)) keepLooping = false;
+             else domain = newURL;
+         } while(keepLooping);
+
+         // url in entirety or stripped of subdomains did not match any of the topsites
+         return null;
      }
 
      private boolean isListedInFilterList(Map<String, Integer> filterListMap, String url) {
-         Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
-         Iterator<Map.Entry<String, Integer>> i = entries.iterator();
-         while(i.hasNext()) {
-             Map.Entry<String, Integer> entry = i.next();
+         //Set<Map.Entry<String,Integer>> entries = filterListMap.entrySet();
+         //Iterator<Map.Entry<String, Integer>> i = entries.iterator();
+         //while(i.hasNext()) {
+         //    Map.Entry<String, Integer> entry = i.next();
+         for(Map.Entry<String,Integer> entry : filterListMap.entrySet()) {
              String urlPattern = entry.getKey();
              Integer matchRule = entry.getValue();
…
       */
      public boolean isGreylisted(String url) {
-         // TODO: alexa top sites and auto-translated product sites
+         // auto-translated product sites
          return isListedInFilterList(greyList, url);
      }
…
      }
 
-     // global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
+
+     // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
      // The former is the only unique one. seedURLs and regex-urlfilters are
      // repeated on a per site/domain basis too, stored in the sites folder
…
      File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
      File domainURLsFile = new File(outFolder, "all-domain-urls.txt");
-     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile);
+     File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
+
+     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
 
      System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+
+     System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites but had no regex of allowed url patterns.\n");
+
 
      } catch(Exception e) {
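For reference, the new constructor code in this changeset parses conf/sites-too-big-to-exhaustively-crawl.txt as a tab-separated file: each non-comment line holds a top-site domain, optionally followed by a single tab and an allowed URL pattern. An empty pattern means the domain's seed URLs are only reported in unprocessed-topsite-matches.txt, COPY means only the seed URL's own (sub)domain is crawled, SINGLEPAGE means only the individual seed pages are fetched, and any other value is treated as a URL prefix that createSeedURLsFiles() turns into a +^https?:// regex filter. A minimal sketch of what such a file could look like (these entries are illustrative only, not taken from the real conf file; the two columns are separated by a single tab):

    # topsite-domain <TAB> allowed-url-pattern  (pattern column optional)
    pinterest.com
    blogspot.com	COPY
    docs.google.com	SINGLEPAGE
    wikipedia.org	mi.wikipedia.org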
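To see how the pieces fit together, here is a minimal, standalone Java sketch (not part of the changeset; the class name, map entries and URL are made up) of the domain/topsite lookup that getDomainForURL(), stripSubDomain() and isURLinTopSitesMap() perform before createSeedURLsFiles() decides which filter to write:

    import java.util.HashMap;

    // Hypothetical standalone example mirroring the changeset's lookup logic.
    public class TopSiteLookupSketch {

        // Same idea as CCWETProcessor.getDomainForURL(): strip the protocol and any path,
        // but keep www. or other subdomain prefixes.
        static String getDomainForURL(String url) {
            int startIndex = url.indexOf("//"); // http:// or https:// prefix
            startIndex = (startIndex == -1) ? 0 : (startIndex + 2);
            String domain = url.substring(startIndex);
            int endIndex = domain.indexOf("/");
            if (endIndex == -1) endIndex = domain.length();
            return domain.substring(0, endIndex);
        }

        // Same idea as CCWETProcessor.stripSubDomain(): drop the leftmost dot-separated label.
        static String stripSubDomain(String domain) {
            int index = domain.indexOf(".");
            return (index == -1) ? domain : domain.substring(index + 1);
        }

        public static void main(String[] args) {
            // Hypothetical topSitesMap contents; in the real code these are read from
            // conf/sites-too-big-to-exhaustively-crawl.txt.
            HashMap<String, String> topSitesMap = new HashMap<String, String>();
            topSitesMap.put("wikipedia.org", "mi.wikipedia.org");
            topSitesMap.put("pinterest.com", "");

            String domain = getDomainForURL("https://mi.wikipedia.org/wiki/SomePage"); // mi.wikipedia.org

            // Keep stripping subdomains until a topSitesMap entry matches or nothing is left,
            // mirroring isURLinTopSitesMap(); null means the domain is not a top site at all.
            String allowedURLPatternRegex = null;
            boolean keepLooping = true;
            do {
                if (topSitesMap.containsKey(domain)) {
                    allowedURLPatternRegex = topSitesMap.get(domain);
                    keepLooping = false;
                } else {
                    String stripped = stripSubDomain(domain);
                    if (stripped.equals(domain)) keepLooping = false;
                    else domain = stripped;
                }
            } while (keepLooping);

            // Prints: allowed URL pattern: mi.wikipedia.org
            System.out.println("allowed URL pattern: " + allowedURLPatternRegex);
        }
    }

With these hypothetical entries, mi.wikipedia.org resolves (after one subdomain strip) to the wikipedia.org entry and so only mi.wikipedia.org would be crawled, while a pinterest.com seed URL would end up listed in unprocessed-topsite-matches.txt.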