Changeset 33624
- Timestamp: 2019-11-05T21:48:50+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33623 r33624 251 251 * as output the URL seed list and regex-urlfilter text files required by nutch, see 252 252 * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 253 * 254 * This method creates seedURLs files and url-regexfilter files needed by nutch, instructing 255 * it what portion to crawl of each site. 256 * 257 * The topSiteMatches file also gets created, listing sites excluded from crawling as 258 * they're too large to exhaustively crawl. The user will be told to inspect this file 259 * after this program has finished running. 260 * 261 * If checkForPossibleProductSites, then any urls containing /mi(/) that are outside of NZ 262 * or whose geolocation isn't known will end up in the file denoted by possibleProductSitesFile 263 * 253 264 */ 254 public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,265 public void prepareSitesForNutchCrawling(File seedURLsFile, File urlFilterFile, 255 266 File domainURLsFile, File topSiteMatchesFile, 256 File possibleProductSitesFile) {267 boolean checkForPossibleProductSites, File possibleProductSitesFile) { 257 268 // Maintain a Map of unique domains mapped to seed urls at that domain 258 269 // TreeSet: by default, "the elements are ordered using their natural ordering" … … 294 305 } 295 306 296 /* 297 // Dr Nichols said that a url that was located outside the country and 298 // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
299 // Following Dr Nichols' idea, let's keep a look out for more product sites: 300 // if any URL contains /mi AND the tld of its domain is outside of New Zealand 301 // then add that domain (if not already added) and that url into a file 302 // for later manual inspection 303 if(!domainWithProtocol.endsWith(".nz") 304 && (url.contains("/mi/") || url.endsWith("/mi"))) { 305 306 if(!possibleProductDomains.contains(domainWithProtocol)) { 307 308 String countryCode = ""; 309 try { 310 // more expensive test, so do this only if above conditions are true: 311 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile); 312 System.err.println("@@@@ Got country code: " + countryCode); 313 } catch(Exception exceptObj) { 314 countryCode = ""; // forces domain to be included for inspection 307 if(checkForPossibleProductSites) { 308 // Dr Nichols said that a url that was located outside the country and 309 // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
310 // Following Dr Nichols' idea, let's keep a look out for more product sites: 311 // if any URL contains /mi AND the tld of its domain is outside of New Zealand 312 // then add that domain (if not already added) and that url into a file 313 // for later manual inspection 314 if(!domainWithProtocol.endsWith(".nz") 315 && (url.contains("/mi/") || url.endsWith("/mi"))) { 316 317 if(!possibleProductDomains.contains(domainWithProtocol)) { 315 318 316 logger.error("Could not check if domain " + domainWithProtocol 317 + " was in country: " + countryCode, 318 exceptObj); 319 String countryCode = ""; 320 try { 321 // more expensive test, so do this only if above conditions are true: 322 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile); 323 System.err.println("@@@@ Got country code: " + countryCode); 324 } catch(Exception exceptObj) { 325 countryCode = ""; // forces domain to be included for inspection 326 327 logger.error("Could not check if domain " + domainWithProtocol 328 + " was in country: " + countryCode, 329 exceptObj); 330 } 331 332 boolean isInNZ = countryCode.toLowerCase().equals("nz"); 333 334 335 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 336 if(!isInNZ) { 337 possibleProductDomains.add(domainWithProtocol); 338 // write both domain and a sample seedURL on that site out to file 339 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n"); 340 possibleProductSitesWriter.write("\t" + url + "\n"); 341 } 319 342 } 320 321 boolean isInNZ = countryCode.toLowerCase().equals("nz"); 322 323 324 //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 325 if(!isInNZ) { 326 possibleProductDomains.add(domainWithProtocol); 327 // write both domain and a sample seedURL on that site out to file 328 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n"); 329 possibleProductSitesWriter.write("\t" + url + "\n"); 330 } 343 //else { 
344 // already wrote out domain to file at some point, write just the URL out to file 345 //possibleProductSitesWriter.write("\t" + url + "\n"); 346 //} 331 347 } 332 //else { 333 // already wrote out domain to file at some point, write just the URL out to file 334 //possibleProductSitesWriter.write("\t" + url + "\n"); 335 //} 336 } 337 */ 348 } 338 349 } 339 350 } catch (IOException ioe) { … … 803 814 public static void printUsage() { 804 815 System.err.println("Run this program as:"); 805 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> ");816 System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> [--check-for-product-sites]"); 806 817 } 807 818 … … 855 866 856 867 public static void main(String[] args) { 857 if(args.length != 2) {868 if(args.length < 2 || args.length > 3) { 858 869 printUsage(); 859 870 return; 871 } 872 873 boolean checkForPossibleProductSites = false; 874 if(args.length == 3) { 875 if(!args[2].equals("--check-for-product-sites")) { 876 printUsage(); 877 return; 878 } else { 879 checkForPossibleProductSites = true; 880 } 860 881 } 861 882 … … 870 891 logger.error("Error: " + args[1] + " does not exist or is not a directory."); 871 892 return; 872 } 893 } 894 873 895 874 896 try { … … 891 913 File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 892 914 File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt"); 893 894 ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 915 916 917 ccWETFilesProcessor.prepareSitesForNutchCrawling(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, checkForPossibleProductSites, possibleProductSitesFile); 895 918 896 919 logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 897 920 898 logger.info("\n*** Check " + topSitesMatchedFile + " for 
sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 921 if(checkForPossibleProductSites) { 922 logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 923 } else { 924 possibleProductSitesFile.delete(); 925 } 899 926 900 927
Note: See TracChangeset for help on using the changeset viewer.