Changeset 33569
- Timestamp: 2019-10-16T20:00:09+13:00 (5 years ago)
- Location: gs3-extensions/maori-lang-detection
- Files: 6 edited
Legend: unchanged lines are unprefixed; lines prefixed with "+" were added, lines prefixed with "-" were removed.
gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt
r33568 → r33569

  # NOT TOP SITES, BUT SITES WE INSPECTED AND WANT TO CONTROL SIMILARLY TO TOP SITES
  00.gs,SINGLEPAGE
-
- # May be a large site
+ # May be a large site with only seedURLs of real relevance
  topographic-map.com,SINGLEPAGE
+ ami-media.net,SINGLEPAGE
+ # 2 pages of declarations of human rights in Maori, rest in other languages
+ anitra.net,SINGLEPAGE
+ # special case
+ mi.centr-zashity.ru,SINGLEPAGE
+
+ # TOP SITE BUT NOT TOP 500
+ www.tumblr.com,SINGLEPAGE
+

  # TOP SITES
  …
  # The page's containing folder is whitelisted in case the photos are there.
  korora.econ.yale.edu,SINGLEPAGE
+

  000webhost.com
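Each entry above has the form domain,FLAG. SINGLEPAGE appears to mark sites where only the seed URL(s) are worth fetching rather than an exhaustive crawl (the CCWETProcessor diff below also mentions a SUBDOMAIN-COPY flag). The following is a minimal sketch of how such a line could be read into the topSitesMap consulted by CCWETProcessor; the comment-skipping and the reader shown here are illustrative assumptions, not the project's actual loader (which parses CSV data, per the error message in the diff below).

    // Sketch only: split "mi.centr-zashity.ru,SINGLEPAGE" into a domain key and a
    // flag/allowed-url-pattern value, matching the topSitesMap.put(topsite, allowed_url_pattern)
    // call visible in the CCWETProcessor diff. The file handling here is assumed for illustration.
    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    public class TopSitesSketch {
        public static Map<String, String> load(String filename) throws IOException {
            Map<String, String> topSitesMap = new HashMap<>();
            try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    line = line.trim();
                    if (line.isEmpty() || line.startsWith("#")) continue; // skip blanks and comments
                    int comma = line.indexOf(',');
                    String topsite = (comma == -1) ? line : line.substring(0, comma);
                    String allowedUrlPattern = (comma == -1) ? "" : line.substring(comma + 1);
                    topSitesMap.put(topsite, allowedUrlPattern); // e.g. "00.gs" -> "SINGLEPAGE"
                }
            }
            return topSitesMap;
        }
    }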
gs3-extensions/maori-lang-detection/conf/url-blacklist-filter.txt
r33568 → r33569

  zh-min-nan.wiktionary.org

+ ######
  # unwanted domains
  .video-chat.
  …
  acba.osb-land.com

+
+ # just get rid of any URL containing "livejasmin"
+ ## livejasmin
+ # Actually: do that in the code (CCWETProcessor) with a log message,
+ # since we actually need to get rid of any sites in entirety that contain
+ # any url with the string "livejasmin"
+ # So run the program once, check the log for messages mentioning "additional"
+ # adult sites found and add their domains in here.
+ anigma-beauty.com
+ adultfeet.com
+ atopian.org
+ bellydancingvideo.net
+ bmmodelsagency.com
+ brucknergallery.com
+ fuckvidz.org
+ photobattle.net
+ votekat.info
+
+ # Similar to above, the following contained the string "jasmin" in the URL
+ teenycuties.com
+ a.tiles.mapbox.com
+ blazingteens.net
+ redtubeporn.info
+ osb-land.com
+ totallyhotmales.com
+ babeevents.com
+ talkserver.de
+ hehechat.org
+ fetish-nights.com
+ lesslove.com
+ hebertsvideo.com
+
  # sounds like some pirating site
  ^http://pirateguides.com/
  …
  # not sure about the domain name and/or full url seems like it belongs here
  abcutie.com
+
+ # only had a single seedURL and it quickly redirected to an adult site
+ apparactes.gq
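The filter entries above are plain substrings unless prefixed with ^ (match the start of the URL, as in ^http://pirateguides.com/) or suffixed with $ (match the end). The initURLFilterList changes in the CCWETProcessor diff below store these as LIST_ENTRY_STARTSWITH, LIST_ENTRY_ENDSWITH and LIST_ENTRY_CONTAINS. The sketch below shows how a URL might then be tested against such a map; the constant values and the lookup loop are illustrative assumptions, not the project's isListedInFilterList implementation.

    // Sketch only: test a URL against a filter map whose values record the match type
    // implied by the ^ and $ markers in the filter files. The numeric constants are
    // placeholders, not the real LIST_ENTRY_* values in CCWETProcessor.
    import java.util.Map;

    public class FilterMatchSketch {
        static final int LIST_ENTRY_CONTAINS = 0;
        static final int LIST_ENTRY_STARTSWITH = 1;
        static final int LIST_ENTRY_ENDSWITH = 2;

        public static boolean isListed(Map<String, Integer> filterMap, String url) {
            for (Map.Entry<String, Integer> entry : filterMap.entrySet()) {
                String filter = entry.getKey();
                switch (entry.getValue()) {
                    case LIST_ENTRY_STARTSWITH: if (url.startsWith(filter)) return true; break;
                    case LIST_ENTRY_ENDSWITH:   if (url.endsWith(filter))   return true; break;
                    default:                    if (url.contains(filter))   return true; break;
                }
            }
            return false;
        }
    }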
gs3-extensions/maori-lang-detection/conf/url-greylist-filter.txt
r33568 → r33569

  abacre.com
  cn-huafu.net
+ apteka.social
+

  # not product stores but autotranslated?
  …
  1videosmusica.com
  256file.com
- 7773033.ru
- abali.ru
- allbeautyone.ru
+ # already in greylisting of all .ru
+ #7773033.ru
+ #abali.ru
+ #allbeautyone.ru
+ aqualuz.org

  # if page doesn't load and can't be tested
  …
  www.kiterewa.pl

- # license plate site?
- eba.com.ru
+
+
+ # MANUALLY INSPECTED URLS AND ADDED TO GREYLIST
+
+ # license plate site? - already in greylisting of all .ru
+ #eba.com.ru

  # As per archive.org, there's just a photo on the defunct page at this site
  …
  # seems to be Indonesian or Malaysian Bible rather than in Maori or any Polynesian language
  alkitab.life:2022
+
+ # appears defunct
+ alixira.com
+
+ # single seedURL was not a page in Maori, but global languages.
+ # And the rest of the domain appears to be in English
+ anglican.org
+
+
+ ### TLDs that we greylist - any exceptions will be in the whitelist
+ # Our list of .ru and .pl domains were not relevant
+ .ru/
+ .pl/
+ .tk/
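The new .ru/, .pl/ and .tk/ entries greylist whole TLDs, with the comment noting that any exceptions go in the whitelist (the next file in this changeset whitelists two .ru URLs). The sketch below illustrates the implied precedence, whitelist first, then blacklist, then greylist; that ordering and the simplified substring matching are assumptions for illustration only, since the changeset itself does not show the check order.

    // Sketch only: a URL on a greylisted TLD such as ".ru/" is still kept when it is
    // explicitly whitelisted (e.g. https://mi.centr-zashity.ru/). The check order and the
    // plain substring matching below are assumptions, not CCWETProcessor's actual logic.
    import java.util.Set;

    public class ListPrecedenceSketch {
        private final Set<String> whiteList;
        private final Set<String> greyList;
        private final Set<String> blackList;

        public ListPrecedenceSketch(Set<String> whiteList, Set<String> greyList, Set<String> blackList) {
            this.whiteList = whiteList;
            this.greyList = greyList;
            this.blackList = blackList;
        }

        // Simplified substring matching; the real filter files also support ^ and $ markers.
        private static boolean matches(Set<String> filters, String url) {
            for (String f : filters) {
                if (url.contains(f)) return true;
            }
            return false;
        }

        public boolean keepUrl(String url) {
            if (matches(whiteList, url)) return true;   // explicit exception
            if (matches(blackList, url)) return false;  // dropped outright
            if (matches(greyList, url))  return false;  // set aside for manual inspection
            return true;
        }
    }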
gs3-extensions/maori-lang-detection/conf/url-whitelist-filter.txt
r33559 → r33569

  # some particular other urls on yale.edu
  http://korora.econ.yale.edu/phillips/archive/hauraki.htm
+
+ # We've added .ru$ sites to the blacklist, but the following
+ # Russian website contains actual Maori language content
+ http://www.krassotkin.ru/sites/prayer.su/maori/
+ https://mi.centr-zashity.ru/
gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh
r33567 → r33569

  echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
  fi

+
+ # move the peripheral crawl products (the log.out and UNFINISHED files)
+ # from the input to the output folder. This way we can re-run the crawl and
+ # the original output will still have been preserved
+ mv ${siteDir}log.out $outputDir/$crawlId/log.out
+ mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
  }
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33568 → r33569

  } catch(Exception e) {
- System.err.println("Exception attempting to read properties from config.properties.");
- logger.error("Exception attempting to read properties from config.properties.");
- e.printStackTrace();
+ error("Exception attempting to read properties from config.properties.", e);
  }

  if(configProperties.size() == 0) {
- System.err.println("*** Warning: no values read into config properties. Using defaults.");
+ warn("*** Warning: no values read into config properties. Using defaults.");
  }
  …
  // prepare our blacklist, greylist (for inspection) and whitelist
- System.err.println("Loading blacklist.");
+ info("Loading blacklist.");
  blackList = new HashMap<String, Integer>();
  initURLFilterList(blackList, "url-blacklist-filter.txt");

- System.err.println("Loading greylist.");
+ info("Loading greylist.");
  greyList = new HashMap<String, Integer>();
  initURLFilterList(greyList, "url-greylist-filter.txt");

- System.err.println("Loading whitelist.");
+ info("Loading whitelist.");
  whiteList = new HashMap<String, Integer>();
  initURLFilterList(whiteList, "url-whitelist-filter.txt");

  // Create the map of topSites
- System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
+ info("Loading map of topsites with regex of allowable url patterns for each topsite.");
  topSitesMap = new HashMap<String, String>();
  …
  topSitesMap.put(topsite, allowed_url_pattern);

- // System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
+ //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);

  }
  } catch(Exception e) {
- e.printStackTrace();
- System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
+ error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
  }

- // System.err.println("Prematurely terminating for testing purposes.");
+ //debug("Prematurely terminating for testing purposes.");
  //System.exit(-1);
  }
  …
  urlsSet = new TreeSet<String>();
  urlsSet.add(url);
- domainsToURLsMap.put(domainWithProtocol, urlsSet);
+ domainsToURLsMap.put(domainWithProtocol, urlsSet);
  } else {
  urlsSet = domainsToURLsMap.get(domainWithProtocol);
  …
  }
  } catch (IOException ioe) {
- ioe.printStackTrace();
- System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
+ error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
  }
  …
  String value = topSitesMap.get("wikipedia.org");
  if(value == null) {
- System.err.println("### wikipedia.org had null value");
+ debug("### wikipedia.org had null value");
  } else {
- System.err.println("### wikipedia.org had value: " + value);
+ debug("### wikipedia.org had value: " + value);
  } // DEBUG
  */
  …
  /*if(domain.contains("docs.google.com")) {
- System.err.println("domain with protocol: " + domainWithProtocol);
- System.err.println("domain: " + domain);
+ debug("domain with protocol: " + domainWithProtocol);
+ debug("domain: " + domain);
  }*/
  …
  } catch (IOException ioe) {
- ioe.printStackTrace();
- System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
+ error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
  }
  …
  } catch (IOException ioe) {
- ioe.printStackTrace();
- System.err.println("\n@@@@@@@@@ Error writing to one of: ");
- System.err.println("\t" + seedURLsFile);
- System.err.println("\t" + urlFilterFile);
- System.err.println("\t" + domainURLsFile);
- System.err.println("\t" + topSiteMatchesFile);
+ error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
+       + "\n\t" + urlFilterFile
+       + "\n\t" + domainURLsFile
+       + "\n\t" + topSiteMatchesFile, ioe);
  }

  /*
  // BEGIN DEBUG
- System.err.println("@@@@ TopSitesMap contains: ");
+ debug("@@@@ TopSitesMap contains: ");
  for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
  String topSite = entry.getKey();
  String urlPattern = entry.getValue();
- System.err.println(topSite + " - " + urlPattern);
+ debug(topSite + " - " + urlPattern);
  } // END DEBUG
  */
  …
  if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
  // there's an entry for the URL in the topSitesMap
- System.err.println("##### A top site matches URL domain " + domain);
+ debug("##### A top site matches URL domain " + domain);

  // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
  …
  */
  public boolean isBlacklisted(String url) {
- return isListedInFilterList(blackList, url);
+ boolean isBlackListed = isListedInFilterList(blackList, url);
+
+ // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
+ // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
+ String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
+ if(!isBlackListed && url.contains("jasmin")) {
+ warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
+ blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
+ }
+ return isBlackListed;
  }
  …
  // if filterListFilename does not exist in the conf folder, just return
  if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
- System.err.println(filterListFilename + " does not exist");
+ warn("Filter list filename: " + filterListFilename + " does not exist");
  return;
  }
  …
  filter = filter.substring(1);
  list.put(filter, LIST_ENTRY_STARTSWITH);
- System.err.println("Match filter startswith: " + filter);
+ //debug("Match filter startswith: " + filter);
  }
  else if(filter.endsWith("$")) {
  filter = filter.substring(0, filter.length()-1);
  list.put(filter, LIST_ENTRY_ENDSWITH);
+ //debug("@@@ Match filter endswith: " + filter);
  }
  else {
  list.put(filter, LIST_ENTRY_CONTAINS);
  }
- // System.err.println("Got filter: " + filter);
+ //debug("Got filter: " + filter);

  } catch (IOException ioe) {
- ioe.printStackTrace();
- System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
+ error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
  …
  for(int i = 0; i < WETFiles.length; i++) {
  File WETFile = WETFiles[i];
- logger.debug("Processing WETfile: " + WETFile);
+ debug("Processing WETfile: " + WETFile);

  // Any .gz files listed means they haven't been unzipped yet. So unzip.
  …
  // Check the unzipped WETFile exists

- if(!WETFile.exists() || !WETFile.isFile()) {
- System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
- logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
+ if(!WETFile.exists() || !WETFile.isFile()) {
+ error("Error: " + WETFile + " does not exist (failure to unzip?)");
  return;
  }
  …
  // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
+ public static void info(String msg) {
+ System.err.println(msg);
+ logger.info(msg);
+ }
+ public static void debug(String msg) {
+ System.err.println(msg);
+ logger.debug(msg);
+ }
+ public static void warn(String msg) {
+ System.err.println(msg);
+ logger.warn(msg);
+ }
+ public static void error(String msg) {
+ System.err.println(msg);
+ logger.error(msg);
+ }
+ public static void error(String msg, Exception e) {
+ logger.error(msg, e);
+ System.err.println(msg);
+ e.printStackTrace();
+ }
+
  public static void printUsage() {
- System.err.println("Run this program as:");
- System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
+ info("Run this program as:");
+ info("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
  }
  …
  }
  else {
- System.err.println("File " + f + " is not a directory");
+ info("File " + f + " is not a directory");
  }
  return false;
  …
  File commoncrawlDir = new File(args[0]);
  if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
- System.out.println("Error: " + args[0] + " does not exist or is not a directory");
+ error("Error: " + args[0] + " does not exist or is not a directory");
  return;
  }
  …
  File outFolder = new File(args[1]);
  if(!outFolder.exists() || !outFolder.isDirectory()) {
- System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
+ error("Error: " + args[1] + " does not exist or is not a directory.");
  return;
  }
  …
  for(int i = 0; i < ccrawlFolders.length; i++) {
  File ccrawlFolder = ccrawlFolders[i];
- System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
+ info("About to process commoncrawl WET files folder: " + ccrawlFolder);
  ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
  }
  …
  ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);

- System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+ info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");

- System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
+ info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");


  } catch(Exception e) {
  // can get an exception when instantiating CCWETProcessor instance
- e.printStackTrace();
- System.err.println(e.getMessage());
+ error(e.getMessage(), e);
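Most of this file's changes replace paired System.err.println(...) and logger.*(...) calls with the new static helpers info/debug/warn/error, which write to both stderr and the log4j logger in one call. Below is a small usage example of the error(String, Exception) overload added in this changeset; the surrounding class and the simulated exception are illustrative only.

    // Sketch only: one helper call replaces the earlier
    // System.err.println(...) + logger.error(...) + e.printStackTrace() triple,
    // so console output and the log can no longer drift apart.
    import org.greenstone.atea.CCWETProcessor;

    public class LoggingHelperExample {
        public static void main(String[] args) {
            try {
                throw new java.io.IOException("simulated failure");
            } catch (java.io.IOException ioe) {
                // logs the message and stack trace via log4j, and echoes both to stderr
                CCWETProcessor.error("@@@@@@@@@ Error reading into map from file url-blacklist-filter.txt", ioe);
            }
        }
    }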