[33502] | 1 | # URL blacklist
|
---|
| 2 | # FORMAT:
|
---|
| 3 | # precede URL by ^ to blacklist urls that match the given prefix
|
---|
| 4 | # follow URL with $ to blacklist urls that match the given suffix
|
---|
| 5 | # ^url$ will blacklist urls that match the given url completely
|
---|
| 6 | # Without either ^ or $ symbol, urls containing the given url will get blacklisted
|
---|
| 7 |
|
---|
| 8 |
|
---|
[33559] | 9 | # manually adjusting for irrelevant topsite hits
|
---|
| 10 | # Rapa-Nui is related to Easter Island
|
---|
| 11 | ^http://codex.cs.yale.edu/avi/silberschatz/gallery/trips-photos/South-America/Rapa-Nui/
|
---|
| 12 |
|
---|
| 13 | # We will blacklist this yale.edu domain except for the subportion that gets whitelisted
|
---|
| 14 | # then in the sites-too-big-to-exhaustively-crawl.txt, we have a mapping for an allowed url
|
---|
| 15 | # pattern in case elements on the page are stored elsewhere
|
---|
| 16 | ^http://korora.econ.yale.edu/
|
---|
| 17 |
|
---|
[33556] | 18 | # wikipedia pages in
|
---|
| 19 | # ksh (a German dialect), ilo (Filipino), ty (Tahitian), wa (Walon/Walloon),
|
---|
| 20 | # io (Ido version of Esperanto) and zh-min-nan (Min-Nan-Chinese) are not in the Maori language
|
---|
| 21 | # Not sure why Commoncrawl had found them for language code MRI
|
---|
| 22 | ksh.wikipedia.org
|
---|
| 23 | ilo.wikipedia.org
|
---|
| 24 | wa.wikipedia.org
|
---|
| 25 | ty.m.wikipedia.org
|
---|
| 26 | io.m.wikipedia.org
|
---|
| 27 | zh-min-nan.wikipedia.org
|
---|
| 28 | zh-min-nan.wiktionary.org
|
---|
| 29 |
|
---|
[33569] | 30 | ######
|
---|
[33502] | 31 | # unwanted domains
|
---|
| 32 | .video-chat.
|
---|
| 33 | .videochat.
|
---|
[33554] | 34 | 3chat.ru
|
---|
| 35 | livevideochatting.org
|
---|
| 36 | lovewebcam.net
|
---|
[33502] | 37 |
|
---|
[33554] | 38 | cherrybabe.biz
|
---|
| 39 | dreamsbabes.com
|
---|
| 40 | adultfantasyboutique.com
|
---|
| 41 | adultterra.com
|
---|
| 42 |
|
---|
[33502] | 43 | leatherdyke.porn
|
---|
| 44 | hornyteenharlots.com
|
---|
| 45 | adultviewsex.com
|
---|
[33554] | 46 | adultsexualvideo.com
|
---|
| 47 | ctbererotica.sexe-traque.com
|
---|
| 48 | cybererotia.porn234.com
|
---|
| 49 | cybereroticz.adultsupermart.com
|
---|
| 50 | freegaywebcams.info
|
---|
| 51 | lesbiansinmysoup.com
|
---|
| 52 | videopornoxx.online
|
---|
| 53 | sexandplay.com
|
---|
| 54 | sexynakedselfies.info
|
---|
| 55 | barebabez.com
|
---|
| 56 | britnudes.net
|
---|
| 57 | camaporno.com
|
---|
| 58 | webxvideo.com
|
---|
| 59 | gayspornosex.com
|
---|
| 60 | jasminreviews.com
|
---|
| 61 | sexchatlines4u.com
|
---|
| 62 | sexybabeworld.org
|
---|
| 63 | sexyleaks.info
|
---|
| 64 | uniqueporno.com
|
---|
| 65 | wildsexsluts.com
|
---|
| 66 | xxxblacknudes.com
|
---|
[33568] | 67 | bigsexymelons.com
|
---|
[33800] | 68 | mi.thebestmasturbators.com
|
---|
[33502] | 69 |
|
---|
[33568] | 70 | # more adult sites
|
---|
| 71 | acba.osb-land.com
|
---|
[33823] | 72 | the-naked.com
|
---|
| 73 | # the full URL is http://ww25.milfsplease.com, but I don't know whether the ww25 prefix should be included or not, so both forms are listed
|
---|
| 74 | ww25.milfsplease.com
|
---|
| 75 | milfsplease.com
|
---|
[33568] | 76 |
|
---|
[33569] | 77 | # just get rid of any URL containing "livejasmin"
|
---|
| 78 | ## livejasmin
|
---|
| 79 | # Actually: do that in the code (CCWETProcessor) with a log message,
|
---|
| 80 | # since we actually need to get rid of any sites in entirety that contain
|
---|
| 81 | # any url with the string "livejasmin"
|
---|
| 82 | # So run the program once, check the log for messages mentioning "additional"
|
---|
| 83 | # adult sites found and add their domains in here.
|
---|
| 84 | anigma-beauty.com
|
---|
| 85 | adultfeet.com
|
---|
| 86 | atopian.org
|
---|
| 87 | bellydancingvideo.net
|
---|
| 88 | bmmodelsagency.com
|
---|
| 89 | brucknergallery.com
|
---|
| 90 | fuckvidz.org
|
---|
| 91 | photobattle.net
|
---|
| 92 | votekat.info
|
---|
| 93 |
|
---|
| 94 | # Similar to above, the following contained the string "jasmin" in the URL
|
---|
| 95 | teenycuties.com
|
---|
| 96 | a.tiles.mapbox.com
|
---|
| 97 | blazingteens.net
|
---|
| 98 | redtubeporn.info
|
---|
| 99 | osb-land.com
|
---|
| 100 | totallyhotmales.com
|
---|
| 101 | babeevents.com
|
---|
| 102 | talkserver.de
|
---|
| 103 | hehechat.org
|
---|
| 104 | fetish-nights.com
|
---|
| 105 | lesslove.com
|
---|
| 106 | hebertsvideo.com
|
---|
| 107 |
|
---|
[33502] | 108 | # sounds like some pirating site
|
---|
| 109 | ^http://pirateguides.com/
|
---|
[33568] | 110 | fastmp3.ru
|
---|
[33531] | 111 |
|
---|
| 112 | # from alexa topsites at https://www.alexa.com/topsites
|
---|
| 113 | livejasmin.com
|
---|
| 114 | pornhub.com
|
---|
| 115 | # listed as a similar topsite at https://en.wikipedia.org/wiki/List_of_most_popular_websites
|
---|
| 116 | redtube.com
|
---|
| 117 | xvideos.com
|
---|
| 118 | xhamster.com
|
---|
| 119 | xnxx.com
|
---|
[33568] | 120 |
|
---|
| 121 |
|
---|
| 122 | # not sure, but going by the domain name and/or full url it seems like it belongs here
|
---|
| 123 | abcutie.com
|
---|
[33569] | 124 |
|
---|
| 125 | # only had a single seedURL and it quickly redirected to an adult site
|
---|
| 126 | apparactes.gq
|
---|