[33559] | 1 | # Mapping of top sites, in base-URL form, to a value controlling how each is crawled
|
---|
[33550] | 2 |
|
---|
[33559] | 3 | # This file contains sites that are too large to crawl exhaustively.
|
---|
| 4 | # The domains are from Alexa top sites (where only the first 50 were visible)
|
---|
[33551] | 5 | # Added further top sites from https://en.wikipedia.org/wiki/List_of_most_popular_websites
|
---|
[33553] | 6 | # Finally also added https://moz.com/top500 by downloading its CSV file and
|
---|
| 7 | # adding its URLs to the existing listing here from alexa/wiki.
|
---|
[33551] | 8 | # Then used LibreOffice's Calc spreadsheet software to sort alphabetically and remove duplicates.
|
---|
[33553] | 9 | # Then in Gedit, used regex search and replace to remove <subdomain>.<site>.ext variants, keeping
|
---|
| 10 | # just <site>.ext
|
---|
| 11 | # And finally, re-sorted the reduced list alphabetically and pasted into here.
|
---|
[33550] | 12 |
|
---|
[33559] | 13 | # FORMAT OF THIS FILE'S CONTENTS:
|
---|
[33561] | 14 | # <topsite-base-url>,<value>
|
---|
[33562] | 15 | # where <value> is one of:
|
---|
| 16 | # empty, SUBDOMAIN-COPY, FOLLOW-LINKS-WITHIN-TOPSITE, SINGLEPAGE, <url-form-without-protocol>
|
---|
[33559] | 17 | #
|
---|
[33562] | 18 | # - if value is left empty: if seedurl contains topsite-base-url, the seedurl will go into the
|
---|
| 19 | # file unprocessed-topsite-matches.txt and the site/page won't be crawled.
|
---|
[33559] | 20 | # The user will be notified to inspect the file unprocessed-topsite-matches.txt.
|
---|
| 21 | # - SINGLEPAGE: if seedurl matches topsite-base-url, then only download the page at that seedurl.
|
---|
| 22 | # For example, if the seedurl is http://docs.google.com/some-long-suffix-in-base64, then it
|
---|
| 23 | # matches the topsite-base-url of docs.google.com and its value of SINGLEPAGE will add the
|
---|
| 24 | # seedurl itself as the regex url-filter, to restrict the crawl to just the specified page.
|
---|
| 25 | # - SUBDOMAIN-COPY: if seedurl CONTAINS topsite-base-url, then the seedurl's own subdomain
|
---|
| 26 | # (or else its domain) will make up the urlfilter, so we don't leak out into a larger domain.
|
---|
| 27 | # Use SUBDOMAIN-COPY to restrict to a domain prefix/subdomain. For example, if seedurl is
|
---|
| 28 | # pinky.blogspot.com, it will match the topsite-base-url of blogspot.com, but SUBDOMAIN-COPY
|
---|
| 29 | # will ensure we restrict crawling to pages on pinky.blogspot.com.
|
---|
| 30 | # However, if the seedurl's domain is an exact match on topsite-base-url, the seedurl will go
|
---|
| 31 | # into the file unprocessed-topsite-matches.txt and the site/page won't be crawled.
|
---|
[33666] | 32 | # - FOLLOW-LINKS-WITHIN-TOPSITE: download the seedURL pages; any pages linked from each seedURL
|
---|
| 33 | # page should be followed and downloaded too, as long as they're within the same subdomain
|
---|
| 34 | # matching the topsite-base-url.
|
---|
[33561] | 35 | # This is different from SUBDOMAIN-COPY, as that can download all of a specific subdomain but
|
---|
| 36 | # restricts against downloading the entire domain (e.g. all pinky.blogspot.com and not anything
|
---|
| 37 | # else within blogspot.com). FOLLOW-LINKS-WITHIN-TOPSITE can download all linked pages (at
|
---|
| 38 | # depth specified for the nutch crawl) as long as they're within the topsite-base-url.
|
---|
| 39 | # e.g. seedURLs on docs.google.com containing links will have those linked pages and any
|
---|
| 40 | # they link to etc. downloaded as long as they're on docs.google.com.
|
---|
[33559] | 41 | # - <url-form-without-protocol>: if a seedurl contains topsite-base-url, then the provided
|
---|
| 42 | # url-form-without-protocol will make up the urlfilter, again preventing leaking into a
|
---|
| 43 | # larger part of the domain. For example, if the seedurl is mi.wikipedia.org/SomePage, it will
|
---|
| 44 | # match the topsite-base-url of wikipedia.org for which the <url-form-without-protocol>
|
---|
| 45 | # value is mi.wikipedia.org, which should be all that's accepted for wikipedia.org. The
|
---|
| 46 | # <url-form-without-protocol> ends up in the regex urlfilter file, thereby restricting the
|
---|
| 47 | # crawl to just mi.wikipedia.org.
|
---|
| 48 | # Remember to leave out any protocol from <url-form-without-protocol>.
|
---|
[33562] | 49 | #
|
---|
| 50 | # TODO If useful:
|
---|
| 51 | # column 3: whether nutch should do fetch all or not
|
---|
| 52 | # column 4: number of crawl iterations
|
---|
[33551] | 53 |
|
---|
[33565] | 54 |
|
---|
| 55 | # NOT TOP SITES, BUT SITES WE INSPECTED AND WANT TO CONTROL SIMILARLY TO TOP SITES
|
---|
| 56 | 00.gs,SINGLEPAGE
|
---|
[33569] | 57 | # May be a large site with only seedURLs of real relevance
|
---|
[33568] | 58 | topographic-map.com,SINGLEPAGE
|
---|
[33569] | 59 | ami-media.net,SINGLEPAGE
|
---|
| 60 | # 2 pages of declarations of human rights in Maori, rest in other languages
|
---|
| 61 | anitra.net,SINGLEPAGE
|
---|
| 62 | # special case
|
---|
| 63 | mi.centr-zashity.ru,SINGLEPAGE
|
---|
[33565] | 64 |
|
---|
[33666] | 65 | # we want the http://loquevendra318.com/fox/maori.html seed URL but also
|
---|
| 66 | # pages within the following subsection
|
---|
| 67 | loquevendra318.com,loquevendra318.com/fox/maori/
|
---|
| 68 |
|
---|
[33604] | 69 | martinvrijland.nl,martinvrijland.nl/mi/
|
---|
| 70 | csunplugged.org,csunplugged.org/mi/
|
---|
| 71 | gpedia.com,gpedia.com/mi/
|
---|
| 72 |
|
---|
[33569] | 73 | # TOP SITE BUT NOT TOP 500
|
---|
| 74 | www.tumblr.com,SINGLEPAGE
|
---|
| 75 |
|
---|
| 76 |
|
---|
[33565] | 77 | # TOP SITES
|
---|
| 78 |
|
---|
[33561] | 79 | # docs.google.com is a special case: not all pages are public and any interlinking is likely to
|
---|
[33562] | 80 | # be intentional. Grab all linked pages, for link depth set with nutch's crawl, as long as the
|
---|
| 81 | # links are within the given topsite-base-url
|
---|
[33561] | 82 | docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE
|
---|
[33559] | 83 |
|
---|
[33562] | 84 | # Just crawl a single page for these:
|
---|
[33561] | 85 | drive.google.com,SINGLEPAGE
|
---|
| 86 | forms.office.com,SINGLEPAGE
|
---|
| 87 | player.vimeo.com,SINGLEPAGE
|
---|
| 88 | static-promote.weebly.com,SINGLEPAGE
|
---|
[33559] | 89 |
|
---|
| 90 | # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos
|
---|
| 91 | # The page's containing folder is whitelisted in case the photos are there.
|
---|
[33562] | 92 | korora.econ.yale.edu,SINGLEPAGE
|
---|
[33559] | 93 |
|
---|
[33569] | 94 |
|
---|
[33551] | 95 | 000webhost.com
|
---|
[33550] | 96 | 360.cn
|
---|
[33551] | 97 | 4shared.com
|
---|
| 98 | a8.net
|
---|
| 99 | abc.es
|
---|
| 100 | abc.net.au
|
---|
| 101 | abcnews.go.com
|
---|
| 102 | about.com
|
---|
| 103 | about.me
|
---|
| 104 | aboutads.info
|
---|
| 105 | abril.com.br
|
---|
| 106 | academia.edu
|
---|
[33550] | 107 | accuweather.com
|
---|
[33551] | 108 | addthis.com
|
---|
| 109 | addtoany.com
|
---|
| 110 | adobe.com
|
---|
[33555] | 111 | adweek.com
|
---|
[33551] | 112 | airbnb.com
|
---|
| 113 | akamaihd.net
|
---|
| 114 | alexa.com
|
---|
| 115 | alibaba.com
|
---|
[33550] | 116 | aliexpress.com
|
---|
| 117 | alipay.com
|
---|
[33551] | 118 | aljazeera.com
|
---|
| 119 | allaboutcookies.org
|
---|
| 120 | allrecipes.com
|
---|
[33555] | 121 | amazon.ca
|
---|
| 122 | amazon.co.jp
|
---|
| 123 | amazon.co.uk
|
---|
| 124 | amazon.com
|
---|
| 125 | amazon.de
|
---|
| 126 | amazon.es
|
---|
| 127 | amazon.fr
|
---|
| 128 | amazon.in
|
---|
| 129 | ameblo.jp
|
---|
[33551] | 130 | ampproject.org
|
---|
| 131 | android.com
|
---|
| 132 | aol.com
|
---|
| 133 | ap.org
|
---|
| 134 | apache.org
|
---|
| 135 | apachefriends.org
|
---|
[33550] | 136 | apple.com
|
---|
[33551] | 137 | archive.org
|
---|
[33555] | 138 | archives.gov
|
---|
[33551] | 139 | arstechnica.com
|
---|
| 140 | arxiv.org
|
---|
| 141 | asahi.com
|
---|
| 142 | ask.fm
|
---|
| 143 | asus.com
|
---|
| 144 | axs.com
|
---|
[33550] | 145 | babytree.com
|
---|
| 146 | baidu.com
|
---|
[33551] | 147 | bandcamp.com
|
---|
| 148 | bbc.co.uk
|
---|
| 149 | bbc.com
|
---|
[33555] | 150 | behance.net
|
---|
[33551] | 151 | berkeley.edu
|
---|
| 152 | biblegateway.com
|
---|
| 153 | biglobe.ne.jp
|
---|
| 154 | billboard.com
|
---|
[33550] | 155 | bing.com
|
---|
[33551] | 156 | bit.ly
|
---|
[33550] | 157 | bitly.com
|
---|
[33551] | 158 | blackberry.com
|
---|
| 159 | blogger.com
|
---|
[33561] | 160 | blogspot.com,SUBDOMAIN-COPY
|
---|
[33551] | 161 | bloomberg.com
|
---|
| 162 | booking.com
|
---|
[33555] | 163 | boston.com
|
---|
[33551] | 164 | box.com
|
---|
| 165 | britannica.com
|
---|
| 166 | bt.com
|
---|
| 167 | bund.de
|
---|
| 168 | businessinsider.com
|
---|
| 169 | businesswire.com
|
---|
| 170 | buydomains.com
|
---|
| 171 | buzzfeed.com
|
---|
| 172 | ca.gov
|
---|
| 173 | cambridge.org
|
---|
[33555] | 174 | canalblog.com
|
---|
[33551] | 175 | cbc.ca
|
---|
[33555] | 176 | cbslocal.com
|
---|
[33551] | 177 | cbsnews.com
|
---|
| 178 | cdc.gov
|
---|
| 179 | change.org
|
---|
| 180 | channel4.com
|
---|
| 181 | chicagotribune.com
|
---|
[33555] | 182 | chinadaily.com.cn
|
---|
[33551] | 183 | cisco.com
|
---|
| 184 | clickbank.net
|
---|
| 185 | cloudflare.com
|
---|
[33555] | 186 | cmu.edu
|
---|
[33551] | 187 | cnbc.com
|
---|
| 188 | cnet.com
|
---|
| 189 | cnn.com
|
---|
| 190 | cocolog-nifty.com
|
---|
| 191 | columbia.edu
|
---|
[33555] | 192 | connect.over-blog.com
|
---|
[33551] | 193 | cornell.edu
|
---|
| 194 | corriere.it
|
---|
| 195 | cpanel.com
|
---|
| 196 | cpanel.net
|
---|
| 197 | creativecommons.org
|
---|
[33550] | 198 | csdn.net
|
---|
[33551] | 199 | csmonitor.com
|
---|
| 200 | dailymail.co.uk
|
---|
| 201 | dailymotion.com
|
---|
| 202 | dan.com
|
---|
| 203 | daum.net
|
---|
[33555] | 204 | debian.org
|
---|
[33551] | 205 | dell.com
|
---|
| 206 | depositfiles.com
|
---|
| 207 | detik.com
|
---|
| 208 | digg.com
|
---|
[33555] | 209 | discovery.com
|
---|
[33551] | 210 | disney.com
|
---|
[33555] | 211 | disney.go.com
|
---|
[33551] | 212 | disqus.com
|
---|
| 213 | doubleclick.net
|
---|
| 214 | dreniq.com
|
---|
| 215 | dribbble.com
|
---|
[33561] | 216 | dropbox.com,SINGLEPAGE
|
---|
[33551] | 217 | dropboxusercontent.com
|
---|
| 218 | dw.com
|
---|
| 219 | e-recht24.de
|
---|
| 220 | ea.com
|
---|
| 221 | ebay.co.uk
|
---|
[33550] | 222 | ebay.com
|
---|
[33551] | 223 | economist.com
|
---|
| 224 | eff.org
|
---|
| 225 | ehow.com
|
---|
| 226 | elmundo.es
|
---|
| 227 | elpais.com
|
---|
| 228 | engadget.com
|
---|
| 229 | entrepreneur.com
|
---|
| 230 | eonline.com
|
---|
[33550] | 231 | espn.com
|
---|
[33551] | 232 | espn.go.com
|
---|
| 233 | etsy.com
|
---|
| 234 | europa.eu
|
---|
| 235 | eventbrite.com
|
---|
| 236 | example.com
|
---|
| 237 | excite.co.jp
|
---|
| 238 | express.co.uk
|
---|
[33550] | 239 | facebook.com
|
---|
[33551] | 240 | fandom.com
|
---|
| 241 | fastcompany.com
|
---|
| 242 | fb.com
|
---|
| 243 | fb.me
|
---|
| 244 | fda.gov
|
---|
| 245 | fedoraproject.org
|
---|
| 246 | feedburner.com
|
---|
| 247 | fifa.com
|
---|
| 248 | files.wordpress.com
|
---|
| 249 | flickr.com
|
---|
| 250 | forbes.com
|
---|
| 251 | fortune.com
|
---|
| 252 | foursquare.com
|
---|
| 253 | foxnews.com
|
---|
| 254 | ft.com
|
---|
| 255 | ftc.gov
|
---|
| 256 | gen.xyz
|
---|
| 257 | geocities.jp
|
---|
| 258 | gesetze-im-internet.de
|
---|
| 259 | ggpht.com
|
---|
| 260 | github.com
|
---|
| 261 | gizmodo.com
|
---|
| 262 | globo.com
|
---|
| 263 | gmail.com
|
---|
| 264 | gnu.org
|
---|
| 265 | godaddy.com
|
---|
| 266 | gofundme.com
|
---|
| 267 | goo.gl
|
---|
| 268 | goo.ne.jp
|
---|
| 269 | goodreads.com
|
---|
[33555] | 270 | google.ca
|
---|
| 271 | google.co.id
|
---|
| 272 | google.co.in
|
---|
| 273 | google.co.jp
|
---|
| 274 | google.co.uk
|
---|
| 275 | google.com
|
---|
| 276 | google.com.br
|
---|
| 277 | google.com.hk
|
---|
| 278 | google.com.tr
|
---|
| 279 | google.de
|
---|
| 280 | google.es
|
---|
| 281 | google.fr
|
---|
| 282 | google.it
|
---|
| 283 | google.nl
|
---|
| 284 | google.pl
|
---|
| 285 | google.ru
|
---|
| 286 | googleapis.com
|
---|
[33551] | 287 | googleblog.com
|
---|
| 288 | googleusercontent.com
|
---|
| 289 | gooyaabitemplates.com
|
---|
| 290 | gov.uk
|
---|
| 291 | gravatar.com
|
---|
| 292 | greenpeace.org
|
---|
| 293 | gstatic.com
|
---|
| 294 | guardian.co.uk
|
---|
| 295 | harvard.edu
|
---|
| 296 | hatena.ne.jp
|
---|
| 297 | histats.com
|
---|
| 298 | hm.com
|
---|
| 299 | hollywoodreporter.com
|
---|
| 300 | home.pl
|
---|
| 301 | house.gov
|
---|
| 302 | howstuffworks.com
|
---|
| 303 | hp.com
|
---|
| 304 | huffingtonpost.com
|
---|
| 305 | huffpost.com
|
---|
| 306 | hugedomains.com
|
---|
| 307 | ibm.com
|
---|
| 308 | ibtimes.com
|
---|
| 309 | icann.org
|
---|
| 310 | ieee.org
|
---|
| 311 | ietf.org
|
---|
| 312 | ig.com.br
|
---|
| 313 | ign.com
|
---|
| 314 | ikea.com
|
---|
| 315 | imageshack.us
|
---|
| 316 | imdb.com
|
---|
| 317 | imgur.com
|
---|
| 318 | inc.com
|
---|
| 319 | independent.co.uk
|
---|
| 320 | indiatimes.com
|
---|
| 321 | indiegogo.com
|
---|
[33550] | 322 | instagram.com
|
---|
[33555] | 323 | instructables.com
|
---|
[33551] | 324 | intel.com
|
---|
[33555] | 325 | interia.pl
|
---|
[33551] | 326 | issuu.com
|
---|
| 327 | istockphoto.com
|
---|
| 328 | iubenda.com
|
---|
[33550] | 329 | jd.com
|
---|
[33551] | 330 | joomla.org
|
---|
| 331 | jquery.com
|
---|
| 332 | jstor.org
|
---|
| 333 | kickstarter.com
|
---|
| 334 | kinja.com
|
---|
| 335 | last.fm
|
---|
| 336 | latimes.com
|
---|
| 337 | lefigaro.fr
|
---|
| 338 | lemonde.fr
|
---|
| 339 | line.me
|
---|
| 340 | linkedin.com
|
---|
| 341 | list-manage.com
|
---|
[33550] | 342 | live.com
|
---|
[33551] | 343 | livejournal.com
|
---|
| 344 | livescience.com
|
---|
| 345 | loc.gov
|
---|
[33555] | 346 | lonelyplanet.com
|
---|
[33551] | 347 | lycos.com
|
---|
[33561] | 348 | m.wikipedia.org,mi.m.wikipedia.org
|
---|
[33551] | 349 | mail.ru
|
---|
| 350 | marketwatch.com
|
---|
| 351 | marriott.com
|
---|
| 352 | mashable.com
|
---|
| 353 | mediafire.com
|
---|
| 354 | medium.com
|
---|
| 355 | mega.nz
|
---|
[33555] | 356 | megaupload.com
|
---|
[33551] | 357 | mercurynews.com
|
---|
| 358 | merriam-webster.com
|
---|
| 359 | metro.co.uk
|
---|
[33561] | 360 | microsoft.com,microsoft.com/mi-nz/
|
---|
[33550] | 361 | microsoftonline.com
|
---|
[33551] | 362 | mirror.co.uk
|
---|
| 363 | mit.edu
|
---|
| 364 | mixcloud.com
|
---|
| 365 | mlb.com
|
---|
| 366 | mozilla.com
|
---|
| 367 | mozilla.org
|
---|
[33550] | 368 | msn.com
|
---|
[33551] | 369 | myspace.com
|
---|
| 370 | mysql.com
|
---|
| 371 | namecheap.com
|
---|
| 372 | narod.ru
|
---|
| 373 | nasa.gov
|
---|
| 374 | nationalgeographic.com
|
---|
| 375 | nature.com
|
---|
[33550] | 376 | naver.com
|
---|
[33551] | 377 | naver.jp
|
---|
[33555] | 378 | nba.com
|
---|
[33551] | 379 | nbcnews.com
|
---|
| 380 | ndtv.com
|
---|
[33550] | 381 | netflix.com
|
---|
[33551] | 382 | netsons.com
|
---|
| 383 | netvibes.com
|
---|
| 384 | networkadvertising.org
|
---|
| 385 | news.com.au
|
---|
| 386 | newscientist.com
|
---|
| 387 | newsweek.com
|
---|
[33555] | 388 | newyorker.com
|
---|
[33551] | 389 | nginx.com
|
---|
| 390 | nginx.org
|
---|
| 391 | nhk.or.jp
|
---|
| 392 | nicovideo.jp
|
---|
| 393 | nifty.com
|
---|
| 394 | nih.gov
|
---|
| 395 | nikkei.com
|
---|
| 396 | noaa.gov
|
---|
| 397 | nokia.com
|
---|
| 398 | npr.org
|
---|
| 399 | nvidia.com
|
---|
| 400 | nydailynews.com
|
---|
| 401 | nypost.com
|
---|
| 402 | nytimes.com
|
---|
| 403 | nyu.edu
|
---|
| 404 | odnoklassniki.ru
|
---|
[33550] | 405 | office.com
|
---|
[33555] | 406 | offset.com
|
---|
[33550] | 407 | ok.ru
|
---|
| 408 | okezone.com
|
---|
[33551] | 409 | opera.com
|
---|
| 410 | oracle.com
|
---|
| 411 | orange.fr
|
---|
| 412 | oreilly.com
|
---|
| 413 | oup.com
|
---|
| 414 | over-blog.com
|
---|
| 415 | ovh.co.uk
|
---|
| 416 | ovh.com
|
---|
| 417 | ovh.net
|
---|
| 418 | ox.ac.uk
|
---|
| 419 | parallels.com
|
---|
| 420 | pastebin.com
|
---|
[33550] | 421 | paypal.com
|
---|
[33551] | 422 | pbs.org
|
---|
[33555] | 423 | pcmag.com
|
---|
[33551] | 424 | people.com
|
---|
| 425 | photobucket.com
|
---|
| 426 | php.net
|
---|
[33561] | 427 | pinterest.com,SINGLEPAGE
|
---|
[33551] | 428 | pixabay.com
|
---|
| 429 | playstation.com
|
---|
| 430 | plesk.com
|
---|
[33555] | 431 | plos.org
|
---|
[33551] | 432 | politico.com
|
---|
[33555] | 433 | prestashop.com
|
---|
[33551] | 434 | prezi.com
|
---|
| 435 | princeton.edu
|
---|
| 436 | privacyshield.gov
|
---|
| 437 | prnewswire.com
|
---|
| 438 | psychologytoday.com
|
---|
[33550] | 439 | qq.com
|
---|
[33551] | 440 | quantcast.com
|
---|
[33550] | 441 | quora.com
|
---|
[33551] | 442 | rakuten.co.jp
|
---|
| 443 | rambler.ru
|
---|
| 444 | rapidshare.com
|
---|
[33550] | 445 | reddit.com
|
---|
[33551] | 446 | repubblica.it
|
---|
[33555] | 447 | researchgate.net
|
---|
[33551] | 448 | reuters.com
|
---|
| 449 | ria.ru
|
---|
| 450 | rottentomatoes.com
|
---|
| 451 | rt.com
|
---|
| 452 | rtve.es
|
---|
[33555] | 453 | sakura.ne.jp
|
---|
[33551] | 454 | samsung.com
|
---|
| 455 | sapo.pt
|
---|
[33555] | 456 | scholastic.com
|
---|
[33551] | 457 | sciencedaily.com
|
---|
| 458 | sciencedirect.com
|
---|
| 459 | sciencemag.org
|
---|
| 460 | scientificamerican.com
|
---|
| 461 | scribd.com
|
---|
| 462 | seattletimes.com
|
---|
| 463 | secureserver.net
|
---|
| 464 | sedo.com
|
---|
| 465 | seesaa.net
|
---|
| 466 | sendspace.com
|
---|
| 467 | sfgate.com
|
---|
| 468 | shopify.com
|
---|
| 469 | shutterstock.com
|
---|
| 470 | siemens.com
|
---|
| 471 | sina.com.cn
|
---|
| 472 | sky.com
|
---|
| 473 | skype.com
|
---|
| 474 | skyrock.com
|
---|
[33555] | 475 | slate.com
|
---|
[33551] | 476 | slideshare.net
|
---|
[33550] | 477 | sm.cn
|
---|
[33551] | 478 | smh.com.au
|
---|
| 479 | so-net.ne.jp
|
---|
| 480 | softonic.com
|
---|
[33550] | 481 | sogou.com
|
---|
| 482 | sohu.com
|
---|
[33551] | 483 | soratemplates.com
|
---|
[33550] | 484 | soso.com
|
---|
[33551] | 485 | soundcloud.com
|
---|
| 486 | spiegel.de
|
---|
| 487 | spotify.com
|
---|
| 488 | springer.com
|
---|
| 489 | sputniknews.com
|
---|
[33555] | 490 | ssl-images-amazon.com
|
---|
[33550] | 491 | stackoverflow.com
|
---|
[33555] | 492 | standard.co.uk
|
---|
[33551] | 493 | stanford.edu
|
---|
| 494 | state.gov
|
---|
| 495 | steamcommunity.com
|
---|
| 496 | steampowered.com
|
---|
| 497 | storage.canalblog.com
|
---|
[33555] | 498 | storage.googleapis.com
|
---|
[33551] | 499 | stores.jp
|
---|
| 500 | storify.com
|
---|
[33561] | 501 | stuff.co.nz,SINGLEPAGE
|
---|
[33551] | 502 | surveymonkey.com
|
---|
| 503 | symantec.com
|
---|
| 504 | t-online.de
|
---|
[33550] | 505 | t.co
|
---|
[33551] | 506 | t.me
|
---|
| 507 | tabelog.com
|
---|
[33550] | 508 | taobao.com
|
---|
[33551] | 509 | target.com
|
---|
[33555] | 510 | teamviewer.com
|
---|
[33551] | 511 | techcrunch.com
|
---|
| 512 | ted.com
|
---|
| 513 | telegram.me
|
---|
| 514 | telegraph.co.uk
|
---|
| 515 | terra.com.br
|
---|
[33555] | 516 | theatlantic.com
|
---|
| 517 | thefreedictionary.com
|
---|
[33551] | 518 | theglobeandmail.com
|
---|
| 519 | theguardian.com
|
---|
| 520 | themeforest.net
|
---|
[33555] | 521 | thenextweb.com
|
---|
[33551] | 522 | thestar.com
|
---|
| 523 | thesun.co.uk
|
---|
| 524 | thetimes.co.uk
|
---|
| 525 | theverge.com
|
---|
| 526 | thoughtco.com
|
---|
[33550] | 527 | tianya.cn
|
---|
[33551] | 528 | time.com
|
---|
| 529 | tinyurl.com
|
---|
[33550] | 530 | tmall.com
|
---|
[33551] | 531 | tmz.com
|
---|
[33550] | 532 | tribunnews.com
|
---|
[33551] | 533 | tripadvisor.com
|
---|
| 534 | trustpilot.com
|
---|
[33550] | 535 | twitch.tv
|
---|
| 536 | twitter.com
|
---|
[33551] | 537 | ucoz.ru
|
---|
| 538 | uiuc.edu
|
---|
| 539 | umich.edu
|
---|
| 540 | un.org
|
---|
| 541 | undeveloped.com
|
---|
| 542 | unesco.org
|
---|
| 543 | uol.com.br
|
---|
| 544 | urbandictionary.com
|
---|
[33555] | 545 | usa.gov
|
---|
[33551] | 546 | usatoday.com
|
---|
| 547 | usgs.gov
|
---|
| 548 | usnews.com
|
---|
| 549 | uspto.gov
|
---|
| 550 | ustream.tv
|
---|
| 551 | utexas.edu
|
---|
| 552 | variety.com
|
---|
| 553 | venturebeat.com
|
---|
| 554 | vice.com
|
---|
| 555 | viglink.com
|
---|
| 556 | vimeo.com
|
---|
[33550] | 557 | vk.com
|
---|
[33551] | 558 | vkontakte.ru
|
---|
| 559 | vox.com
|
---|
| 560 | w3.org
|
---|
[33550] | 561 | w3schools.com
|
---|
[33551] | 562 | wa.me
|
---|
[33550] | 563 | walmart.com
|
---|
[33551] | 564 | washington.edu
|
---|
| 565 | washingtonpost.com
|
---|
| 566 | wattpad.com
|
---|
[33555] | 567 | weather.com
|
---|
[33551] | 568 | web.fc2.com
|
---|
| 569 | webmd.com
|
---|
| 570 | weebly.com
|
---|
[33550] | 571 | weibo.com
|
---|
[33551] | 572 | welt.de
|
---|
| 573 | whatsapp.com
|
---|
| 574 | whitehouse.gov
|
---|
| 575 | who.int
|
---|
| 576 | wikia.com
|
---|
| 577 | wikihow.com
|
---|
| 578 | wikimedia.org
|
---|
[33561] | 579 | wikipedia.org,mi.wikipedia.org
|
---|
| 580 | wiktionary.org,mi.wiktionary.org
|
---|
[33551] | 581 | wiley.com
|
---|
| 582 | windowsphone.com
|
---|
| 583 | wired.com
|
---|
| 584 | wix.com
|
---|
[33561] | 585 | wordpress.org,SUBDOMAIN-COPY
|
---|
[33551] | 586 | worldbank.org
|
---|
| 587 | wp.com
|
---|
| 588 | wsj.com
|
---|
| 589 | xbox.com
|
---|
[33550] | 590 | xinhuanet.com
|
---|
[33551] | 591 | yadi.sk
|
---|
[33555] | 592 | yahoo.co.jp
|
---|
[33550] | 593 | yahoo.com
|
---|
[33551] | 594 | yale.edu
|
---|
[33550] | 595 | yandex.ru
|
---|
[33551] | 596 | yelp.com
|
---|
| 597 | youku.com
|
---|
| 598 | youronlinechoices.com
|
---|
| 599 | youtu.be
|
---|
[33550] | 600 | youtube.com
|
---|
[33551] | 601 | ytimg.com
|
---|
| 602 | zdnet.com
|
---|
[33555] | 603 | zend.com
|
---|
[33551] | 604 | zendesk.com
|
---|
[33555] | 605 | zippyshare.com
|
---|