source: other-projects/maori-lang-detection/mongodb-data/counts_tentativeNonProductSites1.json@ 33820

Last change on this file since 33820 was 33813, checked in by ak19, 4 years ago

With the bugfix from yesterday and the inclusion of http(s):mi.* type URLs in setting the Websites mongodb collection's urlContainsLangCodeInPath property, and updated/improved mongodb queries and their results I have now regenerated the latest geojson json data and maps.

File size: 19.3 KB
Line 
1/*
2
3Websites that have some MRI detected AND that are either located in NZ, have an NZ TLD,
4or (if they're from overseas) don't contain /mi or mi.* in the URL path.
5We also include Australia in order to get the valid "kiwiproperty.com" website,
6which would otherwise be the sole exception, included in the result list.
7
8
9db.getCollection('Websites').find({$and: [
10 {numPagesContainingMRI: {$gt: 0}},
11 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
12 ]}).count()
13
14397
15
16Aggregate results by a count of country codes. Also have a domain listing in the output.
17
18However, we want to group websites that have an .nz TLD together with websites that originate in NZ.
19So we also want the converse: to exclude websites with an .nz TLD from the counts of any
20originating country codes outside NZ.
21
22db.getCollection('Websites').find({$and: [
23 {geoLocationCountryCode: {$ne: "NZ"}},
24 {domain: {$not: /\.nz/}},
25 {numPagesContainingMRI: {$gt: 0}},
26 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
27 ]}).count()
28
29221 websites
30
31db.getCollection('Websites').find({$and: [
32 {numPagesContainingMRI: {$gt: 0}},
33 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
34 ]}).count()
35
36176
37
38(221 + 176 = 397, which matches the total count above.)
39
40
41Counts by country code excluding NZ related sites
42db.Websites.aggregate([
43 {
44 $match: {
45 $and: [
46 {geoLocationCountryCode: {$ne: "NZ"}},
47 {domain: {$not: /\.nz/}},
48 {numPagesContainingMRI: {$gt: 0}},
49 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
50 ]
51 }
52 },
53 { $unwind: "$geoLocationCountryCode" },
54 {
55 $group: {
56 _id: {$toLower: '$geoLocationCountryCode'},
57 count: { $sum: 1 },
58 domain: { $addToSet: '$domain' }
59 }
60 },
61 { $sort : { count : -1} }
62]);
63
64Count of NZ related sites - output put under a hardcoded _id of "nz"
65and once again requesting a domain listing in output:
66
67db.Websites.aggregate([
68 {
69 $match: {
70 $and: [
71 {numPagesContainingMRI: {$gt: 0}},
72 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
73 ]
74 }
75 },
76 { $unwind: "$geoLocationCountryCode" },
77 {
78 $group: {
79 _id: "nz",
80 count: { $sum: 1 },
81 domain: { $addToSet: '$domain' }
82 }
83 },
84 { $sort : { count : -1} }
85]);
86
87*/
88
89/* 0 */
90{
91 "_id" : "us",
92 "count" : 117.0,
93 "domain" : [
94 "http://shangrilapress.net",
95 "https://www.terakau.org",
96 "http://dannykahei.tripod.com",
97 "https://in.pinterest.com",
98 "http://takethatvacation.com",
99 "http://malecek.com",
100 "http://word-dialect.blogspot.com",
101 "https://www.blue-frontiers.com",
102 "https://static-promote.weebly.com",
103 "http://www.thesalmons.org",
104 "http://ngarangatahi.tripod.com",
105 "http://tkrow.tripod.com",
106 "http://niken8media.logdown.com",
107 "https://www.vaihaunui.net",
108 "https://www.podrozeady.com",
109 "https://www.nccri.ie",
110 "http://georgegi.tripod.com",
111 "http://www.lunar-occultations.com",
112 "http://frontrowphotos.com",
113 "http://linkvip.top",
114 "http://rangiwewehi.com",
115 "http://www.pressreader.com",
116 "http://anglicanhistory.org",
117 "http://www.unicode.org",
118 "https://wol.jw.org",
119 "http://pumanawawhangara.blogspot.com",
120 "http://hannas-reiseblog.blogspot.com",
121 "http://seapixonline.com",
122 "http://ww25.milfsplease.com",
123 "http://www.wikitree.com",
124 "http://ritusehji.blogspot.com",
125 "https://www.pinterest.it",
126 "http://naturalfatburner.net",
127 "http://kiaorahola.blogspot.com",
128 "http://www.hudl.com",
129 "http://shuttersportnelson.photoshelter.com",
130 "https://www.pinterest.ca",
131 "http://precious-testimonies.com",
132 "http://www.muhammad.com",
133 "http://www.gotquestions.org",
134 "https://www.pinterest.co.uk",
135 "https://biblehub.com",
136 "http://tuhua2010.blogspot.com",
137 "http://www.precious-testimonies.com",
138 "http://www.huapala.org",
139 "https://nl.pinterest.com",
140 "http://www.whoisthatr.com",
141 "https://www.oemsec.com",
142 "https://www.dbnames.net",
143 "https://www.myadsclassified.com",
144 "http://mikebonnice.com",
145 "http://fhr.kiwicelts.com",
146 "https://www.webwiki.com",
147 "https://www.pinterest.jp",
148 "https://kjohnsonnz.blogspot.com",
149 "http://svenskadress.net",
150 "http://www.godrules.net",
151 "https://www.pinterest.fr",
152 "http://rhymebrain.com",
153 "http://www.frogsonline.com",
154 "http://www.v3whois.com",
155 "http://piripi.blogspot.com",
156 "http://www.twttoa.com",
157 "http://wikiedit.org",
158 "https://livestream.com",
159 "http://burkekm001.tripod.com",
160 "https://maorinews.com",
161 "http://www.geni.com",
162 "http://www.waimate.com",
163 "http://m.biblepub.com",
164 "http://wowwars.net",
165 "https://www.natekore2018.com",
166 "http://tatai09.blogspot.com",
167 "https://ebible.org",
168 "http://capsuraotearoa.blogspot.com",
169 "http://bahaiprayers.net",
170 "https://www.breaker.audio",
171 "https://www.pipirikiapapatuanuku.org",
172 "http://www.the-naked.com",
173 "https://phet.colorado.edu",
174 "http://manateina.blogspot.com",
175 "http://tkkpipipaopao.blogspot.com",
176 "https://www.hidroponia.org.mx",
177 "http://mrshamiltonskoolkidz.blogspot.com",
178 "http://aclhokiangarocks.blogspot.com",
179 "http://www.eyecontactsite.com",
180 "http://www.hiroa.pf",
181 "http://www.forensicfashion.com",
182 "http://www.code-postal.com",
183 "http://lianzaconference2012.blogspot.com",
184 "http://mahoraroom8.blogspot.com",
185 "http://korora.econ.yale.edu",
186 "https://docs.google.com",
187 "https://www.indexmundi.com",
188 "https://www.seapixonline.com",
189 "https://www.bible.com",
190 "https://www.knowatom.com",
191 "https://chromium.googlesource.com",
192 "http://www.krassotkin.ru",
193 "http://www.roadsmile.com",
194 "https://www.code-postal.com",
195 "http://blogdepasopor.blogspot.com",
196 "http://eartheum.com",
197 "http://www.steve-wheeler.co.uk",
198 "http://www.mkiwi.com",
199 "http://maaori.com",
200 "https://www.kaifineart.com",
201 "https://png.bible",
202 "https://www.poehalisnami.ua",
203 "http://atopeconlostopes.blogspot.com",
204 "http://www.whoisentry.com",
205 "http://loquevendra318.com",
206 "https://za.pinterest.com",
207 "http://www.namesdir.com",
208 "https://drive.google.com",
209 "http://worldradiomap.com"
210 ]
211}
212
213/* 1 */
214{
215 "_id" : "nz",
216 "count" : 176.0,
217 "domain" : [
218 "http://tmoa.tki.org.nz",
219 "http://www.rotoruanz.com",
220 "https://admin.teara.govt.nz",
221 "http://www.tekura.school.nz",
222 "http://www.tetaurawhiri.govt.nz",
223 "https://www.whanau-tahi.school.nz",
224 "http://auturoa.nz",
225 "http://www.ngamanawainc.co.nz",
226 "http://southerntribes.co.nz",
227 "https://player.vimeo.com",
228 "https://www.components-mart.nz",
229 "http://www.cs.waikato.ac.nz",
230 "https://www.terakipaewhenua.school.nz",
231 "http://oilcrash.com",
232 "https://manawatuheritage.pncc.govt.nz",
233 "http://maori.tki.org.nz",
234 "http://kaupare.co.nz",
235 "http://ngatiporoukiponeke.org.nz",
236 "http://cms.sunsmartschools.co.nz",
237 "http://philipbeadle.co.nz",
238 "http://waitarahistory.org.nz",
239 "http://hangaraumatihiko.tki.org.nz",
240 "https://sexualviolence.victimsinfo.govt.nz",
241 "http://www.kura-porirua.school.nz",
242 "http://www.rakaumanga.school.nz",
243 "http://www.huri-translations.pf",
244 "https://kotahimiriona.co.nz",
245 "http://ngarauhuia.ngatiapakiterato.iwi.nz",
246 "http://videos.e-agent.nz",
247 "http://kurakokiri.maori.nz",
248 "http://kuraaiwi.maori.nz",
249 "http://www.tewikiotereomaori.co.nz",
250 "http://arataua.nz",
251 "http://www.brettgraham.co.nz",
252 "http://anglicanprayerbook.nz",
253 "https://e-ako-pangarau.nzmaths.co.nz",
254 "https://www.pinterest.nz",
255 "http://www.tasteofplenty.co.nz",
256 "http://www.nzpcn.org.nz",
257 "https://www.puau.school.nz",
258 "https://www.rereahu.maori.nz",
259 "http://blog.teara.govt.nz",
260 "http://www.ruralfind.co.nz",
261 "https://www.korokikahukura.co.nz",
262 "http://givealittle.co.nz",
263 "http://tewikiotereomaori.nz",
264 "http://dev.nzpcn.org.nz",
265 "http://www.firstworldwar.tki.org.nz",
266 "http://rsnz.natlib.govt.nz",
267 "http://biketorqueyamaha.co.nz",
268 "http://conference.tpwt.maori.nz",
269 "http://myfathersworld.net.nz",
270 "http://whatonga.school.nz",
271 "https://teaomaori.news",
272 "https://www.ashtangatauranga.co.nz",
273 "http://www.eventcinemas.co.nz",
274 "http://artizani.co.nz",
275 "https://www.stats.govt.nz",
276 "https://keepourmoneyclean.govt.nz",
277 "http://www.teipukarea.maori.nz",
278 "http://kuraproductions.co.nz",
279 "http://www.otepoti.school.nz",
280 "https://register.tpota.org.nz",
281 "http://www.tewhanake.maori.nz",
282 "https://office.e-agent.nz",
283 "http://community.nzdl.org",
284 "https://www.blushandbrows.nz",
285 "https://cdn.tehiku.nz",
286 "http://www.oag.govt.nz",
287 "http://tmmkkm.school.nz",
288 "http://www.tetaumuturunanga.iwi.nz",
289 "http://teaohou.natlib.govt.nz",
290 "http://www.kmk.maori.nz",
291 "https://www.maoritelevision.com",
292 "https://sooty.nz",
293 "http://hana.co.nz",
294 "http://waiata.maori.nz",
295 "http://www.pakanae.maori.nz",
296 "http://www.w3vietnam.org.nz",
297 "http://www.zoomin.co.nz",
298 "http://www.hrc.co.nz",
299 "https://www.wingspan.co.nz",
300 "https://www.cruisetourstauranga.co.nz",
301 "http://kurataiao.tki.org.nz",
302 "http://punareo.co.nz",
303 "http://www.finlaysonpark.school.nz",
304 "http://www.kurakokiri.maori.nz",
305 "https://rapuatearatika.education.govt.nz",
306 "https://www.lcds-display.nz",
307 "http://www.livingheritage.org.nz",
308 "http://www.heartland.co.nz",
309 "http://www.biketorqueyamaha.co.nz",
310 "https://2019.nethui.nz",
311 "http://archerpix.com",
312 "http://www.tkkmmokopuna.school.nz",
313 "http://www.wcl.govt.nz",
314 "https://tiritiowaitangi.govt.nz",
315 "http://rakaumanga.school.nz",
316 "http://holyspirit.nz",
317 "http://crimson.co.nz",
318 "https://www.ngamanawainc.co.nz",
319 "http://rexedra.gen.nz",
320 "http://www.kupengahao.co.nz",
321 "https://www.tematawai.maori.nz",
322 "http://tiritiowaitangi.govt.nz",
323 "http://rurued.school.nz",
324 "http://w3vietnam.org.nz",
325 "https://www.rotorua-rafting.co.nz",
326 "https://www.e-agent.nz",
327 "http://reoora.co.nz",
328 "http://archive.stats.govt.nz",
329 "https://www.dnc.org.nz",
330 "https://liveresults.co.nz",
331 "https://www.taitokerautrust.org.nz",
332 "https://www.infinite-electronic.nz",
333 "https://kaiiwicamp.nz",
334 "http://www.tereowrap.nz",
335 "https://m.wairarapatv.co.nz",
336 "http://ngatiwhakaue.iwi.nz",
337 "http://www.kkmmaungarongo.co.nz",
338 "https://rehuamarae.co.nz",
339 "http://www.tmoa.tki.org.nz",
340 "http://www.gans.co.nz",
341 "http://www.topomap.co.nz",
342 "http://www.electionresults.govt.nz",
343 "http://archive.electionresults.govt.nz",
344 "http://satellites.co.nz",
345 "https://haereheikaiako.co.nz",
346 "http://www.twtop.school.nz",
347 "http://www.waiata.maori.nz",
348 "http://www.temarareo.org",
349 "http://tetaurawhiri.govt.nz",
350 "http://www.28maoribattalion.org.nz",
351 "https://ttw1.cwp.govt.nz",
352 "http://www.methodist.org.nz",
353 "http://avonside.net",
354 "https://www.takitimu.ac.nz",
355 "https://www.terito.school.nz",
356 "https://www.electionresults.org.nz",
357 "http://firstworldwar.tki.org.nz",
358 "http://animations.tewhanake.maori.nz",
359 "https://hepatakakupu.nz",
360 "https://www.zenbu.co.nz",
361 "https://www.sporty.co.nz",
362 "https://www.tasteofplenty.co.nz",
363 "http://otorohanga.directorybusiness.co.nz",
364 "https://www.puhaandpakeha.co.nz",
365 "http://kete.wcl.govt.nz",
366 "https://interactives.stuff.co.nz",
367 "http://maori.livingheritage.org.nz",
368 "https://www.hapuhauora.health.nz",
369 "http://kaiiwicamp.nz",
370 "http://talkingtothecan.com",
371 "http://www.tuwharetoa.iwi.nz",
372 "http://nzpostcard.co.nz",
373 "https://paekupu.co.nz",
374 "http://www.runanga.co.nz",
375 "https://curriculumtool.education.govt.nz",
376 "http://www.matarikifestival.org.nz",
377 "http://www.jeremybaker.nz",
378 "http://ngatipahauwera.co.nz",
379 "http://pukapuka.nz",
380 "http://www.writersfestival.co.nz",
381 "http://temahurehure.maori.nz",
382 "http://pukoro.co.nz",
383 "http://tehauora.org.nz",
384 "http://pukekohe.directorybusiness.co.nz",
385 "http://kmpmusic.co.nz",
386 "http://www.maoriinvestments.co.nz",
387 "https://www.komako.org.nz",
388 "https://www.tuiatematangi.ac.nz"
389 ]
390}
391
392/* 2 */
393{
394 "_id" : "de",
395 "count" : 19.0,
396 "domain" : [
397 "http://www.cartogiraffe.com",
398 "http://etymologie.info",
399 "http://arts.mythologica.fr",
400 "http://svenkirsten.com",
401 "http://weltderberge.de",
402 "https://www.you-fly.com",
403 "http://klaaskoehne.de",
404 "http://www.nierstrasz.org",
405 "https://www.tvteile.de",
406 "http://vulkane.ch",
407 "http://etoile-de-lune.net",
408 "http://www.stephe.de",
409 "http://insecta.pro",
410 "http://m.distanta.1km.net",
411 "https://ersatzteile-fachversand.de",
412 "https://laskar02cinta.page.tl",
413 "http://www.behlig.de",
414 "https://www.cartogiraffe.com",
415 "http://www.udhr.de"
416 ]
417}
418
419/* 3 */
420{
421 "_id" : "fr",
422 "count" : 16.0,
423 "domain" : [
424 "http://rapanui.fr",
425 "http://splaf.free.fr",
426 "https://www.lexilogos.com",
427 "http://mahajana.net",
428 "http://www.gif.ovh",
429 "http://baladeornithologique.com",
430 "http://www.gaudry.be",
431 "http://kihikihi.fr",
432 "http://www.blueheavenisland.com",
433 "http://www.gototahiti.net",
434 "http://www.maraamusurfskirace.com",
435 "http://www.rongo-rongo.com",
436 "http://chantsdeluttes.free.fr",
437 "http://pt.city-usa.net",
438 "https://www.manualscat.com",
439 "http://blueheavenisland.com"
440 ]
441}
442
443/* 4 */
444{
445 "_id" : "nl",
446 "count" : 16.0,
447 "domain" : [
448 "https://www.arrowhead.eu",
449 "http://tonhut.nl",
450 "http://nielsonboutique.co.uk",
451 "http://longhornlaw.net",
452 "http://tetsubo.org",
453 "https://arrowheadproject.azurewebsites.net",
454 "http://hidsonphoto.com",
455 "http://www.gouvernante.info",
456 "http://gouvernante.info",
457 "http://diverosa.com",
458 "https://arrowhead.eu",
459 "http://www.nonlinear.demon.nl",
460 "http://www.encyclo.co.uk",
461 "https://www.henrifloor.nl",
462 "http://skimap.info",
463 "http://wearehomework.com"
464 ]
465}
466
467/* 5 */
468{
469 "_id" : "dk",
470 "count" : 8.0,
471 "domain" : [
472 "http://akona.ngapuhitelevision.com",
473 "http://waiatarangatiratanga.ngapuhitelevision.com",
474 "http://jazz.ngapuhitelevision.com",
475 "http://ngapuhitelevision.com",
476 "http://ngapuhiradio.com",
477 "http://www.rennertweb.de",
478 "http://powhiri.ngapuhitelevision.com",
479 "http://komisch.ngapuhitelevision.com"
480 ]
481}
482
483/* 6 */
484{
485 "_id" : "ca",
486 "count" : 7.0,
487 "domain" : [
488 "http://bcmarina.com",
489 "http://www.myrasplace.net",
490 "http://00.gs",
491 "http://aguadilla.airport-authority.com",
492 "http://bckayak.com",
493 "https://articles.imperialtometric.com",
494 "http://daandehn.com"
495 ]
496}
497
498/* 7 */
499{
500 "_id" : "au",
501 "count" : 5.0,
502 "domain" : [
503 "http://theunderwaterworld.com",
504 "https://www.kiwiproperty.com",
505 "http://fionajack.net",
506 "https://infogram.com",
507 "https://koreromaori.com"
508 ]
509}
510
511/* 8 */
512{
513 "_id" : "gb",
514 "count" : 4.0,
515 "domain" : [
516 "https://omniatlas.com",
517 "http://www.wordsearchfun.com",
518 "http://www.woolrych.org",
519 "http://mikestephens.co.uk"
520 ]
521}
522
523/* 9 */
524{
525 "_id" : "es",
526 "count" : 4.0,
527 "domain" : [
528 "https://www.uv.es",
529 "https://www.reclamaciondevuelos.com",
530 "http://www.info-hoteles.com",
531 "http://www.cruceros-princess.mx"
532 ]
533}
534
535/* 10 */
536{
537 "_id" : "cz",
538 "count" : 4.0,
539 "domain" : [
540 "http://www.henryklahola.nazory.cz",
541 "https://www.fipojobs.com",
542 "http://about.ilikeyou.com",
543 "http://henryklahola.nazory.cz"
544 ]
545}
546
547/* 11 */
548{
549 "_id" : "it",
550 "count" : 3.0,
551 "domain" : [
552 "http://oipaz.net",
553 "http://www.marcosanti.it",
554 "http://www.pegasoesmicamion.com"
555 ]
556}
557
558/* 12 */
559{
560 "_id" : "at",
561 "count" : 3.0,
562 "domain" : [
563 "http://www.tmtmm.net",
564 "http://www.petit-prince.at",
565 "http://petit-prince.at"
566 ]
567}
568
569/* 13 */
570{
571 "_id" : "ro",
572 "count" : 2.0,
573 "domain" : [
574 "http://parohiauceadesus.ro",
575 "http://www.parohiauceadesus.ro"
576 ]
577}
578
579/* 14 */
580{
581 "_id" : "ch",
582 "count" : 2.0,
583 "domain" : [
584 "https://nicoledidi.ch",
585 "https://photos.axelebert.org"
586 ]
587}
588
589/* 15 */
590{
591 "_id" : "il",
592 "count" : 2.0,
593 "domain" : [
594 "https://www.hitiaotera.com",
595 "http://www.daat.ac.il"
596 ]
597}
598
599/* 16 */
600{
601 "_id" : "ru",
602 "count" : 1.0,
603 "domain" : [
604 "https://www.gismeteo.lv"
605 ]
606}
607
608/* 17 */
609{
610 "_id" : "bg",
611 "count" : 1.0,
612 "domain" : [
613 "http://anitra.net"
614 ]
615}
616
617/* 18 */
618{
619 "_id" : "mx",
620 "count" : 1.0,
621 "domain" : [
622 "http://www.gelbukh.com"
623 ]
624}
625
626/* 19 */
627{
628 "_id" : "unknown",
629 "count" : 1.0,
630 "domain" : [
631 "https://www.viveipcl.com"
632 ]
633}
634
635/* 20 */
636{
637 "_id" : "jp",
638 "count" : 1.0,
639 "domain" : [
640 "http://yutaka.it-n.jp"
641 ]
642}
643
644/* 21 */
645{
646 "_id" : "cn",
647 "count" : 1.0,
648 "domain" : [
649 "http://kiwi2china.com"
650 ]
651}
652
653/* 22 */
654{
655 "_id" : "ie",
656 "count" : 1.0,
657 "domain" : [
658 "https://coggle.it"
659 ]
660}
661
662/* 23 */
663{
664 "_id" : "ir",
665 "count" : 1.0,
666 "domain" : [
667 "https://www.dideo.ir"
668 ]
669}
670
671/* 24 */
672{
673 "_id" : "fi",
674 "count" : 1.0,
675 "domain" : [
676 "http://pertti.com"
677 ]
678}
Note: See TracBrowser for help on using the repository browser.