Changeset 33854 for other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json
- Timestamp:
- 2020-01-21T22:01:07+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json
r33848 r33854 29 29 30 30 "_id","siteCount","numPagesInMRICount","numPagesContainingMRICount","URLs of pages detected as inMRI" 31 "nz","176.0" ,"4360","9641"31 "nz","176.0" containsMRI vs 96 pages inMRI,"4360","9641" in 176 containsMRI pages vs 7968 in isMRI pages 32 32 "us","29.0", 33 33 1+2+0+0+4+166+0+39 +257+2+21+12+25+13+53+0+1+0+1+11 +32+37+4 +0+0+0 = 681, … … 46 46 "ie","1.0","1","3", https://coggle.it/diagram/WSYB0mLA2QABD5BH/t/ko-au-ko-koe 47 47 48 49 50 51 52 -------------- 53 54 https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1 55 https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 56 https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/ 57 58 N (NZ pages where isMRI comes out true) = 4360 59 solving for n, the sample size 60 confidence level = 90% 61 m, margin of error = 5% 62 63 From the "z alpha/2" table, for 90% confidence, we get a z alpha/2 value of 1.6449 (or 1.645). 64 65 Then the sample size, n, we need is = 1.6449^2 * 4360 / ( 1.6449^2 + (4 * 4359) * 0.05^2) = 255 (rounded up) 66 67 68 For N = 681, 69 sample size n is = 1.6449^2 * 681 / ( 1.6449^2 + (4 * 680) * 0.05^2) = 194 (rounded up) 70 71 72 sample size for NZ: 255 (90% confidence with 5% margin of error, including a finite population correction factor) 73 sample size for US: 194 74 48 75 */ 49 76 … … 67 94 68 95 96 97 NZ - sample 255 pages from: 98 /* 99 db.Websites.aggregate([ 100 { 101 $match: { 102 $and: [ 103 {numPagesContainingMRI: {$gt: 0}}, 104 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 105 ] 106 } 107 }, 108 { $unwind: "$geoLocationCountryCode" }, 109 { 110 $group: { 111 _id: "nz", 112 count: { $sum: 1 }, 113 domain: { $addToSet: '$domain' }, 114 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 115 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 116 } 117 }, 118 { $sort : { count : -1} } 119 ]); 120 121 122 OR is this better: 123 124 db.Websites.aggregate([ 125 { 126 
$match: { 127 $and: [ 128 {numPagesInMRI: {$gt: 0}}, 129 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 130 ] 131 } 132 }, 133 { $unwind: "$geoLocationCountryCode" }, 134 { 135 $group: { 136 _id: "nz", 137 count: { $sum: 1 }, 138 domain: { $addToSet: '$domain' }, 139 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 140 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 141 } 142 }, 143 { $sort : { count : -1} } 144 ]); 145 */ 146 147 num NZ sites with > 0 isMRI pages = 96 148 Total numPagesInMRI in NZ sites = 4360 149 Total numPagesContainingMRI in NZ sites = 7968 150 151 Using the results you get a list of domains that matched. 171 nz domains, though it should be 176? -1 152 153 Copy each domain (up to 255 of them) and look for the first 1 or 2 pages (max) that match isMRI: 154 155 1. db.getCollection('Webpages').find({URL:/pukekohe.directorybusiness.co.nz/, isMRI: true}) - check it contains a positive number of pages in MRI and check the first 1-2 pages to make sure they are indeed in MRI. Note down the ratio of MRI finds. e.g. 2/2. 156 157 2. Find those pages that are containsMRI but not isMRI and check whether they do indeed contain sentences in MRI. Note down the ratio for the first 2 pages. 158 db.getCollection('Webpages').find({URL:/maori.livingheritage.org.nz/, isMRI: false, containsMRI: true}) 159 160 161 162 /* 1 */ 163 { 164 "_id" : "nz", 165 "count" : 96.0, 166 "domain" : [ 167 "http://www.teipukarea.maori.nz", 3/3 1/3 168 "http://ngatipahauwera.co.nz", 2/2, 2/2 169 "http://www.oag.govt.nz", 2/2 0/2 170 "https://sexualviolence.victimsinfo.govt.nz", 3/3 0/3 171 "http://tmoa.tki.org.nz", 3/3 3/3 172 "http://www.tewhanake.maori.nz", 3/3 2/3 173 "http://www.matarikifestival.org.nz", 4/4 0/3 174 "http://www.otepoti.school.nz", 3/3 0/4 175 !! 
"https://www.maoritelevision.com", 3/4, 0 [no containsMRI outside isMRI pages] 176 "http://pukapuka.nz", 3/3 1/4 [lorem ipsum used on first 3 pages] 177 "http://community.nzdl.org", 3/3 0/3 [containsMRI has detected Te Taka Keegan as MRI sentence] 178 !! "http://kmpmusic.co.nz", 0-4/4? [but CD listing of some MRI song titles] 0 [no other pages containsMRI] 179 "http://maori.livingheritage.org.nz", 2/2 2/2 180 "http://pukoro.co.nz", 2/2 0/2 181 "https://register.tpota.org.nz", 0/1 [form] 0/2 182 X "https://cdn.tehiku.nz" => DOMAIN: "tehiku.nz", 0/4, 1/3 [but audio content may be in MRI] 183 !! "http://www.runanga.co.nz", 3/3 0 [no containsMRI outside isMRI pages] 184 ! "http://kuraaiwi.maori.nz", 2/4 [navigation only downloaded. But site content checked] 2/3 185 "http://kurataiao.tki.org.nz", 3/3, 1/total 3 186 187 !! "http://satellites.co.nz", 3/3 [kpop], 0 [no containsMRI outside isMRI pages] 188 "http://teaohou.natlib.govt.nz", 4/4, 2/4 189 "http://www.tuwharetoa.iwi.nz", 2/3 0/3 190 X "http://auturoa.nz", 0/4 0/3 [lots of MRI terms among English] - COMMUNITY 191 "https://www.terito.school.nz", 3/3, 0/2 total 192 "https://ttw1.cwp.govt.nz", 3/3 3/3 193 "https://www.whanau-tahi.school.nz", 4/4, 1/2 total 194 "https://e-ako-pangarau.nzmaths.co.nz", 3/3 total, 1/1 total 195 "https://teaomaori.news", 3/3, 0/1 total 196 "http://tetaurawhiri.govt.nz", 3/3 /3/3 [Māori Language Commission site] 197 "https://www.tuiatematangi.ac.nz", 4/4 3/3 198 "http://animations.tewhanake.maori.nz", 3/3 3/3 199 !! "https://www.dnc.org.nz", 1/1 total, 0 [no containsMRI outside isMRI pages] 200 !! "http://firstworldwar.tki.org.nz", 3/3, 0 [no containsMRI outside isMRI pages] 201 "http://www.28maoribattalion.org.nz", 3/3, 1/3 202 "http://www.tewikiotereomaori.co.nz", 1/1 total, 3/3 203 "http://www.brettgraham.co.nz", 1/1 total, 0/3 204 !! 
"https://hepatakakupu.nz", 3/3, 0 [no containsMRI outside isMRI pages] 205 206 "http://anglicanprayerbook.nz", 3/3 3/3 207 "http://arataua.nz", 4/4, 2/3 208 "http://blog.teara.govt.nz", 3/3, 0/3 [AS: teara.govt.nz] 209 "http://maori.tki.org.nz", 3/3 3/3 210 DONE (with/out www): "http://www.firstworldwar.tki.org.nz", 211 X "http://www.topomap.co.nz", 0/2 [all placenames], 0 [no containsMRI outside isMRI pages] 212 "https://paekupu.co.nz", 4/4, 0 [no containsMRI outside isMRI pages] 213 "https://haereheikaiako.co.nz", 1/1, 0 [no containsMRI outside isMRI pages] 214 "https://curriculumtool.education.govt.nz", 4/4, 3/3 215 "http://kurakokiri.maori.nz", 3/3, 3/3 [same nav menus on each page] 216 "http://kete.wcl.govt.nz", 2/5 [first 3 misdetected: Tokelauan (American Samoa), Kiribati, Tongan], 0/3 217 "http://www.kkmmaungarongo.co.nz", 3/3, 3/3 218 "http://www.heartland.co.nz", 3/3, 1/1 total 219 "http://oilcrash.com", 2/2 total, 0/3 220 "http://www.kura-porirua.school.nz", 4/4, 2/3 221 "http://videos.e-agent.nz", [AT: e-agent.nz] 3/3, 3/3 [repeated nav] 222 "https://www.sporty.co.nz", 3/3, 0 [no containsMRI outside isMRI pages] 223 "https://www.tematawai.maori.nz", 3/3, 3/3 224 225 "https://www.terakipaewhenua.school.nz", 226 "http://www.tetaurawhiri.govt.nz", 227 "http://archive.stats.govt.nz", 228 "http://tiritiowaitangi.govt.nz", 229 "http://www.waiata.maori.nz", 230 "http://hana.co.nz", 231 "http://kaupare.co.nz", 232 "http://www.tereowrap.nz", 233 "https://www.e-agent.nz", 234 "http://www.hrc.co.nz", 235 "http://ngatiporoukiponeke.org.nz", 236 "http://rurued.school.nz", 237 "http://www.twtop.school.nz", 238 "https://www.infinite-electronic.nz", 239 "http://www.huri-translations.pf", 240 "https://admin.teara.govt.nz", 241 "https://tiritiowaitangi.govt.nz", 242 "http://www.tmoa.tki.org.nz", 243 "https://www.komako.org.nz", 244 "http://www.wcl.govt.nz", 245 "https://office.e-agent.nz", 246 "http://punareo.co.nz", 247 "http://www.kurakokiri.maori.nz", 248 
"https://rapuatearatika.education.govt.nz", 249 "http://tmmkkm.school.nz", 250 "https://www.components-mart.nz", 251 "http://www.cs.waikato.ac.nz", 252 "http://www.kupengahao.co.nz", 253 "https://www.hapuhauora.health.nz", 254 "https://www.lcds-display.nz", 255 "http://waiata.maori.nz", 256 "http://cms.sunsmartschools.co.nz", 257 "http://www.livingheritage.org.nz", 258 "http://kuraproductions.co.nz", 259 "https://keepourmoneyclean.govt.nz", 260 "http://www.tekura.school.nz", 261 "http://www.tkkmmokopuna.school.nz", 262 "http://hangaraumatihiko.tki.org.nz", 263 "http://www.pakanae.maori.nz" 264 ], 265 "numPagesInMRICount" : 4360, 266 "numPagesContainingMRICount" : 7968 267 } 268 269 ---------------------------- 270 271 /* 1 */ 272 { 273 "_id" : "nz", 274 "count" : 176.0, 275 "domain" : [ 276 !! "http://pukekohe.directorybusiness.co.nz", 0/2, 0/2, isMRI = 0!! 277 "http://maori.livingheritage.org.nz", 2/2 2/2 278 "http://pukoro.co.nz", 2/2 0/2 279 "http://www.rakaumanga.school.nz", 0/4 0/4 280 "http://www.ngamanawainc.co.nz", 0/2 0/2 281 "https://office.e-agent.nz", 282 "https://www.components-mart.nz", 283 "http://tmmkkm.school.nz", 284 "http://www.rotoruanz.com", 285 "http://www.huri-translations.pf", 286 "https://admin.teara.govt.nz", 287 "http://hangaraumatihiko.tki.org.nz", 288 "https://sexualviolence.victimsinfo.govt.nz", 289 "http://www.tekura.school.nz", 290 "http://philipbeadle.co.nz", 291 "http://www.cs.waikato.ac.nz", 292 "https://www.hapuhauora.health.nz", 293 "http://cms.sunsmartschools.co.nz", 294 "https://keepourmoneyclean.govt.nz", 295 "http://www.kura-porirua.school.nz", 296 "http://waitarahistory.org.nz", 297 "http://oilcrash.com", 298 "http://videos.e-agent.nz", 299 "https://manawatuheritage.pncc.govt.nz", 300 "https://www.terakipaewhenua.school.nz", 301 "http://dev.nzpcn.org.nz", 302 "https://kotahimiriona.co.nz", 303 "http://kurakokiri.maori.nz", 304 "https://www.sporty.co.nz", 305 "http://kaupare.co.nz", 306 "http://ngatiporoukiponeke.org.nz", 
307 "https://www.takitimu.ac.nz", 308 "http://www.tetaurawhiri.govt.nz", 309 "http://www.waiata.maori.nz", 310 "http://conference.tpwt.maori.nz", 311 "http://ngatiwhakaue.iwi.nz", 312 "http://www.nzpcn.org.nz", 313 "http://www.ruralfind.co.nz", 314 "https://www.dnc.org.nz", 315 "https://www.puau.school.nz", 316 "https://kaiiwicamp.nz", 317 "https://www.terito.school.nz", 318 "https://www.pinterest.nz", 319 "https://e-ako-pangarau.nzmaths.co.nz", 320 "http://givealittle.co.nz", 321 "https://teaomaori.news", 322 "https://www.korokikahukura.co.nz", 323 "http://myfathersworld.net.nz", 324 "http://www.firstworldwar.tki.org.nz", 325 "https://www.ashtangatauranga.co.nz", 326 "http://biketorqueyamaha.co.nz", 327 "https://www.rereahu.maori.nz", 328 "http://www.tewikiotereomaori.co.nz", 329 "http://www.brettgraham.co.nz", 330 "http://tewikiotereomaori.nz", 331 "http://anglicanprayerbook.nz", 332 "http://arataua.nz", 333 "http://blog.teara.govt.nz", 334 "http://www.otepoti.school.nz", 335 "http://www.kmk.maori.nz", 336 "http://www.eventcinemas.co.nz", 337 "https://www.stats.govt.nz", 338 "http://www.oag.govt.nz", 2/2 0/2 339 "http://whatonga.school.nz", 340 "http://www.tewhanake.maori.nz", 341 "https://www.maoritelevision.com", 342 "http://kuraaiwi.maori.nz", 343 "http://kurataiao.tki.org.nz", 344 "http://teaohou.natlib.govt.nz", 345 "http://www.tetaumuturunanga.iwi.nz", 346 "http://www.tasteofplenty.co.nz", 347 "http://community.nzdl.org", 348 "https://www.blushandbrows.nz", 349 "https://register.tpota.org.nz", 350 "https://cdn.tehiku.nz", 351 "http://www.wcl.govt.nz", 352 "http://www.jeremybaker.nz", 353 "http://punareo.co.nz", 354 "https://rapuatearatika.education.govt.nz", 355 "http://www.kurakokiri.maori.nz", 356 "https://www.cruisetourstauranga.co.nz", 357 "https://sooty.nz", 358 "http://rakaumanga.school.nz", 359 "https://tiritiowaitangi.govt.nz", 360 "http://www.tmoa.tki.org.nz", 361 "http://www.w3vietnam.org.nz", 362 "https://www.infinite-electronic.nz", 363 
"https://www.komako.org.nz", 364 "http://nzpostcard.co.nz", 365 "http://artizani.co.nz", 366 "http://www.finlaysonpark.school.nz", 367 "http://crimson.co.nz", 368 "http://holyspirit.nz", 369 "http://www.tkkmmokopuna.school.nz", 370 "http://www.pakanae.maori.nz", 371 "http://www.teipukarea.maori.nz", 372 "http://archerpix.com", 373 "https://2019.nethui.nz", 374 "http://www.kupengahao.co.nz", 375 "https://www.lcds-display.nz", 376 "http://waiata.maori.nz", 377 "http://kuraproductions.co.nz", 378 "http://www.biketorqueyamaha.co.nz", 379 "http://www.livingheritage.org.nz", 380 "http://www.zoomin.co.nz", 381 "http://rsnz.natlib.govt.nz", 382 "http://otorohanga.directorybusiness.co.nz", 383 "http://reoora.co.nz", 384 "http://w3vietnam.org.nz", 385 "https://rehuamarae.co.nz", 386 "https://www.electionresults.org.nz", 387 "https://www.ngamanawainc.co.nz", 388 "https://www.rotorua-rafting.co.nz", 389 "https://www.taitokerautrust.org.nz", 390 "https://www.wingspan.co.nz", 391 "http://www.kkmmaungarongo.co.nz", 392 "http://kete.wcl.govt.nz", 393 "http://www.heartland.co.nz", 394 "http://www.electionresults.govt.nz", 395 "https://www.tematawai.maori.nz", 396 "http://hana.co.nz", 397 "http://www.tereowrap.nz", 398 "http://rurued.school.nz", 399 "http://www.twtop.school.nz", 400 "http://rexedra.gen.nz", 401 "http://archive.stats.govt.nz", 402 "https://liveresults.co.nz", 403 "https://www.e-agent.nz", 404 "http://tiritiowaitangi.govt.nz", 405 "http://www.hrc.co.nz", 406 "http://animations.tewhanake.maori.nz", 407 "https://interactives.stuff.co.nz", 408 "http://avonside.net", 409 "http://www.methodist.org.nz", 410 "https://www.tasteofplenty.co.nz", 411 "http://www.maoriinvestments.co.nz", 412 "https://m.wairarapatv.co.nz", 413 "http://www.gans.co.nz", 414 "https://ttw1.cwp.govt.nz", 415 "http://ngarauhuia.ngatiapakiterato.iwi.nz", 416 "https://www.tuiatematangi.ac.nz", 417 "http://tetaurawhiri.govt.nz", 418 "http://maori.tki.org.nz", 419 "http://www.topomap.co.nz", 420 
"https://www.puhaandpakeha.co.nz", 421 "https://haereheikaiako.co.nz", 422 "https://paekupu.co.nz", 423 "https://curriculumtool.education.govt.nz", 424 "http://firstworldwar.tki.org.nz", 425 "http://www.28maoribattalion.org.nz", 426 "https://hepatakakupu.nz", 427 "https://www.zenbu.co.nz", 428 "http://www.matarikifestival.org.nz", 429 "http://pukapuka.nz", 430 "http://ngatipahauwera.co.nz", 2/2 2/2 431 "http://southerntribes.co.nz", 432 "https://player.vimeo.com", 433 "http://tmoa.tki.org.nz", 434 "http://www.writersfestival.co.nz", 435 "http://talkingtothecan.com", 436 "https://www.whanau-tahi.school.nz", 437 "http://satellites.co.nz", 438 "http://auturoa.nz", 439 "http://www.tuwharetoa.iwi.nz", 440 "http://kmpmusic.co.nz", 441 "http://www.temarareo.org", 442 "http://archive.electionresults.govt.nz", 443 "http://kaiiwicamp.nz", 444 "http://tehauora.org.nz", 445 "http://temahurehure.maori.nz", 446 "http://www.runanga.co.nz" 447 ], 448 "numPagesInMRICount" : 4360, 449 "numPagesContainingMRICount" : 9641 450 } 451 452
Note:
See TracChangeset
for help on using the changeset viewer.