Changeset 33804
- Timestamp: 2019-12-13T20:00:53+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/MoreReading/mongodb.txt
r33800 r33804
454    454    # Num websites
455    455    db.getCollection('Websites').find({}).count()
456           1446
       456    1445
457    457
458    458    # Num webpages
…      …
464    464    db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count()
465    465    361
       466
       467    # Number of sites containing at least one sentence for which OpenNLP detected the best language = MRI
       468    db.getCollection('Websites').find({numPagesContainingMRI: {$gt: 0}}).count()
       469    868
       470
       471    # Obviously, the union of the above two will be identical to numPagesContainingMRI:
       472    db.getCollection('Websites').find({ $or: [ { numPagesInMRI: { $gt: 0 } }, { numPagesContainingMRI: {$gt: 0} } ] } ).count()
       473    868
466    474
467    475    # Find number of webpages that are deemed to be overall in MRI (pages where isMRI=true)
…      …
484    492    # Number of websites that are outside NZ that contain /mi(/) in any of its sub-urls
485    493    db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
486           148
       494    147
487    495
488    496    # 5 sites with URLs containing /mi(/) that are in NZ
489    497    db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count()
490           5
       498    6
       499
491    500
492    501    # sort websites that contain /mi(/) in path by geoLocationCountryCode
…      …
557    566    $group: {
558    567    _id: "$geoLocationCountryCode",
       568    count: { $sum: 1 }
       569    }
       570    },
       571    { $sort : { count : -1} }
       572    ]);
       573
       574    // count of country codes for sites that have at least one page detected as MRI
       575
       576    db.Websites.aggregate([
       577    {
       578    $match: {
       579    numPagesInMRI: {$gt: 0}
       580    }
       581    },
       582    { $unwind: "$geoLocationCountryCode" },
       583    {
       584    $group: {
       585    _id: {$toLower: '$geoLocationCountryCode'},
       586    count: { $sum: 1 }
       587    }
       588    },
       589    { $sort : { count : -1} }
       590    ]);
       591
       592    // count of country codes for sites that have at least one page containing at least one sentence detected as MRI
       593    db.Websites.aggregate([
       594    {
       595    $match: {
       596    numPagesContainingMRI: {$gt: 0}
       597    }
       598    },
       599    { $unwind: "$geoLocationCountryCode" },
       600    {
       601    $group: {
       602    _id: {$toLower: '$geoLocationCountryCode'},
559    603    count: { $sum: 1 }
560    604    }
Note: See TracChangeset for help on using the changeset viewer.