Ignore:
Timestamp:
2019-12-19T17:13:26+13:00 (4 years ago)
Author:
ak19
Message:

Put the important mongodb queries and results into hdfs-cc-work/GS_README.txt, except for the aggregate queries, where the results are in mongodb-data/counts*.json files committed to svn yesterday and from which geojson maps were produced.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/hdfs-cc-work/GS_README.TXT

    r33809 r33814  
    6936936. Launch the Robo 3T (version 1.3 is one we tested) MongoDB client. Use it to connect to MongoDB's "ateacrawldata" database.
    694694Now you can run queries.
     695
     696
     697Here are most of the important MongoDB queries I ran, and the shorter answers.
     698# Num websites
     699db.getCollection('Websites').find({}).count()
     7001445
     701
     702# Num webpages
     703db.getCollection('Webpages').find({}).count()
     704117496
     705
     706# Find number of websites that have 1 or more pages detected as being in Maori (a positive numPagesInMRI)
     707db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count()
     708361
     709
     710# Number of sites containing at least one sentence for which OpenNLP detected the best language = MRI
     711db.getCollection('Websites').find({numPagesContainingMRI: {$gt: 0}}).count()
     712868
     713
     714# Obviously, the union of the above two will be identical to the numPagesContainingMRI count above:
     715db.getCollection('Websites').find({ $or: [ { numPagesInMRI: { $gt: 0 } }, { numPagesContainingMRI: {$gt: 0} } ] } ).count()
     716868
     717
     718# Find number of webpages that are deemed to be overall in MRI (pages where isMRI=true)
     719db.getCollection('Webpages').find({isMRI:true}).count()
     7207818
     721
     722# Number of pages that contain any number of MRI sentences
     723db.getCollection('Webpages').find({containsMRI: true}).count()
     72420371
     725
     726
     727# Number of sites with URLs containing /mi(/)
     728db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
     729X 153
     730# Number of sites with URLs containing /mi(/) OR http(s)://mi.*
     731db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
     732670
     733
     734# Number of websites that are outside NZ that contain /mi(/) in any of its sub-urls
     735db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
     736X 147
     737# Number of websites that are outside NZ that contain /mi(/) OR http(s)://mi.* in any of its sub-urls
     738db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
     739656
     740
     741# 6 sites with URLs containing /mi(/) that are in NZ
     742db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count()
     743X 6
     744# 14 sites with URLs containing /mi(/) OR http(s)://mi.* that are in NZ
     74514
     746
     747PROJECTION QUERIES:
     748# For all the sites that do not originate in NZ, list their country codes (geoLocationCountryCode
     749# field) and the urlContainsLangCodeInPath field
     750
     751db.getCollection('Websites').find({geoLocationCountryCode: {$ne:"NZ"}}, {geoLocationCountryCode:1, urlContainsLangCodeInPath: 1})
     752
     753
     754AGGREGATION QUERIES - the results of important aggregate queries here
     755can be found in the associated mongodb-data/counts*.json files.
     756
     757# count of country codes for all sites
     758db.Websites.aggregate([
     759   
     760    { $unwind: "$geoLocationCountryCode" },
     761    {
     762        $group: {
     763            _id: "$geoLocationCountryCode",
     764            count: { $sum: 1 }
     765        }
     766    },
     767    { $sort : { count : -1} }
     768]);
     769
     770# count of country codes for sites that have at least one page detected as MRI
     771
     772db.Websites.aggregate([
     773    {
     774        $match: {
     775            numPagesInMRI: {$gt: 0}
     776        }
     777    },
     778    { $unwind: "$geoLocationCountryCode" },
     779    {
     780        $group: {
     781            _id: {$toLower: '$geoLocationCountryCode'},
     782            count: { $sum: 1 }
     783        }
     784    },
     785    { $sort : { count : -1} }
     786]);
     787
     788# count of country codes for sites that have at least one page containing at least one sentence detected as MRI
     789db.Websites.aggregate([
     790    {
     791        $match: {
     792            numPagesContainingMRI: {$gt: 0}
     793        }
     794    },
     795    { $unwind: "$geoLocationCountryCode" },
     796    {
     797        $group: {
     798            _id: {$toLower: '$geoLocationCountryCode'},
     799            count: { $sum: 1 }
     800        }
     801    },
     802    { $sort : { count : -1} }
     803]);
     804
     805
     806# ATTEMPT TO FILTER OUT LIKELY AUTO-TRANSLATED SITES
     807# Get a count of all sites containing MRI that are neither located in NZ nor have a .nz TLD,
     808# and that don't have /mi(/) or http(s)://mi.* in the URL path of any crawled web pages of the site
     809db.getCollection('Websites').find(
     810{$and: [
     811    {numPagesContainingMRI: {$gt: 0}},
     812    {geoLocationCountryCode: {$ne: "NZ"}},
     813    {domain: {$not: /.nz$/}},
     814    {urlContainsLangCodeInPath: {$ne: true}}
     815]}).count()
     816
     817220
     818
     819# Aggregate: count by country codes of non-NZ related sites that
     820# don't have the language code in the URL path on any crawled pages of the site
     821
     822db.Websites.aggregate([
     823    {
     824        $match: {
     825            $and: [
     826            {numPagesContainingMRI: {$gt: 0}},
     827            {geoLocationCountryCode: {$ne: "NZ"}},
     828            {domain: {$not: /.nz$/}},
     829            {urlContainsLangCodeInPath: {$ne: true}}
     830            ]
     831        }
     832    },
     833    { $unwind: "$geoLocationCountryCode" },
     834    {
     835        $group: {
     836            _id: {$toLower: '$geoLocationCountryCode'},
     837            count: { $sum: 1 },
     838            domain: { $addToSet: '$domain' }
     839        }
     840    },
     841    { $sort : { count : -1} }
     842]);
     843
     844The above query contains "domain: { $addToSet: '$domain' }"
     845which adds the list of matching domains for each country code
     846to the output of the aggregate result list.
     847This is useful as I'll be inspecting these manually to ensure they're not
     848auto-translated, further reducing the list if necessary.
     849
     850For each resulting domain, I can then inspect that website's pages in the Webpages
     851mongodb collection for whether those pages are relevant or auto-translated with a query
     852of the following form. This example works with the sample site URL https://www.lexilogos.com
     853
     854    db.getCollection('Webpages').find({URL:/lexilogos\.com/, mriSentenceCount: {$gt: 0}})
     855
     856
     857In inspecting Australian sites in the result list, I noticed that one that should not be
     858excluded from the output was https://www.kiwiproperty.com. The TLD is not .nz,
     859and the site originates in Australia, not NZ, but it's still a site of NZ content.
     860This will be an important consideration when constructing some aggregate queries further below.
     861
     862
     863# Count of websites that have at least 1 page containing at least one sentence detected as MRI
     864# AND which have mi in the URL path:
     865
     866db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{urlContainsLangCodeInPath: true}]}).count()
     867
     868491
     869
     870
     871# The websites that have some MRI detected AND which are either in NZ, or have a .nz TLD,
     872# or (if they're from overseas) don't contain /mi or mi.* in the URL path:
     873
     874db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}]}).count()
     875396
     876
     877Include Australia, to get the valid "kiwiproperty.com" website included in the result list:
     878
     879db.getCollection('Websites').find({$and: [
     880                {numPagesContainingMRI: {$gt: 0}},
     881                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
     882            ]}).count()
     883
     884397
     885
     886# aggregate results by a count of country codes
     887db.Websites.aggregate([
     888    {
     889        $match: {
     890            $and: [
     891                {numPagesContainingMRI: {$gt: 0}},
     892                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
     893            ]
     894        }
     895    },
     896    { $unwind: "$geoLocationCountryCode" },
     897    {
     898        $group: {
     899            _id: {$toLower: '$geoLocationCountryCode'},
     900            count: { $sum: 1 }
     901        }
     902    },
     903    { $sort : { count : -1} }
     904]);
     905
     906
     907# Just considering those sites that are outside NZ and do not have a .nz TLD:
     908
     909db.getCollection('Websites').find({$and: [
     910                {geoLocationCountryCode: {$ne: "NZ"}},
     911                {domain: {$not: /\.nz/}},
     912                {numPagesContainingMRI: {$gt: 0}},
     913                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
     914            ]}).count()
     915
     916221 websites
     917
     918# counts by country code excluding NZ related sites
     919db.Websites.aggregate([
     920    {
     921        $match: {
     922            $and: [
     923                {geoLocationCountryCode: {$ne: "NZ"}},
     924                {domain: {$not: /\.nz/}},
     925                {numPagesContainingMRI: {$gt: 0}},
     926                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     927            ]
     928        }
     929    },
     930    { $unwind: "$geoLocationCountryCode" },
     931    {
     932        $group: {
     933            _id: {$toLower: '$geoLocationCountryCode'},
     934            count: { $sum: 1 },
     935            domain: { $addToSet: '$domain' }
     936        }
     937    },
     938    { $sort : { count : -1} }
     939]);
     940
     941
     942# But to produce the tentative non-product sites, we also want the aggregate for all NZ sites (from NZ or with .nz tld):
     943db.getCollection('Websites').find({$and: [
     944                {numPagesContainingMRI: {$gt: 0}},
     945                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     946            ]}).count()
     947
     948176
     949
     950(Total is 221+176 = 397, which adds up).
     951
     952# Get the count (and domain listing) output grouped under a hardcoded _id of "nz":
     953db.Websites.aggregate([
     954    {
     955        $match: {
     956            $and: [
     957                {numPagesContainingMRI: {$gt: 0}},
     958                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     959            ]
     960        }
     961    },
     962    { $unwind: "$geoLocationCountryCode" },
     963    {
     964        $group: {
     965            _id: "nz",
     966            count: { $sum: 1 },
     967            domain: { $addToSet: '$domain' }
     968        }
     969    },
     970    { $sort : { count : -1} }
     971]);
    695972
    696973--------------------------------------------------------
Note: See TracChangeset for help on using the changeset viewer.