Changeset 33814

Show
Ignore:
Timestamp:
19.12.2019 17:13:26 (5 weeks ago)
Author:
ak19
Message:

Put the important mongodb queries and results into hdfs-cc-work/GS_README.txt, except for the aggregate queries, where the results are in mongodb-data/counts*.json files committed to svn yesterday and from which geojson maps were produced.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/hdfs-cc-work/GS_README.TXT

    r33809 r33814  
    6936936. Launch the Robo 3T (version 1.3 is one we tested) MongoDB client. Use it to connect to MongoDB's "ateacrawldata" database. 
    694694Now you can run queries. 
     695 
     696 
     697Here are most of the important MongoDB queries I ran, and the shorter answers. 
     698# Num websites 
     699db.getCollection('Websites').find({}).count() 
     7001445  
     701 
     702# Num webpages 
     703db.getCollection('Webpages').find({}).count() 
     704117496 
     705 
     706# Find number of websites that have 1 or more pages detected as being in Maori (a positive numPagesInMRI) 
     707db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count() 
     708361 
     709 
     710# Number of sites containing at least one sentence for which OpenNLP detected the best language = MRI 
     711db.getCollection('Websites').find({numPagesContainingMRI: {$gt: 0}}).count() 
     712868 
     713 
     714# Obviously, the union of the above two queries gives the same count as the numPagesContainingMRI query alone: 
     715db.getCollection('Websites').find({ $or: [ { numPagesInMRI: { $gt: 0 } }, { numPagesContainingMRI: {$gt: 0} } ] } ).count() 
     716868 
     717 
     718# Find number of webpages that are deemed to be overall in MRI (pages where isMRI=true) 
     719db.getCollection('Webpages').find({isMRI:true}).count() 
     7207818 
     721 
     722# Number of pages that contain any number of MRI sentences 
     723db.getCollection('Webpages').find({containsMRI: true}).count() 
     72420371 
     725 
     726 
     727# Number of sites with URLs containing /mi(/) 
     728db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count() 
     729X 153 
     730# Number of sites with URLs containing /mi(/) OR http(s)://mi.* 
     731db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count() 
     732670 
     733 
     734# Number of websites outside NZ that contain /mi(/) in any of their sub-URLs 
     735db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count() 
     736X 147 
     737# Number of websites outside NZ that contain /mi(/) OR http(s)://mi.* in any of their sub-URLs 
     738db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count() 
     739656 
     740 
     741# 6 sites with URLs containing /mi(/) that are in NZ 
     742db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count() 
     743X 6 
     744# 14 sites with URLs containing /mi(/) OR http(s)://mi.* that are in NZ 
     74514 
     746 
     747PROJECTION QUERIES: 
     748# For all the sites that do not originate in NZ, list their country codes (geoLocationCountryCode 
     749# field) and the urlContainsLangCodeInPath field 
     750 
     751db.getCollection('Websites').find({geoLocationCountryCode: {$ne:"nz"}}, {geoLocationCountryCode:1, urlContainsLangCodeInPath: 1}) 
     752 
     753 
     754AGGREGATION QUERIES - the results of important aggregate queries here 
     755can be found in the associated mongodb-data/counts*.json files. 
     756 
     757# count of country codes for all sites 
     758db.Websites.aggregate([ 
     759    
     760    { $unwind: "$geoLocationCountryCode" }, 
     761    { 
     762        $group: { 
     763            _id: "$geoLocationCountryCode", 
     764            count: { $sum: 1 } 
     765        } 
     766    }, 
     767    { $sort : { count : -1} } 
     768]); 
     769 
     770# count of country codes for sites that have at least one page detected as MRI  
     771 
     772db.Websites.aggregate([ 
     773    { 
     774        $match: { 
     775            numPagesInMRI: {$gt: 0} 
     776        } 
     777    }, 
     778    { $unwind: "$geoLocationCountryCode" }, 
     779    { 
     780        $group: { 
     781            _id: {$toLower: '$geoLocationCountryCode'}, 
     782            count: { $sum: 1 } 
     783        } 
     784    }, 
     785    { $sort : { count : -1} } 
     786]); 
     787 
     788# count of country codes for sites that have at least one page containing at least one sentence detected as MRI  
     789db.Websites.aggregate([ 
     790    { 
     791        $match: { 
     792            numPagesContainingMRI: {$gt: 0} 
     793        } 
     794    }, 
     795    { $unwind: "$geoLocationCountryCode" }, 
     796    { 
     797        $group: { 
     798            _id: {$toLower: '$geoLocationCountryCode'}, 
     799            count: { $sum: 1 } 
     800        } 
     801    }, 
     802    { $sort : { count : -1} } 
     803]); 
     804 
     805 
     806# ATTEMPT TO FILTER OUT LIKELY AUTO-TRANSLATED SITES 
     807# Get a count of all sites containing MRI that are neither geolocated in NZ nor under the .nz TLD, 
     808# and that don't have /mi(/) or http(s)://mi.* in the URL path of any crawled web pages of the site 
     809db.getCollection('Websites').find( 
     810{$and: [ 
     811    {numPagesContainingMRI: {$gt: 0}}, 
     812    {geoLocationCountryCode: {$ne: "NZ"}}, 
     813    {domain: {$not: /.nz$/}}, 
     814    {urlContainsLangCodeInPath: {$ne: true}} 
     815]}).count() 
     816 
     817220 
     818 
     819# Aggregate: count by country codes of non-NZ related sites that 
     820# don't have the language code in the URL path on any crawled pages of the site 
     821 
     822db.Websites.aggregate([ 
     823    { 
     824        $match: { 
     825            $and: [ 
     826            {numPagesContainingMRI: {$gt: 0}}, 
     827            {geoLocationCountryCode: {$ne: "NZ"}}, 
     828            {domain: {$not: /.nz$/}}, 
     829            {urlContainsLangCodeInPath: {$ne: true}} 
     830            ] 
     831        } 
     832    }, 
     833    { $unwind: "$geoLocationCountryCode" }, 
     834    { 
     835        $group: { 
     836            _id: {$toLower: '$geoLocationCountryCode'}, 
     837            count: { $sum: 1 }, 
     838            domain: { $addToSet: '$domain' } 
     839        } 
     840    }, 
     841    { $sort : { count : -1} } 
     842]); 
     843 
     844The above query contains "domain: { $addToSet: '$domain' }" 
     845which adds the list of matching domains for each country code 
     846to the output of the aggregate result list. 
     847This is useful as I'll be inspecting these manually to ensure they're not 
     848auto-translated to further reduce the list if necessary. 
     849 
     850For each resulting domain, I can then inspect that website's pages in the Webpages 
     851mongodb collection for whether those pages are relevant or auto-translated with a query 
     852of the following form. This example works with the sample site URL https://www.lexilogos.com 
     853 
     854    db.getCollection('Webpages').find({URL:/lexilogos\.com/, mriSentenceCount: {$gt: 0}}) 
     855 
     856 
     857In inspecting Australian sites in the result list, I noticed that one that should not be 
     858excluded from the output was https://www.kiwiproperty.com. The TLD is not .nz, 
     859and the site originates in Australia, not NZ, but it's still a site of NZ content. 
     860This will be an important consideration when constructing some aggregate queries further below. 
     861 
     862 
     863# Count of websites that have at least 1 page containing at least one sentence detected as MRI 
     864# AND that have mi in the URL path: 
     865 
     866db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{urlContainsLangCodeInPath: true}]}).count() 
     867 
     868491 
     869 
     870 
     871# The websites that have some MRI detected AND which are either in NZ, or have a .nz TLD, 
     872# or (if they're from overseas) don't contain /mi or mi.* in the URL path: 
     873 
     874db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}]}).count() 
     875396 
     876 
     877Include Australia, to get the valid "kiwiproperty.com" website included in the result list: 
     878 
     879db.getCollection('Websites').find({$and: [ 
     880                {numPagesContainingMRI: {$gt: 0}}, 
     881                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]} 
     882            ]}).count() 
     883 
     884397 
     885 
     886# aggregate results by a count of country codes 
     887db.Websites.aggregate([ 
     888    { 
     889        $match: { 
     890            $and: [ 
     891                {numPagesContainingMRI: {$gt: 0}}, 
     892                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]} 
     893            ] 
     894        } 
     895    }, 
     896    { $unwind: "$geoLocationCountryCode" }, 
     897    { 
     898        $group: { 
     899            _id: {$toLower: '$geoLocationCountryCode'}, 
     900            count: { $sum: 1 } 
     901        } 
     902    }, 
     903    { $sort : { count : -1} } 
     904]); 
     905 
     906 
     907# Just considering those sites that are outside NZ and do not have a .nz TLD: 
     908 
     909db.getCollection('Websites').find({$and: [ 
     910                {geoLocationCountryCode: {$ne: "NZ"}}, 
     911                {domain: {$not: /\.nz/}}, 
     912                {numPagesContainingMRI: {$gt: 0}}, 
     913                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 
     914            ]}).count() 
     915 
     916221 websites 
     917 
     918# counts by country code excluding NZ related sites 
     919db.Websites.aggregate([ 
     920    { 
     921        $match: { 
     922            $and: [ 
     923                {geoLocationCountryCode: {$ne: "NZ"}}, 
     924                {domain: {$not: /\.nz/}}, 
     925                {numPagesContainingMRI: {$gt: 0}}, 
     926                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}             
     927            ] 
     928        } 
     929    }, 
     930    { $unwind: "$geoLocationCountryCode" }, 
     931    { 
     932        $group: { 
     933            _id: {$toLower: '$geoLocationCountryCode'}, 
     934            count: { $sum: 1 }, 
     935            domain: { $addToSet: '$domain' } 
     936        } 
     937    }, 
     938    { $sort : { count : -1} } 
     939]); 
     940 
     941 
     942# But to produce the tentative non-product sites, we also want the aggregate for all NZ sites (from NZ or with .nz tld): 
     943db.getCollection('Websites').find({$and: [ 
     944                {numPagesContainingMRI: {$gt: 0}}, 
     945                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     946            ]}).count() 
     947 
     948176 
     949 
     950(Total is 221+176 = 397, which adds up). 
     951 
     952# Get the count (and domain listing) output under a hardcoded _id of "nz": 
     953db.Websites.aggregate([ 
     954    { 
     955        $match: { 
     956            $and: [ 
     957                {numPagesContainingMRI: {$gt: 0}}, 
     958                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 
     959            ] 
     960        } 
     961    }, 
     962    { $unwind: "$geoLocationCountryCode" }, 
     963    { 
     964        $group: { 
     965            _id: "nz", 
     966            count: { $sum: 1 }, 
     967            domain: { $addToSet: '$domain' } 
     968        } 
     969    }, 
     970    { $sort : { count : -1} } 
     971]); 
    695972 
    696973--------------------------------------------------------