Changeset 33847


Ignore:
Timestamp:
2020-01-17T19:32:16+13:00 (4 years ago)
Author:
ak19
Message:

indigenousblogs.com did actually have one page in Maori (an XML feed), so this adds 1 to the table of counts for US sites with mi in the URL path that contained actual MRI.

Location:
other-projects/maori-lang-detection
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33843 r33847  
    14881488TIDIED:
    14891489NZ: 176
    1490 US: 25+3 from US with mi in URL path = 28
     1490US: 25+4 from US with mi in URL path = 29
    14911491AU: 3
    14921492DE: 2
     
    14971497FR: 1
    14981498IE: 1
    1499 TOTAL: 213+3 from US with mi in URL path = 216
     1499TOTAL: 213+4 from US with mi in URL path = 217
    15001500
    15011501
     
    15251525Of interest or possible interest:
    15261526US:
    1527 !! http://indigenousblogs.com [15/18 blogs work]
     1527!! http://indigenousblogs.com [15/18 blogs work] - has one page in Maori (http://indigenousblogs.com/feeds/mi.xml)
    15281528X https://biblia.gospelprime.com.br - misdetection (containsMRI)
    15291529X ?https://follow3rs.com - seems dodgy and possibly auto-translated. Can't spell account, misspelled as accout
     
    15591559db.getCollection('Webpages').find({$and: [{isMRI: true}, {URL: /indigenousblogs\.com/}]})
    15601560=> http://indigenousblogs.com/mi/
     1561
     1562--------------------------
     1563
     1564
     1565db.Websites.aggregate([
     1566    {
     1567        $match: {
     1568            $and: [
     1569                {geoLocationCountryCode: {$ne: "NZ"}},
     1570                {domain: {$not: /\.nz/}},
     1571                {numPagesContainingMRI: {$gt: 0}},
     1572                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     1573            ]
     1574        }
     1575    },
     1576    { $unwind: "$geoLocationCountryCode" },
     1577    {
     1578        $group: {
     1579            _id: {$toLower: '$geoLocationCountryCode'},
     1580            count: { $sum: 1 },
     1581            domain: { $addToSet: '$domain' },
     1582            numPagesInMRI: { $addToSet: '$numPagesInMRI' },
     1583            numPagesContainingMRI: { $addToSet: '$numPagesContainingMRI' },
     1584            numPagesInMRICount: { $sum: '$numPagesInMRI' },
     1585            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
     1586        }
     1587    },
     1588    { $sort : { count : -1} }
     1589]);
     1590
     1591
     1592To convert json to csv
     1593In gedit replace
     1594\/\*\s*\d+\s*\*\/ => ,
  • other-projects/maori-lang-detection/mongodb-data/6counts_nonProductSites1_manualShortlist.json

    r33844 r33847  
    1212{
    1313    "_id" : "us",
    14     "count" : 28.0
     14    "count" : 29.0
    1515}
    1616{
  • other-projects/maori-lang-detection/mongodb-data/6geojson-features_nonProductSites1_manualShortlist.json

    r33844 r33847  
    1 {"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[170.885971,-40.900557],[170.885971,47.099443],[178.885971,47.099443],[178.885971,-40.900557],[170.885971,-40.900557]]]},"properties":{"code":"NZ","count":176,"region":"New Zealand"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-97.712891,37.09024],[-97.712891,65.09024],[-93.712891,65.09024],[-93.712891,37.09024],[-97.712891,37.09024]]]},"properties":{"code":"US","count":28,"region":"United States"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[131.775136,-25.274398],[131.775136,-22.274398],[135.775136,-22.274398],[135.775136,-25.274398],[131.775136,-25.274398]]]},"properties":{"code":"AU","count":3,"region":"Australia"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[8.451526,51.165691],[8.451526,53.165691],[12.451526,53.165691],[12.451526,51.165691],[8.451526,51.165691]]]},"properties":{"code":"DE","count":2,"region":"Germany"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[7.501785,56.26392],[7.501785,58.26392],[11.501785,58.26392],[11.501785,56.26392],[7.501785,56.26392]]]},"properties":{"code":"DK","count":2,"region":"Denmark"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[23.48583,42.733883],[23.48583,43.733883],[27.48583,43.733883],[27.48583,42.733883],[23.48583,42.733883]]]},"properties":{"code":"BG","count":1,"region":"Bulgaria"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[13.472962,49.817492],[13.472962,50.817492],[17.472962000000003,50.817492],[17.472962000000003,49.817492],[13.472962,49.817492]]]},"properties":{"code":"CZ","count":1,"region":"Czech 
Republic"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-5.74922,40.463667],[-5.74922,41.463667],[-1.7492200000000002,41.463667],[-1.7492200000000002,40.463667],[-5.74922,40.463667]]]},"properties":{"code":"ES","count":1,"region":"Spain"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[0.21374899999999997,46.227638],[0.21374899999999997,47.227638],[4.213749,47.227638],[4.213749,46.227638],[0.21374899999999997,46.227638]]]},"properties":{"code":"FR","count":1,"region":"France"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-10.24389,53.41291],[-10.24389,54.41291],[-6.24389,54.41291],[-6.24389,53.41291],[-10.24389,53.41291]]]},"properties":{"code":"IE","count":1,"region":"Ireland"}}]}
     1{"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[170.885971,-40.900557],[170.885971,47.099443],[178.885971,47.099443],[178.885971,-40.900557],[170.885971,-40.900557]]]},"properties":{"code":"NZ","count":176,"region":"New Zealand"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-97.712891,37.09024],[-97.712891,66.09024],[-93.712891,66.09024],[-93.712891,37.09024],[-97.712891,37.09024]]]},"properties":{"code":"US","count":29,"region":"United States"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[131.775136,-25.274398],[131.775136,-22.274398],[135.775136,-22.274398],[135.775136,-25.274398],[131.775136,-25.274398]]]},"properties":{"code":"AU","count":3,"region":"Australia"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[8.451526,51.165691],[8.451526,53.165691],[12.451526,53.165691],[12.451526,51.165691],[8.451526,51.165691]]]},"properties":{"code":"DE","count":2,"region":"Germany"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[7.501785,56.26392],[7.501785,58.26392],[11.501785,58.26392],[11.501785,56.26392],[7.501785,56.26392]]]},"properties":{"code":"DK","count":2,"region":"Denmark"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[23.48583,42.733883],[23.48583,43.733883],[27.48583,43.733883],[27.48583,42.733883],[23.48583,42.733883]]]},"properties":{"code":"BG","count":1,"region":"Bulgaria"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[13.472962,49.817492],[13.472962,50.817492],[17.472962000000003,50.817492],[17.472962000000003,49.817492],[13.472962,49.817492]]]},"properties":{"code":"CZ","count":1,"region":"Czech 
Republic"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-5.74922,40.463667],[-5.74922,41.463667],[-1.7492200000000002,41.463667],[-1.7492200000000002,40.463667],[-5.74922,40.463667]]]},"properties":{"code":"ES","count":1,"region":"Spain"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[0.21374899999999997,46.227638],[0.21374899999999997,47.227638],[4.213749,47.227638],[4.213749,46.227638],[0.21374899999999997,46.227638]]]},"properties":{"code":"FR","count":1,"region":"France"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-10.24389,53.41291],[-10.24389,54.41291],[-6.24389,54.41291],[-6.24389,53.41291],[-10.24389,53.41291]]]},"properties":{"code":"IE","count":1,"region":"Ireland"}}]}
Note: See TracChangeset for help on using the changeset viewer.