Changeset 33847

Show
Ignore:
Timestamp:
17.01.2020 19:32:16 (5 weeks ago)
Author:
ak19
Message:

indigenousblogs.com does in fact have one page that is actually in Maori (an XML feed), so this adds 1 to the table of counts for US sites with "mi" in the URL path that contained actual Maori (MRI) text.

Location:
other-projects/maori-lang-detection
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33843 r33847  
    14881488TIDIED: 
    14891489NZ: 176 
    1490 US: 25+3 from US with mi in URL path = 28 
     1490US: 25+4 from US with mi in URL path = 29 
    14911491AU: 3 
    14921492DE: 2 
     
    14971497FR: 1 
    14981498IE: 1 
    1499 TOTAL: 213+3 from US with mi in URL path = 216 
     1499TOTAL: 213+4 from US with mi in URL path = 217 
    15001500 
    15011501 
     
    15251525Of interest or possible interest: 
    15261526US:  
    1527 !! http://indigenousblogs.com [15/18 blogs work] 
     1527!! http://indigenousblogs.com [15/18 blogs work] - has one page in Maori (http://indigenousblogs.com/feeds/mi.xml) 
    15281528X https://biblia.gospelprime.com.br - misdetection (containsMRI) 
    15291529X ?https://follow3rs.com - seems dodgy and possibly auto-translated. Can't spell account, misspelled as accout 
     
    15591559db.getCollection('Webpages').find({$and: [{isMRI: true}, {URL: /indigenousblogs\.com/}]}) 
    15601560=> http://indigenousblogs.com/mi/ 
     1561 
     1562-------------------------- 
     1563 
     1564 
     1565db.Websites.aggregate([ 
     1566    { 
     1567        $match: { 
     1568            $and: [ 
     1569                {geoLocationCountryCode: {$ne: "NZ"}}, 
     1570                {domain: {$not: /\.nz/}}, 
     1571                {numPagesContainingMRI: {$gt: 0}}, 
     1572                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}             
     1573            ] 
     1574        } 
     1575    }, 
     1576    { $unwind: "$geoLocationCountryCode" }, 
     1577    { 
     1578        $group: { 
     1579            _id: {$toLower: '$geoLocationCountryCode'}, 
     1580            count: { $sum: 1 }, 
     1581            domain: { $addToSet: '$domain' }, 
     1582            numPagesInMRI: { $addToSet: '$numPagesInMRI' }, 
     1583            numPagesContainingMRI: { $addToSet: '$numPagesContainingMRI' }, 
     1584            numPagesInMRICount: { $sum: '$numPagesInMRI' }, 
     1585            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 
     1586        } 
     1587    }, 
     1588    { $sort : { count : -1} } 
     1589]); 
     1590 
     1591 
     1592To convert json to csv 
     1593In gedit replace 
     1594\/\*\s*\d+\s*\*\/ => , 
  • other-projects/maori-lang-detection/mongodb-data/6counts_nonProductSites1_manualShortlist.json

    r33844 r33847  
    1212{ 
    1313    "_id" : "us", 
    14     "count" : 28.0 
     14    "count" : 29.0 
    1515} 
    1616{ 
  • other-projects/maori-lang-detection/mongodb-data/6geojson-features_nonProductSites1_manualShortlist.json

    r33844 r33847  
    1 {"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[170.885971,-40.900557],[170.885971,47.099443],[178.885971,47.099443],[178.885971,-40.900557],[170.885971,-40.900557]]]},"properties":{"code":"NZ","count":176,"region":"New Zealand"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-97.712891,37.09024],[-97.712891,65.09024],[-93.712891,65.09024],[-93.712891,37.09024],[-97.712891,37.09024]]]},"properties":{"code":"US","count":28,"region":"United States"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[131.775136,-25.274398],[131.775136,-22.274398],[135.775136,-22.274398],[135.775136,-25.274398],[131.775136,-25.274398]]]},"properties":{"code":"AU","count":3,"region":"Australia"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[8.451526,51.165691],[8.451526,53.165691],[12.451526,53.165691],[12.451526,51.165691],[8.451526,51.165691]]]},"properties":{"code":"DE","count":2,"region":"Germany"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[7.501785,56.26392],[7.501785,58.26392],[11.501785,58.26392],[11.501785,56.26392],[7.501785,56.26392]]]},"properties":{"code":"DK","count":2,"region":"Denmark"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[23.48583,42.733883],[23.48583,43.733883],[27.48583,43.733883],[27.48583,42.733883],[23.48583,42.733883]]]},"properties":{"code":"BG","count":1,"region":"Bulgaria"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[13.472962,49.817492],[13.472962,50.817492],[17.472962000000003,50.817492],[17.472962000000003,49.817492],[13.472962,49.817492]]]},"properties":{"code":"CZ","count":1,"region":"Czech 
Republic"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-5.74922,40.463667],[-5.74922,41.463667],[-1.7492200000000002,41.463667],[-1.7492200000000002,40.463667],[-5.74922,40.463667]]]},"properties":{"code":"ES","count":1,"region":"Spain"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[0.21374899999999997,46.227638],[0.21374899999999997,47.227638],[4.213749,47.227638],[4.213749,46.227638],[0.21374899999999997,46.227638]]]},"properties":{"code":"FR","count":1,"region":"France"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-10.24389,53.41291],[-10.24389,54.41291],[-6.24389,54.41291],[-6.24389,53.41291],[-10.24389,53.41291]]]},"properties":{"code":"IE","count":1,"region":"Ireland"}}]} 
     1{"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[170.885971,-40.900557],[170.885971,47.099443],[178.885971,47.099443],[178.885971,-40.900557],[170.885971,-40.900557]]]},"properties":{"code":"NZ","count":176,"region":"New Zealand"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-97.712891,37.09024],[-97.712891,66.09024],[-93.712891,66.09024],[-93.712891,37.09024],[-97.712891,37.09024]]]},"properties":{"code":"US","count":29,"region":"United States"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[131.775136,-25.274398],[131.775136,-22.274398],[135.775136,-22.274398],[135.775136,-25.274398],[131.775136,-25.274398]]]},"properties":{"code":"AU","count":3,"region":"Australia"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[8.451526,51.165691],[8.451526,53.165691],[12.451526,53.165691],[12.451526,51.165691],[8.451526,51.165691]]]},"properties":{"code":"DE","count":2,"region":"Germany"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[7.501785,56.26392],[7.501785,58.26392],[11.501785,58.26392],[11.501785,56.26392],[7.501785,56.26392]]]},"properties":{"code":"DK","count":2,"region":"Denmark"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[23.48583,42.733883],[23.48583,43.733883],[27.48583,43.733883],[27.48583,42.733883],[23.48583,42.733883]]]},"properties":{"code":"BG","count":1,"region":"Bulgaria"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[13.472962,49.817492],[13.472962,50.817492],[17.472962000000003,50.817492],[17.472962000000003,49.817492],[13.472962,49.817492]]]},"properties":{"code":"CZ","count":1,"region":"Czech 
Republic"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-5.74922,40.463667],[-5.74922,41.463667],[-1.7492200000000002,41.463667],[-1.7492200000000002,40.463667],[-5.74922,40.463667]]]},"properties":{"code":"ES","count":1,"region":"Spain"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[0.21374899999999997,46.227638],[0.21374899999999997,47.227638],[4.213749,47.227638],[4.213749,46.227638],[0.21374899999999997,46.227638]]]},"properties":{"code":"FR","count":1,"region":"France"}},{"type":"Feature","geometry":{"type":"Polygon","coordinates":[[[-10.24389,53.41291],[-10.24389,54.41291],[-6.24389,54.41291],[-6.24389,53.41291],[-10.24389,53.41291]]]},"properties":{"code":"IE","count":1,"region":"Ireland"}}]}