Ignore:
Timestamp:
2019-12-18T21:38:44+13:00 (4 years ago)
Author:
ak19
Message:

With the bugfix from yesterday and the inclusion of http(s)://mi.* type URLs in setting the Websites mongodb collection's urlContainsLangCodeInPath property, and with updated/improved mongodb queries and their results, I have now regenerated the latest geojson json data and maps.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33807 r33813  
    461461117496
    462462
    463 # Find number of websites who have 1 or more pages in Maori (a positive numPagesInMRI)
     463# Find number of websites that have 1 or more pages detected as being in Maori (a positive numPagesInMRI)
    464464db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count()
    465465361
     
    488488# Number of sites with URLs containing /mi(/)
    489489db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
    490 153
     490X 153
     491# Number of sites with URLs containing /mi(/) OR http(s)://mi.*
     492db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
     493670
    491494
    492495# Number of websites that are outside NZ that contain /mi(/) in any of its sub-urls
    493496db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
    494 147
    495 
    496 # 5 sites with URLs containing /mi(/) that are in NZ
     497X 147
     498# Number of websites that are outside NZ that contain /mi(/) OR http(s)://mi.* in any of its sub-urls
     499db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
     500656
     501
     502# 6 sites with URLs containing /mi(/) that are in NZ
    497503db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count()
    498 6
     504X 6
     505# 14 sites with URLs containing /mi(/) OR http(s)://mi.* that are in NZ
     50614
    499507
    500508
     
    609617
    610618WORKS:
    611 // count of country codes for sites that have /mi(/) in path
     619// count of country codes for sites that have /mi(/) or http(s)://mi.* in URL path
    612620
    613621db.Websites.aggregate([
     
    743751# These are the TENTATIVE NON-PRODUCT SITES
    744752# Should be less than the point 4, but more than 1 to 3
     753
    745754db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{urlContainsLangCodeInPath: false}, {$and: [{urlContainsLangCodeInPath: true}, {geoLocationCountryCode: "NZ"}]}]}]}).count()
    746 859
     755X 859
     756
     757Now with http(s)://mi.* also excluded, the above query returns a count of:
     758389
     759
     760
     761BUT THIS IS THE CORRECT VERSION OF THE QUERY:
     762db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {urlContainsLangCodeInPath: false}]}]}).count()
     763389
     764
    747765
    748766# 6. Now do the counts by country code of the above, by pasting the query of point 5 as the $match clause (i.e. without the .count() suffix)
     
    913931    {
    914932        $match: {
    915             $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPath: {$ne: true}}, ]
     933            $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPath: {$ne: true}}]
    916934        }
    917935    },
     
    926944    { $sort : { count : -1} }
    927945]);
     946
     947
     948We can knock off another 54 non-NZ sites with our new urlContainsLangCodeInPathPrefix field:
     949
     950   db.getCollection('Websites').find({urlContainsLangCodeInPathPrefix: true, geoLocationCountryCode: {$ne: "NZ"}, domain: {$not: /.nz$/}}).count()
     951   54
     952
     953
     954SO, can repeat query with new field "urlContainsLangCodeInPathPrefix":
     955Number of sites containing >= 1 MRI sentences that are not from NZ or of .nz TLD and which don't contain "/mi(/)" or "http(s)://mi." in URL path:
     956   db.getCollection('Websites').find({$and: [
     957                     {numPagesContainingMRI: {$gt: 0}},
     958                     {geoLocationCountryCode: {$ne: "NZ"}},
     959                     {domain: {$not: /.nz$/}},
     960                     {urlContainsLangCodeInPathSuffix: {$ne: true}},
     961                     {urlContainsLangCodeInPathPrefix: {$ne: true}}
     962                ]}).count()
     963
     964   651
     965
     966
     967REDO THE COUNT BY COUNTRY QUERY FOR THIS:
     968
     969db.Websites.aggregate([
     970    {
     971        $match: {
     972            $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPathSuffix: {$ne: true}}, {urlContainsLangCodeInPathPrefix: {$ne: true}}]
     973        }
     974    },
     975    { $unwind: "$geoLocationCountryCode" },
     976    {
     977        $group: {
     978            _id: {$toLower: '$geoLocationCountryCode'},
     979            count: { $sum: 1 },
     980            domain: { $addToSet: '$domain' }
     981        }
     982    },
     983    { $sort : { count : -1} }
     984]);
     985
     986
     987AFTER BUGFIX FOR miInURLPath being set correctly now:
     988db.getCollection('Websites').find(
     989{$and: [
     990    {numPagesContainingMRI: {$gt: 0}},
     991    {geoLocationCountryCode: {$ne: "NZ"}},
     992    {domain: {$not: /.nz$/}},
     993    {urlContainsLangCodeInPath: {$ne: true}}
     994]}).count()
     995
     996220
     997
     998db.Websites.aggregate([
     999    {
     1000        $match: {
     1001            $and: [
     1002            {numPagesContainingMRI: {$gt: 0}},
     1003            {geoLocationCountryCode: {$ne: "NZ"}},
     1004            {domain: {$not: /.nz$/}},
     1005            {urlContainsLangCodeInPath: {$ne: true}}
     1006            ]
     1007        }
     1008    },
     1009    { $unwind: "$geoLocationCountryCode" },
     1010    {
     1011        $group: {
     1012            _id: {$toLower: '$geoLocationCountryCode'},
     1013            count: { $sum: 1 },
     1014            domain: { $addToSet: '$domain' }
     1015        }
     1016    },
     1017    { $sort : { count : -1} }
     1018]);
     1019
     1020Can inspect websites' pages for whether it's relevant/auto-translated as follows:
     1021    db.getCollection('Webpages').find({URL:/svenkirsten.com/, mriSentenceCount: {$gt: 0}})
     1022
    9281023
    9291024* CN: Only 1/113 sites from CN stood out as being of interest: http://kiwi2china.com/
     
    9371032    http://splaf.free.fr/pfurb.html - Tahiti, French Polynesian, ... island names
    9381033    http://mi.fitnessrebates.com - Uses https://wordpress.org/plugins/weglot/ wordpress-compatible multilingual plugin, which ensures translated pages get indexed by google - exactly what we want to avoid
    939 *
    940 
     1034*
     1035
     1036
     1037DE:
     1038http://etymologie.info/~e/n_/nz-___reg.html - placenames, not meaningful
     1039!! https://www.cartogiraffe.com/ - some genuine pages (Rarotongan), but one page is in Czech that had a single word misidentified as MRI
     1040~ http://svenkirsten.com/ - one page mentions "tiki" but the rest is in English. The other is an (English) caption of "Book of Tiki A Maori Maiden"
     1041- herocity - autotranslated
     1042- weltderberge.de - 3 pages mention NZ mountains by name.
     1043~ (arts.mythologica.fr) https://mythologica.fr/oceanie/texte/pantheon_polynesien.pdf - mentions certain Maori Gods and other Polynesian Gods by name.
     1044- https://traynews.com - nothing in MRI, misdetected
     1045~ http://klaaskoehne.de/galleries/nzl-taranaki/index.html - mentions NZ mountain names
     1046- http://www.nierstrasz.org/deGrauwRegister.rtf - misdetected European (Dutch) names as MRI
     1047- https://afrikhepri.org/mi/ - autotranslated
     1048- https://www.tvteile.de - pure German pages, misdetected "Automatik" as a Maori language word
     1049- etoile-de-lune.net - 5 pages containing 1 sentence each but none with 2 sentences detected
     1050
     1051- ITALY:
     1052  http://oipaz.net/IMG/GalleriaAotearoa/ - NZ photogallery with each photo captioned by placename
     1053  http://www.marcosanti.it/Reportage/Oceania_ph/Nuova_Zelanda/ - each photo captioned by NZ placename
     1054  http://www.pegasoesmicamion.com/ - REO abbreviation misidentified, also in REO%20PUBLICIDAD.htm
     1055- AUSTRIA:
     1056  petit-prince.at - Tahitian and Wayuu (Venezuela) translations of Le Petit Prince
     1057  http://www.tmtmm.net/newzealand - photos from NZ named after places and people's names
     1058- ROMANIA: parohiauceadesus.ro - Sentences of single Romanian words misidentified.
     1059- ISRAEL:
     1060  http://www.daat.ac.il - misidentification of "no." as MRI, and Hebrew words.
     1061  https://www.hitiaotera.com/ - misidentification of Tahitian pages
     1062- RUSSIA: https://www.gismeteo.lv - misidentification of an email address
     1063- JAPAN: http://yutaka.it-n.jp - many pages of scientific names of (plants?) which are often misdetected as MRI
     1064!! Ireland, ie: https://coggle.it
     1065- IRAN: https://www.dideo.ir/v/yt/d6cgya0ze-E - video title from MaoriTelevision website
     1066? - CZECH republic: https://www.fipojobs.com/new-zealand/jobs-work-p-1 - NZ job position title in MRI but rest in English
     1067- SPAIN: http://www.info-hoteles.com/nz/2/hotels_lake_rotoiti.asp - 2 uses of the same placename
     1068- SINGAPORE: https://omg-solutions.com - autotranslated
     1069- TURKEY: https://www.elitedeluxe.com.tr/mi/yatak-odasi-takimlari - autotranslated
     1070- MEXICO: http://www.gelbukh.com - misidentification, lines of just numbers or phrases like "Area Chair" in English and Russian CVs.
     1071- FINLAND: http://pertti.com - travelogue, placenames
     1072- SWITZERLAND CH:
     1073  nicoledidi.ch - blog, placenames
     1074  https://photos.axelebert.org - Tahiti related content
     1075- UNKNOWN: https://www.viveipcl.com: tours website, placenames mentioned
     1076#- EU: https://www.the-good-stuff-factory.be/mi/ : Autotranslated
     1077!! - BULGARIA: http://anitra.net/activism/humanrights/UDHR/rrt_print.htm (2 pages)
     1078
     1079
     1080TREATING AUSTRALIA AND GREAT BRITAIN MORE SPECIALLY (don't ignore /mi in URL, same as with NZ, but do leave out .nz TLDs since we cover them under NZ - TODO: later find country codes of all .nz TLDs):
     1081[nothing found under "UK", only under "GB"]
     1082
     1083db.getCollection('Websites').find({
     1084    domain: {$not: /.nz$/},
     1085    numPagesContainingMRI: {$gt: 0},   
     1086    $or: [{geoLocationCountryCode: "AU"}, {geoLocationCountryCode: "GB"}]
     1087}).count()
     108811
     1089
     1090db.Websites.aggregate([
     1091    {
     1092        $match: {
     1093           domain: {$not: /.nz$/},
     1094           numPagesContainingMRI: {$gt: 0},           
     1095           $or: [{geoLocationCountryCode: "AU"}, {geoLocationCountryCode: "GB"}]
     1096        }
     1097    },
     1098    { $unwind: "$geoLocationCountryCode" },
     1099    {
     1100        $group: {
     1101            _id: {$toLower: '$geoLocationCountryCode'},
     1102            count: { $sum: 1 },
     1103            domain: { $addToSet: '$domain' }
     1104        }
     1105    },
     1106    { $sort : { count : -1} }
     1107]);
     1108
     1109AUSTRALIA:
     1110!!  https://www.kiwiproperty.com - e.g. https://www.kiwiproperty.com/the-base/mi/he-paepaki/ has some actual MRI sentences. [Not autotranslated]
     1111?   http://fionajack.net - Wellington gallery of artist. A few occurrences of Kia Ora in a title like context (e.g. "Street Party Kia Ora! Kia Ora!")
     1112!!  https://infogram.com/te-marautanga-o-aotearoa-moe-pld-allocations-2012-1go502ygvn562jd  - site of individual pages (like docs.google.com). This one has a relevant infogram image.
     1113!!  https://koreromaori.com - some actual Maori language sentences
     1114    http://theunderwaterworld.com/Galleries/Roimata/roimata-frame.html - placenames
     1115
     1116UK:
     1117    http://www.wordsearchfun.com/200628_Word_Find_wordsearch.html - 2 word games with Maori words (one of them has 3 different views, e.g. print view)
     1118?   https://omniatlas.com/maps/australasia/18400206/plain/ - historical map with Maori iwi names over NZ map regions
     1119?      https://omniatlas.com/maps/australasia/18400206/ - historical map of Australia and NZ at the time of the Treaty of Waitangi, with events marked in English
     1120    https://centrallanguageschool.com - AUTOTRANSLATED
     1121    https://www.solasolv.com - Autotranslated product site
     1122    http://mikestephens.co.uk/ - photo captions containing NZ placenames
     1123    http://www.woolrych.org/nzholiday2004/ - photogallery captioned with NZ placenames
    9411124--------------
    9421125
     
    9591142
    9601143---------------------
     1144
     1145Count of websites that have at least 1 page containing at least one sentence detected as MRI
     1146AND which have mi in the URL path:
     1147
     1148db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{urlContainsLangCodeInPath: true}]}).count()
     1149
     1150491
     1151
     1152
     1153
     1154# The websites that have some MRI detected AND which are either in NZ, or have a .nz TLD,
     1155# or (i.e. if they're from overseas) don't contain /mi or mi.* in the URL path:
     1156
     1157db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}]}).count()
     1158396
     1159
     1160Include Australia (to get the valid "kiwiproperty.com" website included in the result list):
     1161
     1162db.getCollection('Websites').find({$and: [
     1163                {numPagesContainingMRI: {$gt: 0}},
     1164                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
     1165            ]}).count()
     1166
     1167397
     1168
     1169# aggregate results by a count of country codes
     1170db.Websites.aggregate([
     1171    {
     1172        $match: {
     1173            $and: [
     1174                {numPagesContainingMRI: {$gt: 0}},
     1175                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
     1176            ]
     1177        }
     1178    },
     1179    { $unwind: "$geoLocationCountryCode" },
     1180    {
     1181        $group: {
     1182            _id: {$toLower: '$geoLocationCountryCode'},
     1183            count: { $sum: 1 }
     1184        }
     1185    },
     1186    { $sort : { count : -1} }
     1187]);
     1188
     1189
     1190# Just considering those sites that are outside NZ and do not have a .nz TLD:
     1191
     1192db.getCollection('Websites').find({$and: [
     1193                {geoLocationCountryCode: {$ne: "NZ"}},
     1194                {domain: {$not: /\.nz/}},
     1195                {numPagesContainingMRI: {$gt: 0}},
     1196                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     1197            ]}).count()
     1198
     1199221 websites
     1200
     1201# counts by country code excluding NZ related sites
     1202db.Websites.aggregate([
     1203    {
     1204        $match: {
     1205            $and: [
     1206                {geoLocationCountryCode: {$ne: "NZ"}},
     1207                {domain: {$not: /\.nz/}},
     1208                {numPagesContainingMRI: {$gt: 0}},
     1209                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     1210            ]
     1211        }
     1212    },
     1213    { $unwind: "$geoLocationCountryCode" },
     1214    {
     1215        $group: {
     1216            _id: {$toLower: '$geoLocationCountryCode'},
     1217            count: { $sum: 1 },
     1218            domain: { $addToSet: '$domain' }
     1219        }
     1220    },
     1221    { $sort : { count : -1} }
     1222]);
     1223
     1224
     1225# But to produce the tentative non-product sites, we also want the aggregate for all NZ sites (from NZ or with .nz tld):
     1226db.getCollection('Websites').find({$and: [
     1227                {numPagesContainingMRI: {$gt: 0}},
     1228                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     1229            ]}).count()
     1230
     1231176
     1232
     1233(Total is 221+176 = 397, which adds up).
     1234
     1235# Get the count (and domain listing) output placed under a hardcoded _id of "nz":
     1236db.Websites.aggregate([
     1237    {
     1238        $match: {
     1239            $and: [
     1240                {numPagesContainingMRI: {$gt: 0}},
     1241                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
     1242            ]
     1243        }
     1244    },
     1245    { $unwind: "$geoLocationCountryCode" },
     1246    {
     1247        $group: {
     1248            _id: "nz",
     1249            count: { $sum: 1 },
     1250            domain: { $addToSet: '$domain' }
     1251        }
     1252    },
     1253    { $sort : { count : -1} }
     1254]);
Note: See TracChangeset for help on using the changeset viewer.