Context Navigation

← Previous Change
Next Change →

mongodb.txt

Timestamp:

2019-12-18T21:38:44+13:00 (4 years ago)

Author:

ak19

Message:

With the bugfix from yesterday and the inclusion of http(s):mi.* type URLs in setting the Websites mongodb collection's urlContainsLangCodeInPath property, and updated/improved mongodb queries and their results I have now regenerated the latest geojson json data and maps.

File:

: 1 edited

other-projects/maori-lang-detection/MoreReading/mongodb.txt (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/maori-lang-detection/MoreReading/mongodb.txt

-              r33807
+              r33813
 # Find number of websites who have 1 or more pages in Maori (a positive numPagesInMRI)
+# Find number of websites that have 1 or more pages detected as being in Maori (a positive numPagesInMRI)
 db.getCollection('Websites').find({numPagesInMRI: { $gt: 0}}).count()
 …
 # Number of sites with URLs containing /mi(/)
 db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
+X 153
+# Number of sites with URLs containing /mi(/) OR http(s)://mi.*
+db.getCollection('Websites').find({urlContainsLangCodeInPath:true}).count()
 # Number of websites that are outside NZ that contain /mi(/) in any of its sub-urls
 db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
+# 5 sites with URLs containing /mi(/) that are in NZ
+X 147
+# Number of websites that are outside NZ that contain /mi(/) OR http(s)://mi.* in any of its sub-urls
+db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: {$ne : "NZ"} }).count()
+# 6 sites with URLs containing /mi(/) that are in NZ
 db.getCollection('Websites').find({urlContainsLangCodeInPath:true, geoLocationCountryCode: "NZ"}).count()
+X 6
+# 14 sites with URLs containing /mi(/) OR http(s)://mi.* that are in NZ
 …
 WORKS:
 // count of country codes for sites that have /mi(/) in path
+// count of country codes for sites that have /mi(/) or http(s)://mi.* in URL path
 db.Websites.aggregate([
 …
 # These are the TENTATIVE NON-PRODUCT SITES
 # Should be less than the point 4, but more than 1 to 3
 db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{urlContainsLangCodeInPath: false}, {$and: [{urlContainsLangCodeInPath: true}, {geoLocationCountryCode: "NZ"}]}]}]}).count()
+X 859
+Now with http(s)://mi.* also excluded, the above query returns a count of:
+BUT THIS IS THE CORRECT VERSION OF THE QUERY:
+db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {urlContainsLangCodeInPath: false}]}]}).count()
 # 6. Now do the counts by country code of the above, by pasting the query of point 5 as the $match clause (i.e. without the .count() suffix)
 …
+    {
         $match: {
             $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPath: {$ne: true}}, ]
+            $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPath: {$ne: true}}]
+        }
     },
 …
     { $sort : { count : -1} }
 ]);
+We can knock off another 54 non-NZ sites with our new urlContainsLangCodeInPathPrefix field:
+   db.getCollection('Websites').find({urlContainsLangCodeInPathPrefix: true, geoLocationCountryCode: {$ne: "NZ"}, domain: {$not: /.nz$/}}).count()
+SO, can repeat query with new field "urlContainsLangCodeInPathPrefix":
+Number of sites containing >= 1 MRI sentences that are not from NZ or of .nz TLD and which don't contain "/mi(/)" or "http(s)://mi." in URL path:
+   db.getCollection('Websites').find({$and: [
+                     {numPagesContainingMRI: {$gt: 0}},
+                     {geoLocationCountryCode: {$ne: "NZ"}},
+                     {domain: {$not: /.nz$/}},
+                     {urlContainsLangCodeInPathSuffix: {$ne: true}},
+                     {urlContainsLangCodeInPathPrefix: {$ne: true}}
+                ]}).count()
+REDO THE COUNT BY COUNTRY QUERY FOR THIS:
+db.Websites.aggregate([
+    {
+        $match: {
+            $and: [{numPagesContainingMRI: {$gt: 0}}, {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /.nz$/}}, {urlContainsLangCodeInPathSuffix: {$ne: true}}, {urlContainsLangCodeInPathPrefix: {$ne: true}}]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: {$toLower: '$geoLocationCountryCode'},
+            count: { $sum: 1 },
+            domain: { $addToSet: '$domain' }
+        }
+    },
+    { $sort : { count : -1} }
+]);
+AFTER THE BUGFIX SO THAT miInURLPath IS NOW BEING SET CORRECTLY:
+db.getCollection('Websites').find(
+{$and: [
+    {numPagesContainingMRI: {$gt: 0}},
+    {geoLocationCountryCode: {$ne: "NZ"}},
+    {domain: {$not: /.nz$/}},
+    {urlContainsLangCodeInPath: {$ne: true}}
+]}).count()
+db.Websites.aggregate([
+    {
+        $match: {
+            $and: [
+            {numPagesContainingMRI: {$gt: 0}},
+            {geoLocationCountryCode: {$ne: "NZ"}},
+            {domain: {$not: /.nz$/}},
+            {urlContainsLangCodeInPath: {$ne: true}}
+            ]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: {$toLower: '$geoLocationCountryCode'},
+            count: { $sum: 1 },
+            domain: { $addToSet: '$domain' }
+        }
+    },
+    { $sort : { count : -1} }
+]);
+Can inspect websites' pages for whether it's relevant/auto-translated as follows:
+    db.getCollection('Webpages').find({URL:/svenkirsten.com/, mriSentenceCount: {$gt: 0}})
 * CN: Only 1/113 sites from CN stood out as being of interest: http://kiwi2china.com/
 …
     http://splaf.free.fr/pfurb.html - Tahiti, French Polynesian, ... island names
     http://mi.fitnessrebates.com - Uses https://wordpress.org/plugins/weglot/ wordpress-compatible multilingual plugin, which ensures translated pages get indexed by google - exactly what we want to avoid
+*
+*
+DE:
+http://etymologie.info/~e/n_/nz-___reg.html - placenames, not meaningful
+!! https://www.cartogiraffe.com/ - some genuine pages (Rarotongan), but one page is in Czech that had a single word misidentified as MRI
+~ http://svenkirsten.com/ - one page mentions "tiki" but the rest is in English. The other is an (English) caption of "Book of Tiki A Maori Maiden"
+- herocity - autotranslated
+- weltderberge.de - 3 pages mention NZ mountains by name.
+~ (arts.mythologica.fr) https://mythologica.fr/oceanie/texte/pantheon_polynesien.pdf - mentions certain Maori Gods and other Polynesian Gods by name.
+- https://traynews.com - nothing in MRI, misdetected
+~ http://klaaskoehne.de/galleries/nzl-taranaki/index.html - mentions NZ mountain names
+- http://www.nierstrasz.org/deGrauwRegister.rtf - misdetected European (Dutch) names as MRI
+- https://afrikhepri.org/mi/ - autotranslated
+- https://www.tvteile.de - pure German pages, misdetected "Automatik" as a Maori language word
+- etoile-de-lune.net - 5 pages containing 1 sentence each but none with 2 sentences detected
+- ITALY:
+  http://oipaz.net/IMG/GalleriaAotearoa/ - NZ photogallery with each photo captioned by placename
+  http://www.marcosanti.it/Reportage/Oceania_ph/Nuova_Zelanda/ - each photo captioned by NZ placename
+  http://www.pegasoesmicamion.com/ - REO abbreviation misidentified, also in REO%20PUBLICIDAD.htm
+- AUSTRIA:
+  petit-prince.at - Tahitian and Wayuu (Venezuela) translations of Le Petit Prince
+  http://www.tmtmm.net/newzealand - photos from NZ named after places and people's names
+- ROMANIA: parohiauceadesus.ro - Sentences of single Romanian words misidentified.
+- ISRAEL:
+  http://www.daat.ac.il - misidentification of "no." as MRI, and Hebrew words.
+  https://www.hitiaotera.com/ - misidentification of Tahitian pages
+- RUSSIA: https://www.gismeteo.lv - misidentification of an email address
+- JAPAN: http://yutaka.it-n.jp - many pages of scientific names of (plants?) which are often misdetected as MRI
+!! Ireland, ie: https://coggle.it
+- IRAN: https://www.dideo.ir/v/yt/d6cgya0ze-E - video title from MaoriTelevision website
+? - CZECH republic: https://www.fipojobs.com/new-zealand/jobs-work-p-1 - NZ job position title in MRI but rest in English
+- SPAIN: http://www.info-hoteles.com/nz/2/hotels_lake_rotoiti.asp - 2 uses of the same placename
+- SINGAPORE: https://omg-solutions.com - autotranslated
+- TURKEY: https://www.elitedeluxe.com.tr/mi/yatak-odasi-takimlari - autotranslated
+- MEXICO: http://www.gelbukh.com - misidentification, lines of just numbers or phrases like "Area Chair" in English and Russian CVs.
+- FINLAND: http://pertti.com - travelogue, placenames
+- SWITZERLAND CH:
+  nicoledidi.ch - blog, placenames
+  https://photos.axelebert.org - Tahiti related content
+- UNKNOWN: https://www.viveipcl.com: tours website, placenames mentioned
+#- EU: https://www.the-good-stuff-factory.be/mi/ : Autotranslated
+!! - BULGARIA: http://anitra.net/activism/humanrights/UDHR/rrt_print.htm (2 pages)
+TREATING AUSTRALIA AND GREAT BRITAIN MORE SPECIALLY (don't ignore /mi in URL, same as with NZ, but do leave out .nz TLDs since we cover them under NZ - TODO: later find country codes of all .nz TLDs):
+[nothing found under "UK", only under "GB"]
+db.getCollection('Websites').find({
+    domain: {$not: /.nz$/},
+    numPagesContainingMRI: {$gt: 0},
+    $or: [{geoLocationCountryCode: "AU"}, {geoLocationCountryCode: "GB"}]
+}).count()
+db.Websites.aggregate([
+    {
+        $match: {
+           domain: {$not: /.nz$/},
+           numPagesContainingMRI: {$gt: 0},
+           $or: [{geoLocationCountryCode: "AU"}, {geoLocationCountryCode: "GB"}]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: {$toLower: '$geoLocationCountryCode'},
+            count: { $sum: 1 },
+            domain: { $addToSet: '$domain' }
+        }
+    },
+    { $sort : { count : -1} }
+]);
+AUSTRALIA:
+!!  https://www.kiwiproperty.com - e.g. https://www.kiwiproperty.com/the-base/mi/he-paepaki/ has some actual MRI sentences. [Not autotranslated]
+?   http://fionajack.net - Wellington gallery of artist. A few occurrences of Kia Ora in a title like context (e.g. "Street Party Kia Ora! Kia Ora!")
+!!  https://infogram.com/te-marautanga-o-aotearoa-moe-pld-allocations-2012-1go502ygvn562jd  - site of individual pages (like docs.google.com). This one has a relevant infogram image.
+!!  https://koreromaori.com - some actual Maori language sentences
+    http://theunderwaterworld.com/Galleries/Roimata/roimata-frame.html - placenames
+UK:
+    http://www.wordsearchfun.com/200628_Word_Find_wordsearch.html - 2 word games with Maori words (one of them has 3 different views, e.g. print view)
+?   https://omniatlas.com/maps/australasia/18400206/plain/ - historical map with Maori iwi names over NZ map regions
+?      https://omniatlas.com/maps/australasia/18400206/ - historical map of Australia and NZ at the time of the Treaty of Waitangi, with events marked in English
+    https://centrallanguageschool.com - AUTOTRANSLATED
+    https://www.solasolv.com - Autotranslated product site
+    http://mikestephens.co.uk/ - photo captions containing NZ placenames
+    http://www.woolrych.org/nzholiday2004/ - photogallery captioned with NZ placenames
 --------------
 …
 ---------------------
+Count of websites that have at least 1 page containing at least one sentence detected as MRI
+AND which websites have mi in the URL path:
+db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{urlContainsLangCodeInPath: true}]}).count()
+# The websites that have some MRI detected AND which are either in NZ or with NZ TLD
+# or, if they're from overseas, that don't contain /mi or mi.* in the URL path:
+db.getCollection('Websites').find({$and: [{numPagesContainingMRI: {$gt: 0}},{$or: [{geoLocationCountryCode: "NZ"}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}]}).count()
+Include Australia (to get the valid "kiwiproperty.com" website included in the result list):
+db.getCollection('Websites').find({$and: [
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
+            ]}).count()
+# aggregate results by a count of country codes
+db.Websites.aggregate([
+    {
+        $match: {
+            $and: [
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
+            ]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: {$toLower: '$geoLocationCountryCode'},
+            count: { $sum: 1 }
+        }
+    },
+    { $sort : { count : -1} }
+]);
+# Just considering those sites outside NZ or not with .nz TLD:
+db.getCollection('Websites').find({$and: [
+                {geoLocationCountryCode: {$ne: "NZ"}},
+                {domain: {$not: /\.nz/}},
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
+            ]}).count()
+websites
+# counts by country code excluding NZ related sites
+db.Websites.aggregate([
+    {
+        $match: {
+            $and: [
+                {geoLocationCountryCode: {$ne: "NZ"}},
+                {domain: {$not: /\.nz/}},
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
+            ]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: {$toLower: '$geoLocationCountryCode'},
+            count: { $sum: 1 },
+            domain: { $addToSet: '$domain' }
+        }
+    },
+    { $sort : { count : -1} }
+]);
+# But to produce the tentative non-product sites, we also want the aggregate for all NZ sites (from NZ or with .nz tld):
+db.getCollection('Websites').find({$and: [
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
+            ]}).count()
+(Total is 221+176 = 397, which adds up).
+# Get the count (and domain listing) output under a hardcoded _id of "nz":
+db.Websites.aggregate([
+    {
+        $match: {
+            $and: [
+                {numPagesContainingMRI: {$gt: 0}},
+                {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
+            ]
+        }
+    },
+    { $unwind: "$geoLocationCountryCode" },
+    {
+        $group: {
+            _id: "nz",
+            count: { $sum: 1 },
+            domain: { $addToSet: '$domain' }
+        }
+    },
+    { $sort : { count : -1} }
+]);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33813 for other-projects/maori-lang-detection/MoreReading/mongodb.txt

Legend:

other-projects/maori-lang-detection/MoreReading/mongodb.txt

Download in other formats: