Changeset 33889
- Timestamp:
- 2020-02-03T15:48:40+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/mongodb-data
- Files:
-
- 9 added
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/mongodb-data/1a_table_miInUrlPath.csv
r33848 r33889 1 "_id","count","numPagesInMRICount","numPagesContainingMRICount" 2 "US","408.0","1169","3872" 3 "CN","123.0","281","1144" 4 "FR","34.0","754","1091" 5 "UNKNOWN","19.0","115","125" 6 "NZ","14.0","618","1097" 7 "DE","12.0","145","212" 8 "NL","8.0","76","115" 9 "CA","7.0","29","119" 10 "HK","7.0","3","12" 11 "AU","7.0","19","117" 12 "GB","5.0","3","7" 13 "JP","5.0","1","3" 14 "UA","4.0","9","10" 15 "RU","4.0","0","14" 16 "VG","2.0","0","0" 17 "SG","2.0","2","13" 18 "DK","2.0","0","0" 19 "IE","1.0","17","21" 20 "ZA","1.0","0","0" 21 "TR","1.0","0","2" 22 "SE","1.0","0","0" 23 "EU","1.0","0","7" 24 "CZ","1.0","0","0" 25 "ES","1.0","4","4" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites" 2 "US","408.0","1169","3872","199487" 3 "CN","123.0","281","1144","79576" 4 "FR","34.0","754","1091","7983" 5 "UNKNOWN","19.0","115","125","1196" 6 "NZ","14.0","618","1097","5959" 7 "DE","12.0","145","212","5718" 8 "NL","8.0","76","115","8351" 9 "CA","7.0","29","119","11577" 10 "AU","7.0","19","117","549" 11 "HK","7.0","3","12","620" 12 "JP","5.0","1","3","420" 13 "GB","5.0","3","7","1948" 14 "UA","4.0","9","10","1144" 15 "RU","4.0","0","14","30" 16 "DK","2.0","0","0","24" 17 "VG","2.0","0","0","2" 18 "SG","2.0","2","13","1373" 19 "ZA","1.0","0","0","2" 20 "SE","1.0","0","0","6" 21 "TR","1.0","0","2","505" 22 "EU","1.0","0","7","250" 23 "ES","1.0","4","4","3648" 24 "IE","1.0","17","21","451" 25 "CZ","1.0","0","0","1" 26 -
other-projects/maori-lang-detection/mongodb-data/1b_table_noMiInUrlPath.csv
r33848 r33889 1 "_id","count","numPagesInMRICount","numPagesContainingMRICount" 2 "US","288.0","2136","5452" 3 "UNKNOWN","154.0","0","12" 4 "NZ","101.0","2035","5129" 5 "DE","40.0","1","208" 6 "AU","36.0","164","313" 7 "FR","35.0","22","244" 8 "NL","24.0","127","265" 9 "GB","13.0","1","42" 10 "CA","12.0","1","209" 11 "DK","8.0","4","8" 12 "ES","7.0","1","50" 13 "CZ","6.0","0","32" 14 "JP","5.0","0","100" 15 "IT","4.0","0","18" 16 "RO","3.0","73","116" 17 "IE","3.0","1","3" 18 "RU","3.0","0","1" 19 "AT","3.0","0","61" 20 "IN","3.0","0","0" 21 "SE","3.0","0","0" 22 "IL","3.0","0","19" 23 "CH","3.0","0","4" 24 "CN","2.0","0","34" 25 "PL","2.0","0","0" 26 "CK","2.0","0","0" 27 "IO","1.0","0","0" 28 "SG","1.0","3","10" 29 "FI","1.0","0","29" 30 "UA","1.0","0","0" 31 "IR","1.0","0","1" 32 "ZA","1.0","0","0" 33 "MX","1.0","1","21" 34 "PT","1.0","0","0" 35 "GR","1.0","1","3" 36 "PF","1.0","0","0" 37 "ME","1.0","0","0" 38 "BG","1.0","2","2" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites" 2 "US","288.0","2136","5452","213005" 3 "UNKNOWN","154.0","0","12","78" 4 "NZ","101.0","2035","5129","31360" 5 "DE","40.0","1","208","30046" 6 "AU","36.0","164","313","10490" 7 "FR","35.0","22","244","29152" 8 "NL","24.0","127","265","10918" 9 "GB","13.0","1","42","2751" 10 "CA","12.0","1","209","11931" 11 "DK","8.0","4","8","16" 12 "ES","7.0","1","50","2671" 13 "CZ","6.0","0","32","9969" 14 "JP","5.0","0","100","1005" 15 "IT","4.0","0","18","684" 16 "SE","3.0","0","0","3" 17 "IN","3.0","0","0","110" 18 "AT","3.0","0","61","984" 19 "RU","3.0","0","1","1994" 20 "IE","3.0","1","3","6" 21 "RO","3.0","73","116","240" 22 "IL","3.0","0","19","641" 23 "CH","3.0","0","4","305" 24 "PL","2.0","0","0","4671" 25 "CN","2.0","0","34","716" 26 "CK","2.0","0","0","2" 27 "PT","1.0","0","0","2852" 28 "MX","1.0","1","21","236" 29 "IR","1.0","0","1","5" 30 "ZA","1.0","0","0","1" 31 "FI","1.0","0","29","1124" 32 "UA","1.0","0","0","294" 33 "SG","1.0","3","10","23" 34 "IO","1.0","0","0","2" 35 "BG","1.0","2","2","2" 36 "ME","1.0","0","0","1" 37 "PF","1.0","0","0","2" 38 "GR","1.0","1","3","4" 39 -
other-projects/maori-lang-detection/mongodb-data/1table_allCrawledSites.csv
r33848 r33889 1 "_id","count","numPagesInMRICount","numPagesContainingMRICount" 2 "US","696.0","3305","9324" 3 "UNKNOWN","173.0","115","137" 4 "CN","125.0","281","1178" 5 "NZ","115.0","2653","6226" 6 "FR","69.0","776","1335" 7 "DE","52.0","146","420" 8 "AU","43.0","183","430" 9 "NL","32.0","203","380" 10 "CA","19.0","30","328" 11 "GB","18.0","4","49" 12 "DK","10.0","4","8" 13 "JP","10.0","1","103" 14 "ES","8.0","5","54" 15 "CZ","7.0","0","32" 16 "RU","7.0","0","15" 17 "HK","7.0","3","12" 18 "UA","5.0","9","10" 19 "IE","4.0","18","24" 20 "IT","4.0","0","18" 21 "SE","4.0","0","0" 22 "RO","3.0","73","116" 23 "SG","3.0","5","23" 24 "AT","3.0","0","61" 25 "IN","3.0","0","0" 26 "IL","3.0","0","19" 27 "CH","3.0","0","4" 28 "PL","2.0","0","0" 29 "ZA","2.0","0","0" 30 "VG","2.0","0","0" 31 "CK","2.0","0","0" 32 "IO","1.0","0","0" 33 "FI","1.0","0","29" 34 "IR","1.0","0","1" 35 "TR","1.0","0","2" 36 "EU","1.0","0","7" 37 "PT","1.0","0","0" 38 "MX","1.0","1","21" 39 "GR","1.0","1","3" 40 "PF","1.0","0","0" 41 "ME","1.0","0","0" 42 "BG","1.0","2","2" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSites" 2 "US","696.0","3305","9324","412492" 3 "UNKNOWN","173.0","115","137","1274" 4 "CN","125.0","281","1178","80292" 5 "NZ","115.0","2653","6226","37319" 6 "FR","69.0","776","1335","37135" 7 "DE","52.0","146","420","35764" 8 "AU","43.0","183","430","11039" 9 "NL","32.0","203","380","19269" 10 "CA","19.0","30","328","23508" 11 "GB","18.0","4","49","4699" 12 "JP","10.0","1","103","1425" 13 "DK","10.0","4","8","40" 14 "ES","8.0","5","54","6319" 15 "RU","7.0","0","15","2024" 16 "HK","7.0","3","12","620" 17 "CZ","7.0","0","32","9970" 18 "UA","5.0","9","10","1438" 19 "SE","4.0","0","0","9" 20 "IT","4.0","0","18","684" 21 "IE","4.0","18","24","457" 22 "IN","3.0","0","0","110" 23 "SG","3.0","5","23","1396" 24 "AT","3.0","0","61","984" 25 "RO","3.0","73","116","240" 26 "IL","3.0","0","19","641" 27 "CH","3.0","0","4","305" 28 "VG","2.0","0","0","2" 29 "ZA","2.0","0","0","3" 30 "PL","2.0","0","0","4671" 31 "CK","2.0","0","0","2" 32 "PT","1.0","0","0","2852" 33 "IR","1.0","0","1","5" 34 "TR","1.0","0","2","505" 35 "MX","1.0","1","21","236" 36 "FI","1.0","0","29","1124" 37 "IO","1.0","0","0","2" 38 "EU","1.0","0","7","250" 39 "BG","1.0","2","2","2" 40 "ME","1.0","0","0","1" 41 "PF","1.0","0","0","2" 42 "GR","1.0","1","3","4" 43 -
other-projects/maori-lang-detection/mongodb-data/2table_sitesWithPagesInMRI.csv
r33886 r33889 1 "_id","count","numPagesInMRICount","numPagesContainingMRICount" 2 "us","206.0","3305","6327" 3 "nz","53.0","2653","5045" 4 "cn","32.0","281","542" 5 "fr","18.0","776","1101" 6 "au","11.0","183","358" 7 "nl","10.0","203","216" 8 "de","5.0","146","190" 9 "dk","4.0","4","4" 10 "gb","3.0","4","13" 11 "ca","3.0","30","35" 12 "unknown","2.0","115","125" 13 "ie","2.0","18","24" 14 "ua","2.0","9","10" 15 "es","2.0","5","5" 16 "sg","2.0","5","23" 17 "jp","1.0","1","3" 18 "hk","1.0","3","8" 19 "ro","1.0","73","104" 20 "bg","1.0","2","2" 21 "gr","1.0","1","3" 22 "mx","1.0","1","21" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSitesWithPositiveMRICount" 2 "us","206.0","3305","6327","174620" 3 "nz","53.0","2653","5045","21901" 4 "cn","32.0","281","542","27405" 5 "fr","18.0","776","1101","17622" 6 "au","11.0","183","358","1329" 7 "nl","10.0","203","216","9185" 8 "de","5.0","146","190","4195" 9 "dk","4.0","4","4","10" 10 "gb","3.0","4","13","2935" 11 "ca","3.0","30","35","2823" 12 "es","2.0","5","5","3649" 13 "ua","2.0","9","10","1140" 14 "sg","2.0","5","23","1357" 15 "ie","2.0","18","24","454" 16 "unknown","2.0","115","125","943" 17 "hk","1.0","3","8","500" 18 "jp","1.0","1","3","377" 19 "gr","1.0","1","3","4" 20 "mx","1.0","1","21","236" 21 "ro","1.0","73","104","105" 22 "bg","1.0","2","2","2" 23 -
other-projects/maori-lang-detection/mongodb-data/3table_sitesWithPagesContainingMRI.csv
r33848 r33889 1 "_id","count","numPagesInMRICount","numPagesContainingMRICount" 2 "us","486.0","3305","9324" 3 "cn","114.0","281","1178" 4 "nz","89.0","2653","6226" 5 "fr","36.0","776","1335" 6 "de","27.0","146","420" 7 "nl","22.0","203","380" 8 "au","21.0","183","430" 9 "ca","12.0","30","328" 10 "dk","8.0","4","8" 11 "gb","7.0","4","49" 12 "es","7.0","5","54" 13 "cz","4.0","0","32" 14 "unknown","3.0","115","137" 15 "ro","3.0","73","116" 16 "it","3.0","0","18" 17 "at","3.0","0","61" 18 "il","2.0","0","19" 19 "ua","2.0","9","10" 20 "ru","2.0","0","15" 21 "ch","2.0","0","4" 22 "hk","2.0","3","12" 23 "sg","2.0","5","23" 24 "ie","2.0","18","24" 25 "jp","2.0","1","103" 26 "tr","1.0","0","2" 27 "mx","1.0","1","21" 28 "fi","1.0","0","29" 29 "eu","1.0","0","7" 30 "gr","1.0","1","3" 31 "bg","1.0","2","2" 32 "ir","1.0","0","1" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossSitesWithPositiveContainsMRI" 2 "us","486.0","3305","9324","353593" 3 "cn","114.0","281","1178","78881" 4 "nz","89.0","2653","6226","36874" 5 "fr","36.0","776","1335","33647" 6 "de","27.0","146","420","31884" 7 "nl","22.0","203","380","16016" 8 "au","21.0","183","430","5013" 9 "ca","12.0","30","328","13795" 10 "dk","8.0","4","8","16" 11 "gb","7.0","4","49","3736" 12 "es","7.0","5","54","6318" 13 "cz","4.0","0","32","4698" 14 "it","3.0","0","18","396" 15 "unknown","3.0","115","137","1016" 16 "ro","3.0","73","116","240" 17 "at","3.0","0","61","984" 18 "ua","2.0","9","10","1140" 19 "ru","2.0","0","15","923" 20 "il","2.0","0","19","639" 21 "hk","2.0","3","12","600" 22 "sg","2.0","5","23","1357" 23 "ie","2.0","18","24","454" 24 "jp","2.0","1","103","966" 25 "ch","2.0","0","4","304" 26 "fi","1.0","0","29","1124" 27 "tr","1.0","0","2","505" 28 "mx","1.0","1","21","236" 29 "eu","1.0","0","7","250" 30 "gr","1.0","1","3","4" 31 "bg","1.0","2","2","2" 32 "ir","1.0","0","1","5" 33 -
other-projects/maori-lang-detection/mongodb-data/4table_tentativeNonProductSites.csv
r33848 r33889 1 "_id","siteCount","numPagesInMRICount","numPagesContainingMRICount" 2 "us","181.0","2212","5579" 3 "nz","89.0","2653","6226" 4 "au","21.0","183","430" 5 "de","19.0","1","208" 6 "fr","17.0","22","244" 7 "nl","16.0","127","265" 8 "dk","8.0","4","8" 9 "ca","7.0","1","209" 10 "es","6.0","1","50" 11 "gb","5.0","1","42" 12 "cz","4.0","0","32" 13 "at","3.0","0","61" 14 "it","3.0","0","18" 15 "ro","3.0","73","116" 16 "ch","2.0","0","4" 17 "il","2.0","0","19" 18 "ru","1.0","0","1" 19 "jp","1.0","0","100" 20 "unknown","1.0","0","12" 21 "ie","1.0","1","3" 22 "fi","1.0","0","29" 23 "sg","1.0","3","10" 24 "bg","1.0","2","2" 25 "cn","1.0","0","34" 26 "gr","1.0","1","3" 27 "ir","1.0","0","1" 28 "mx","1.0","1","21" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites" 2 "us","181.0","2212","5579","158656" 3 "nz","89.0","2653","6226","36874" 4 "au","21.0","183","430","5013" 5 "de","19.0","1","208","26617" 6 "fr","17.0","22","244","25705" 7 "nl","16.0","127","265","7669" 8 "dk","8.0","4","8","16" 9 "ca","7.0","1","209","2228" 10 "es","6.0","1","50","2670" 11 "gb","5.0","1","42","1813" 12 "cz","4.0","0","32","4698" 13 "ro","3.0","73","116","240" 14 "it","3.0","0","18","396" 15 "at","3.0","0","61","984" 16 "il","2.0","0","19","639" 17 "ch","2.0","0","4","304" 18 "sg","1.0","3","10","23" 19 "ir","1.0","0","1","5" 20 "fi","1.0","0","29","1124" 21 "ie","1.0","1","3","3" 22 "ru","1.0","0","1","909" 23 "jp","1.0","0","100","589" 24 "mx","1.0","1","21","236" 25 "unknown","1.0","0","12","73" 26 "gr","1.0","1","3","4" 27 "bg","1.0","2","2","2" 28 "cn","1.0","0","34","270" 29 -
other-projects/maori-lang-detection/mongodb-data/5table_tentativeNonProductSites1.csv
r33883 r33889 1 "_id","siteCount (numPagesContainingMRICount > 0)","numPagesInMRICount","numPagesContainingMRICount" 2 "nz","176.0","4360","9641" 3 "us","117.0","757","2655" 4 "de","19.0","1","208" 5 "nl","16.0","127","265" 6 "fr","16.0","22","243" 7 "dk","8.0","4","8" 8 "ca","7.0","1","209" 9 "au","5.0","8","102" 10 "cz","4.0","0","32" 11 "gb","4.0","1","40" 12 "es","4.0","1","7" 13 "it","3.0","0","18" 14 "at","3.0","0","61" 15 "ro","2.0","0","12" 16 "il","2.0","0","19" 17 "ch","2.0","0","4" 18 "ir","1.0","0","1" 19 "fi","1.0","0","29" 20 "ie","1.0","1","3" 21 "ru","1.0","0","1" 22 "jp","1.0","0","100" 23 "mx","1.0","1","21" 24 "unknown","1.0","0","12" 25 "bg","1.0","2","2" 26 "cn","1.0","0","34" 1 "countryCode","count","numPagesInMRICount","numPagesContainingMRICount","totalPagesAcrossMatchingSites" 2 "nz","176.0","4360","9641","87657" 3 "us","117.0","757","2655","113936" 4 "de","19.0","1","208","26617" 5 "fr","16.0","22","243","25547" 6 "nl","16.0","127","265","7669" 7 "dk","8.0","4","8","16" 8 "ca","7.0","1","209","2228" 9 "au","5.0","8","102","560" 10 "gb","4.0","1","40","1809" 11 "es","4.0","1","7","1354" 12 "cz","4.0","0","32","4698" 13 "it","3.0","0","18","396" 14 "at","3.0","0","61","984" 15 "ro","2.0","0","12","135" 16 "il","2.0","0","19","639" 17 "ch","2.0","0","4","304" 18 "fi","1.0","0","29","1124" 19 "ie","1.0","1","3","3" 20 "jp","1.0","0","100","589" 21 "ru","1.0","0","1","909" 22 "unknown","1.0","0","12","73" 23 "mx","1.0","1","21","236" 24 "bg","1.0","2","2","2" 25 "cn","1.0","0","34","270" 26 "ir","1.0","0","1","5" 27 -
other-projects/maori-lang-detection/mongodb-data/tables.txt
r33878 r33889 1 1 Instructions for producing the tables: 2 2 a. Copy the Javascript version of results for each mongodb query listed below into a text editor. 3 b. Then regex replace \/\*\s*\d+\s*\*\/ with ","and embed all the JS inside [].3 b. OPTIONAL: Then regex replace \/\*\s*\d+\s*\*\/ with a comma (','), remove the very first comma, and embed all the JS inside []. 4 4 c. Paste that Javascript into https://json-csv.com/ to get the CSV tables 5 5 … … 17 17 /*domain: { $addToSet: '$domain' },*/ 18 18 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 19 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 19 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 20 totalPagesAcrossSites: { $sum: '$totalPages'} 20 21 } 21 22 }, … … 35 36 /*domain: { $addToSet: '$domain' },*/ 36 37 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 37 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 38 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 39 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 38 40 } 39 41 }, … … 53 55 /*domain: { $addToSet: '$domain' },*/ 54 56 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 55 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 57 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 58 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 56 59 } 57 60 }, … … 75 78 /*domain: { $addToSet: '$domain' },*/ 76 79 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 77 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 80 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 81 totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'} 78 82 } 79 83 }, … … 97 101 /*domain: { $addToSet: '$domain' },*/ 98 102 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 99 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 103 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 104 totalPagesAcrossSitesWithPositiveContainsMRI: { $sum: '$totalPages'} 100 105 } 101 106 }, … … 122 127 /*domain: { $addToSet: '$domain' },*/ 123 128 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 124 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 129 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 130 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 125 131 } 126 132 }, … … 151 157 /*domain: { $addToSet: '$domain' },*/ 152 158 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 153 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 159 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 160 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 154 161 } 155 162 }, … … 175 182 /*domain: { $addToSet: '$domain' },*/ 176 183 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 177 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 178 } 179 }, 180 { $sort : { count : -1} } 181 ]); 182 183 184 To find NZ web pages in MRI the following may be BETTER, 184 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 185 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 186 } 187 }, 188 { $sort : { count : -1} } 189 ]); 190 191 192 To find NZ web pages IN MRI the following may be BETTER, 185 193 as it looks for sites with positive numPagesINMRI rather than sites that only have positive containingMRI: 186 194 … … 201 209 domain: { $addToSet: '$domain' }, 202 210 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 203 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 204 } 205 }, 206 { $sort : { count : -1} } 207 ]); 208 211 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }, 212 totalPagesAcrossMatchingSites: { $sum: '$totalPages'} 213 } 214 }, 215 { $sort : { count : -1} } 216 ]); 217
Note:
See TracChangeset
for help on using the changeset viewer.