source: other-projects/maori-lang-detection/mongodb-data/4counts_tentativeNonProductSites.json@ 33823

Last change on this file since 33823 was 33823, checked in by ak19, 4 years ago

Recommitting mongo-data folder with renamed files with numbering.

File size: 2.4 KB
Line 
1/*
2
3The websites that have some MRI detected AND which are either located in NZ, have an NZ TLD,
4or (if they are hosted overseas) do not contain /mi or mi.* in the URL path.
5We also include Australia (AU) in the country-code filter so that the valid "kiwiproperty.com" website,
6which would otherwise be the sole exception, is included in the result list.
7
8
9db.getCollection('Websites').find({$and: [
10 {numPagesContainingMRI: {$gt: 0}},
11 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
12 ]}).count()
13
14397
15
16Aggregate the same result set into a count of websites per (lower-cased) country code, sorted by descending count:
17
18db.Websites.aggregate([
19 {
20 $match: {
21 $and: [
22 {numPagesContainingMRI: {$gt: 0}},
23 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
24 ]
25 }
26 },
27 { $unwind: "$geoLocationCountryCode" },
28 {
29 $group: {
30 _id: {$toLower: '$geoLocationCountryCode'},
31 count: { $sum: 1 }
32 }
33 },
34 { $sort : { count : -1} }
35]);
36
37*/
38
39/* 1 */
40{
41 "_id" : "us",
42 "count" : 181.0
43}
44
45/* 2 */
46{
47 "_id" : "nz",
48 "count" : 89.0
49}
50
51/* 3 */
52{
53 "_id" : "au",
54 "count" : 21.0
55}
56
57/* 4 */
58{
59 "_id" : "de",
60 "count" : 19.0
61}
62
63/* 5 */
64{
65 "_id" : "fr",
66 "count" : 17.0
67}
68
69/* 6 */
70{
71 "_id" : "nl",
72 "count" : 16.0
73}
74
75/* 7 */
76{
77 "_id" : "dk",
78 "count" : 8.0
79}
80
81/* 8 */
82{
83 "_id" : "ca",
84 "count" : 7.0
85}
86
87/* 9 */
88{
89 "_id" : "es",
90 "count" : 6.0
91}
92
93/* 10 */
94{
95 "_id" : "gb",
96 "count" : 5.0
97}
98
99/* 11 */
100{
101 "_id" : "cz",
102 "count" : 4.0
103}
104
105/* 12 */
106{
107 "_id" : "at",
108 "count" : 3.0
109}
110
111/* 13 */
112{
113 "_id" : "it",
114 "count" : 3.0
115}
116
117/* 14 */
118{
119 "_id" : "ro",
120 "count" : 3.0
121}
122
123/* 15 */
124{
125 "_id" : "il",
126 "count" : 2.0
127}
128
129/* 16 */
130{
131 "_id" : "ch",
132 "count" : 2.0
133}
134
135/* 17 */
136{
137 "_id" : "bg",
138 "count" : 1.0
139}
140
141/* 18 */
142{
143 "_id" : "sg",
144 "count" : 1.0
145}
146
147/* 19 */
148{
149 "_id" : "mx",
150 "count" : 1.0
151}
152
153/* 20 */
154{
155 "_id" : "ir",
156 "count" : 1.0
157}
158
159/* 21 */
160{
161 "_id" : "cn",
162 "count" : 1.0
163}
164
165/* 22 */
166{
167 "_id" : "ie",
168 "count" : 1.0
169}
170
171/* 23 */
172{
173 "_id" : "jp",
174 "count" : 1.0
175}
176
177/* 24 */
178{
179 "_id" : "fi",
180 "count" : 1.0
181}
182
183/* 25 */
184{
185 "_id" : "gr",
186 "count" : 1.0
187}
188
189/* 26 */
190{
191 "_id" : "ru",
192 "count" : 1.0
193}
194
195/* 27 */
196{
197 "_id" : "unknown",
198 "count" : 1.0
199}
200
Note: See TracBrowser for help on using the repository browser.