source: other-projects/maori-lang-detection/mongodb-data/tables.txt@ 33913

Last change on this file since 33913 was 33913, checked in by ak19, 4 years ago
  1. Adjusted table mongodb query statements to be more exact, but same results. 2. Adjusted code to not treat Australia specially, as the AU site with mi in URL path has now shifted to US. 3. Differences in geoLocation results from previous mongoDB ingest to present one documented for cases not dealing with mi in URL path of overseas domains. 4.
File size: 7.6 KB
Line 
1Instructions for producing the tables:
2a. Copy the Javascript version of results for each mongodb query listed below into a text editor.
3b. OPTIONAL: Then regex replace \/\*\s*\d+\s*\*\/ with a comma (','), remove the very first comma, and embed all the JS inside [].
4c. Paste that Javascript into https://json-csv.com/ to get the CSV tables
5
6Note that for steps 5 and 5b (there is no step 6 below), there are 2 mongodb queries each: one for sites outside NZ and one for NZ sites. The results of both have to be merged into a single csv file.
7-----------
81. Table 1
9
/* Table 1: per-country totals over every document in Websites, largest count first. */
10db.Websites.aggregate([
11
12 { $unwind: "$geoLocationCountryCode" },
13 {
14 $group: {
15 _id: "$geoLocationCountryCode", /* grouped on the country code exactly as stored (no $toLower) */
16 count: { $sum: 1 }, /* number of documents in the group */
17 /*domain: { $addToSet: '$domain' },*/
18 numPagesInMRICount: { $sum: '$numPagesInMRI' },
19 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
20 totalPagesAcrossSites: { $sum: '$totalPages'}
21 }
22 },
23 { $sort : { count : -1} } /* descending by count */
24]);
25
26
271a.
28
/* Table 1a: per-country totals, restricted to documents whose urlContainsLangCodeInPath is true. */
29db.Websites.aggregate([
30 { $match: {urlContainsLangCodeInPath: true} },
31 { $unwind: "$geoLocationCountryCode" },
32 {
33 $group: {
34 _id: "$geoLocationCountryCode", /* grouped on the country code exactly as stored (no $toLower) */
35 count: { $sum: 1 }, /* number of matching documents in the group */
36 /*domain: { $addToSet: '$domain' },*/
37 numPagesInMRICount: { $sum: '$numPagesInMRI' },
38 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
39 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
40 }
41 },
42 { $sort : { count : -1} } /* descending by count */
43]);
44
45
461b.
47
/* Table 1b: per-country totals, restricted to documents whose urlContainsLangCodeInPath is false. */
48db.Websites.aggregate([
49 {$match: {urlContainsLangCodeInPath: false} },
50 { $unwind: "$geoLocationCountryCode" },
51 {
52 $group: {
53 _id: "$geoLocationCountryCode", /* grouped on the country code exactly as stored (no $toLower) */
54 count: { $sum: 1 }, /* number of matching documents in the group */
55 /*domain: { $addToSet: '$domain' },*/
56 numPagesInMRICount: { $sum: '$numPagesInMRI' },
57 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
58 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
59 }
60 },
61 { $sort : { count : -1} } /* descending by count */
62]);
63
64-----------
652. Table 2
66
/* Table 2: per-country totals for documents with numPagesInMRI greater than 0. */
67db.Websites.aggregate([
68 {
69 $match: {
70 numPagesInMRI: {$gt: 0}
71 }
72 },
73 { $unwind: "$geoLocationCountryCode" },
74 {
75 $group: {
76 _id: {$toLower: '$geoLocationCountryCode'}, /* group key is the lower-cased country code */
77 count: { $sum: 1 }, /* number of matching documents in the group */
78 /*domain: { $addToSet: '$domain' },*/
79 numPagesInMRICount: { $sum: '$numPagesInMRI' },
80 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
81 totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
82 }
83 },
84 { $sort : { count : -1} } /* descending by count */
85]);
86
87-----------
883. Table 3
89
/* Table 3: per-country totals for documents with numPagesContainingMRI greater than 0. */
90db.Websites.aggregate([
91 {
92 $match: {
93 numPagesContainingMRI: {$gt: 0}
94 }
95 },
96 { $unwind: "$geoLocationCountryCode" },
97 {
98 $group: {
99 _id: {$toLower: '$geoLocationCountryCode'}, /* group key is the lower-cased country code */
100 count: { $sum: 1 }, /* number of matching documents in the group */
101 /*domain: { $addToSet: '$domain' },*/
102 numPagesInMRICount: { $sum: '$numPagesInMRI' },
103 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
104 totalPagesAcrossSitesWithPositiveContainsMRI: { $sum: '$totalPages'}
105 }
106 },
107 { $sort : { count : -1} } /* descending by count */
108]);
109
110-----------
111
1124. Table 4
/* Table 4: per-country totals for documents with numPagesContainingMRI greater than 0 that also
   satisfy at least one of: country code matching NZ or AU, domain ending in .nz,
   or urlContainsLangCodeInPath being false. */
113db.Websites.aggregate([
114 {
115 $match: {
116 $and: [
117 {numPagesContainingMRI: {$gt: 0}},
/* The NZ|AU pattern below is unanchored, so it matches any country code value containing either substring. */
118 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
119 ]
120 }
121 },
122 { $unwind: "$geoLocationCountryCode" },
123 {
124 $group: {
125 _id: {$toLower: '$geoLocationCountryCode'}, /* group key is the lower-cased country code */
126 count: { $sum: 1 }, /* number of matching documents in the group */
127 /*domain: { $addToSet: '$domain' },*/
128 numPagesInMRICount: { $sum: '$numPagesInMRI' },
129 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
130 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
131 }
132 },
133 { $sort : { count : -1} } /* descending by count */
134]);
135
136-----------
1375. Table 5
138
139Outside of NZ:
140
/* Table 5, outside NZ: per-country totals for documents whose country code is not "NZ", whose domain
   does not end in .nz, whose numPagesContainingMRI is greater than 0, and which either have
   country code "AU" or have urlContainsLangCodeInPath false. */
141db.Websites.aggregate([
142 {
143 $match: {
144 $and: [
145 {geoLocationCountryCode: {$ne: "NZ"}},
146 {domain: {$not: /\.nz$/}},
147 {numPagesContainingMRI: {$gt: 0}},
148 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
149 ]
150 }
151 },
152 { $unwind: "$geoLocationCountryCode" },
153 {
154 $group: {
155 _id: {$toLower: '$geoLocationCountryCode'}, /* group key is the lower-cased country code */
156 count: { $sum: 1 }, /* number of matching documents in the group */
157 /*domain: { $addToSet: '$domain' },*/
158 numPagesInMRICount: { $sum: '$numPagesInMRI' },
159 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
160 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
161 }
162 },
163 { $sort : { count : -1} } /* descending by count */
164]);
165
166
167NZ:
/* Table 5, NZ: totals for documents with numPagesContainingMRI greater than 0 whose country code
   is "NZ" or whose domain ends in .nz. */
168db.Websites.aggregate([
169 {
170 $match: {
171 $and: [
172 {numPagesContainingMRI: {$gt: 0}},
173 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
174 ]
175 }
176 },
177 { $unwind: "$geoLocationCountryCode" },
178 {
179 $group: {
/* Constant group key: every matched document is folded into a single "nz" row, including
   .nz domains whose country code is something other than NZ. */
180 _id: "nz",
181 count: { $sum: 1 }, /* number of matching documents */
182 /*domain: { $addToSet: '$domain' },*/
183 numPagesInMRICount: { $sum: '$numPagesInMRI' },
184 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
185 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
186 }
187 },
188 { $sort : { count : -1} } /* only one group exists, so this sort has a single row to order */
189]);
190
191
192To find NZ web pages IN MRI the following may be BETTER,
193as it looks for sites with positive numPagesInMRI rather than sites that only have positive numPagesContainingMRI:
194
/* Table 5, NZ pages in MRI: totals for documents with numPagesInMRI greater than 0 whose country
   code is "NZ" or whose domain ends in .nz. */
195db.Websites.aggregate([
196 {
197 $match: {
198 $and: [
199 {numPagesInMRI: {$gt: 0}},
200 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
201 ]
202 }
203 },
204 { $unwind: "$geoLocationCountryCode" },
205 {
206 $group: {
/* Constant group key: every matched document is folded into a single "nz" row. */
207 _id: "nz",
208 count: { $sum: 1 }, /* number of matching documents */
209 domain: { $addToSet: '$domain' }, /* active in this query: the result row lists the distinct domains matched */
210 numPagesInMRICount: { $sum: '$numPagesInMRI' },
211 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
212 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
213 }
214 },
215 { $sort : { count : -1} } /* only one group exists, so this sort has a single row to order */
216]);
217
218
2195b. Table 5b:
220Table of count of sites with numPagesContainingMRI > 0
221
222Combine the following two:
223
224- OVERSEAS
225
/* Table 5b, overseas: per-country totals for documents whose country code is not "NZ", whose domain
   does not end in .nz, and whose numPagesContainingMRI is greater than 0. */
226db.Websites.aggregate([
227 {
228 $match: {
229 $and: [
230 {geoLocationCountryCode: {$ne: "NZ"}},
231 {domain: {$not: /\.nz$/}},
232 {numPagesContainingMRI: {$gt: 0}}
233 ]
234 }
235 },
236 { $unwind: "$geoLocationCountryCode" },
237 {
238 $group: {
239 _id: {$toLower: '$geoLocationCountryCode'}, /* group key is the lower-cased country code */
240 count: { $sum: 1 }, /* number of matching documents in the group */
241 /*domain: { $addToSet: '$domain' },*/
242 numPagesInMRICount: { $sum: '$numPagesInMRI' },
243 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
244 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
245 }
246 },
247 { $sort : { count : -1} } /* descending by count */
248]);
249
250- NZ:
251
/* Table 5b, NZ: totals for documents with numPagesContainingMRI greater than 0 whose country code
   is "NZ" or whose domain ends in .nz. */
252db.Websites.aggregate([
253 {
254 $match: {
255 $and: [
256 {numPagesContainingMRI: {$gt: 0}},
257 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
258 ]
259 }
260 },
261 { $unwind: "$geoLocationCountryCode" },
262 {
263 $group: {
/* Constant group key: every matched document is folded into a single "nz" row, including
   .nz domains whose country code is something other than NZ. */
264 _id: "nz",
265 count: { $sum: 1 }, /* number of matching documents */
266 /*domain: { $addToSet: '$domain' },*/
267 numPagesInMRICount: { $sum: '$numPagesInMRI' },
268 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
269 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
270 }
271 },
272 { $sort : { count : -1} } /* only one group exists, so this sort has a single row to order */
273]);
274
Note: See TracBrowser for help on using the repository browser.