source: other-projects/maori-lang-detection/mongodb-data/tables.txt@ 33894

Last change on this file since 33894 was 33894, checked in by ak19, 4 years ago
  1. Adding map, counts.json and geo-json files for 5b count of sites by countrycode with numPagesContainingMRI > 0, unfiltered by mi in URL path or not. 2. Tables file has mongodb query code for 5b data. 3. Map, counts.json and geo-json files for 6 (count of sites by country code from manual short listing of sites) now renamed to reflect that the shortlist considers ALL manually selected sites regardless of mi in URL path or not.
File size: 7.6 KB
Line 
Instructions for producing the tables:
a. Copy the JavaScript version of the results of each MongoDB query listed below into a text editor.
b. OPTIONAL: Regex-replace \/\*\s*\d+\s*\*\/ (which matches each /* N */ marker in the copied results) with a comma (','), remove the very first comma, and wrap all the JS in [].
c. Paste that JavaScript into https://json-csv.com/ to get the CSV tables.
5
6Note that for step 6, there are 2 mongodb queries. The results of both have to be merged into a single csv file.
7-----------
81. Table 1
9
// Table 1: one row per geoLocationCountryCode across the whole Websites
// collection, giving the number of documents and the summed page counters.
db.Websites.aggregate([
    // $unwind drops documents whose geoLocationCountryCode is missing, null or
    // an empty array, and emits one document per element when it is an array.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: "$geoLocationCountryCode",
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossSites: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
25
26
271a.
28
// Table 1a: same breakdown as Table 1, restricted to documents whose
// urlContainsLangCodeInPath flag is true.
db.Websites.aggregate([
    { $match: { urlContainsLangCodeInPath: true } },
    // Documents without a usable geoLocationCountryCode are dropped here.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: "$geoLocationCountryCode",
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
44
45
461b.
47
// Table 1b: same breakdown as Table 1, restricted to documents whose
// urlContainsLangCodeInPath flag is false.
db.Websites.aggregate([
    { $match: { urlContainsLangCodeInPath: false } },
    // Documents without a usable geoLocationCountryCode are dropped here.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: "$geoLocationCountryCode",
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
63
64-----------
652. Table 2
66
// Table 2: per-country breakdown of documents with numPagesInMRI > 0.
// The group key is the lower-cased geoLocationCountryCode.
db.Websites.aggregate([
    { $match: { numPagesInMRI: { $gt: 0 } } },
    // Documents without a usable geoLocationCountryCode are dropped here.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: { $toLower: "$geoLocationCountryCode" },
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossSitesWithPositiveMRICount: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
86
87-----------
883. Table 3
89
// Table 3: per-country breakdown of documents with numPagesContainingMRI > 0.
// The group key is the lower-cased geoLocationCountryCode.
db.Websites.aggregate([
    { $match: { numPagesContainingMRI: { $gt: 0 } } },
    // Documents without a usable geoLocationCountryCode are dropped here.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: { $toLower: "$geoLocationCountryCode" },
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossSitesWithPositiveContainsMRI: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
109
110-----------
111
1124. Table 4
// Table 4: per-country breakdown of documents with numPagesContainingMRI > 0
// that additionally satisfy at least one of:
//   - geoLocationCountryCode matches /(NZ|AU)/
//   - domain ends in ".nz"
//   - urlContainsLangCodeInPath is false
db.Websites.aggregate([
    // The two top-level conditions are ANDed implicitly.
    { $match: {
        numPagesContainingMRI: { $gt: 0 },
        $or: [
            { geoLocationCountryCode: /(NZ|AU)/ },
            { domain: /\.nz$/ },
            { urlContainsLangCodeInPath: false }
        ]
    } },
    // Documents without a usable geoLocationCountryCode are dropped here.
    { $unwind: "$geoLocationCountryCode" },
    { $group: {
        _id: { $toLower: "$geoLocationCountryCode" },
        count: { $sum: 1 },
        // Optional extra column listing the contributing domains:
        // domain: { $addToSet: "$domain" },
        numPagesInMRICount: { $sum: "$numPagesInMRI" },
        numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
        totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
    } },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);
135
136-----------
1375. Table 5
138
139Outside of NZ:
140
// Table 5, outside NZ: per-country breakdown of documents that
//   - are not geolocated to NZ and do not have a .nz domain,
//   - have numPagesContainingMRI > 0, and
//   - are either geolocated to AU or have urlContainsLangCodeInPath false.
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { geoLocationCountryCode: { $ne: "NZ" } },
                // Anchored with $ (as in Table 4) so only domains ENDING in
                // ".nz" are excluded; an unanchored /\.nz/ would also exclude
                // hosts that merely contain ".nz", e.g. "foo.nzexample.com".
                { domain: { $not: /\.nz$/ } },
                { numPagesContainingMRI: { $gt: 0 } },
                { $or: [{ geoLocationCountryCode: "AU" }, { urlContainsLangCodeInPath: false }] }
            ]
        }
    },
    // $unwind drops documents whose geoLocationCountryCode is missing, null or
    // an empty array.
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: { $toLower: "$geoLocationCountryCode" },
            count: { $sum: 1 },
            // Optional extra column listing the contributing domains:
            // domain: { $addToSet: "$domain" },
            numPagesInMRICount: { $sum: "$numPagesInMRI" },
            numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
            totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
        }
    },
    // Countries with the most documents first.
    { $sort: { count: -1 } }
]);


NZ:
// Table 5, NZ: a single "nz" row covering documents with
// numPagesContainingMRI > 0 that are geolocated to NZ or have a .nz domain.
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { numPagesContainingMRI: { $gt: 0 } },
                // Same anchored pattern as the outside-NZ query above, so the
                // two result sets stay complementary on the domain test.
                { $or: [{ geoLocationCountryCode: "NZ" }, { domain: /\.nz$/ }] }
            ]
        }
    },
    // NOTE(review): $unwind drops documents with a missing/null country code,
    // so a .nz domain without geolocation data would not be counted - confirm
    // whether such documents exist.
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            // Constant key: everything that matched is folded into one row.
            _id: "nz",
            count: { $sum: 1 },
            // Optional extra column listing the contributing domains:
            // domain: { $addToSet: "$domain" },
            numPagesInMRICount: { $sum: "$numPagesInMRI" },
            numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
            totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
        }
    },
    { $sort: { count: -1 } }
]);
190
191
To find NZ web pages IN MRI, the following may be BETTER,
as it looks for sites with positive numPagesInMRI rather than sites that only have positive numPagesContainingMRI:
194
// Alternative NZ query for Table 5: a single "nz" row covering documents with
// numPagesInMRI > 0 that are geolocated to NZ or have a .nz domain. Unlike the
// other queries in this file it also returns the set of matching domains.
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { numPagesInMRI: { $gt: 0 } },
                // Anchored with $ (as in Table 4) so only domains ENDING in
                // ".nz" match; an unanchored /\.nz/ would also match hosts
                // that merely contain ".nz", e.g. "foo.nzexample.com".
                { $or: [{ geoLocationCountryCode: "NZ" }, { domain: /\.nz$/ }] }
            ]
        }
    },
    // $unwind drops documents whose geoLocationCountryCode is missing, null or
    // an empty array.
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            // Constant key: everything that matched is folded into one row.
            _id: "nz",
            count: { $sum: 1 },
            domain: { $addToSet: "$domain" },
            numPagesInMRICount: { $sum: "$numPagesInMRI" },
            numPagesContainingMRICount: { $sum: "$numPagesContainingMRI" },
            totalPagesAcrossMatchingSites: { $sum: "$totalPages" }
        }
    },
    { $sort: { count: -1 } }
]);
217
218
5b. Table 5b:
Table of count of sites with numPagesContainingMRI > 0
221
222Combine the following two:
223
224- OVERSEAS
225
226db.Websites.aggregate([
227 {
228 $match: {
229 $and: [
230 {geoLocationCountryCode: {$ne: "NZ"}},
231 {domain: {$not: /\.nz/}},
232 {numPagesContainingMRI: {$gt: 0}}
233 ]
234 }
235 },
236 { $unwind: "$geoLocationCountryCode" },
237 {
238 $group: {
239 _id: {$toLower: '$geoLocationCountryCode'},
240 count: { $sum: 1 },
241 /*domain: { $addToSet: '$domain' },*/
242 numPagesInMRICount: { $sum: '$numPagesInMRI' },
243 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
244 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
245 }
246 },
247 { $sort : { count : -1} }
248]);
249
250- NZ:
251
252db.Websites.aggregate([
253 {
254 $match: {
255 $and: [
256 {numPagesContainingMRI: {$gt: 0}},
257 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
258 ]
259 }
260 },
261 { $unwind: "$geoLocationCountryCode" },
262 {
263 $group: {
264 _id: "nz",
265 count: { $sum: 1 },
266 /*domain: { $addToSet: '$domain' },*/
267 numPagesInMRICount: { $sum: '$numPagesInMRI' },
268 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
269 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
270 }
271 },
272 { $sort : { count : -1} }
273]);
274
Note: See TracBrowser for help on using the repository browser.