source: other-projects/maori-lang-detection/mongodb-data/tables.txt@ 33878

Last change on this file since 33878 was 33878, checked in by ak19, 4 years ago

Better comment

File size: 5.5 KB
Line 
Instructions for producing the tables:
a. Copy the Javascript version of results for each mongodb query listed below into a text editor.
b. Then regex replace \/\*\s*\d+\s*\*\/ with "," and embed all the JS inside []. If this leaves a comma before the first result, delete it.
c. Paste that Javascript into https://json-csv.com/ to get the CSV tables
5
Note that for step 5 (Table 5), there are 2 mongodb queries ("Outside of NZ" and "NZ"). The results of both have to be merged into a single csv file.
-----------
1. Table 1
9
/* Table 1: per-country totals over all websites.
   Groups on geoLocationCountryCode (as stored, not lower-cased) and, for
   each value, counts the documents and sums numPagesInMRI and
   numPagesContainingMRI. Rows are sorted by count, largest first. */
db.Websites.aggregate([
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: "$geoLocationCountryCode",
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
24
25
1a.
27
/* Table 1a: same grouping as Table 1, restricted to websites where
   urlContainsLangCodeInPath is true. */
db.Websites.aggregate([
    { $match: { urlContainsLangCodeInPath: true } },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: "$geoLocationCountryCode",
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
42
43
1b.
45
/* Table 1b: same grouping as Table 1, restricted to websites where
   urlContainsLangCodeInPath is false. */
db.Websites.aggregate([
    { $match: { urlContainsLangCodeInPath: false } },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: "$geoLocationCountryCode",
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
60
-----------
2. Table 2
63
/* Table 2: per-country totals for websites with numPagesInMRI > 0.
   The country code is lower-cased before grouping. Rows are sorted by
   count, largest first. */
db.Websites.aggregate([
    {
        $match: {
            numPagesInMRI: { $gt: 0 }
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: { $toLower: '$geoLocationCountryCode' },
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
82
-----------
3. Table 3
85
/* Table 3: per-country totals for websites with
   numPagesContainingMRI > 0. The country code is lower-cased before
   grouping. Rows are sorted by count, largest first. */
db.Websites.aggregate([
    {
        $match: {
            numPagesContainingMRI: { $gt: 0 }
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: { $toLower: '$geoLocationCountryCode' },
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
104
-----------

4. Table 4
/* Table 4: per-country totals for websites with
   numPagesContainingMRI > 0 that also satisfy at least one of:
     - geoLocationCountryCode matches /(NZ|AU)/
     - domain ends in ".nz"
     - urlContainsLangCodeInPath is false
   The country code is lower-cased before grouping. Rows are sorted by
   count, largest first. */
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { numPagesContainingMRI: { $gt: 0 } },
                { $or: [
                    { geoLocationCountryCode: /(NZ|AU)/ },
                    { domain: /\.nz$/ },
                    { urlContainsLangCodeInPath: false }
                ] }
            ]
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: { $toLower: '$geoLocationCountryCode' },
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
129
-----------
5. Table 5

Outside of NZ:
134
/* Table 5, rows for countries other than NZ. Keeps websites where ALL of
   the following hold:
     - geoLocationCountryCode is not "NZ"
     - domain does not match /\.nz/
     - numPagesContainingMRI > 0
     - geoLocationCountryCode is "AU" or urlContainsLangCodeInPath is false
   The country code is lower-cased before grouping. Rows are sorted by
   count, largest first.
   NOTE(review): /\.nz/ is unanchored, so it matches ".nz" anywhere in the
   domain; the Table 4 query uses /\.nz$/ -- confirm which is intended. */
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { geoLocationCountryCode: { $ne: "NZ" } },
                { domain: { $not: /\.nz/ } },
                { numPagesContainingMRI: { $gt: 0 } },
                { $or: [
                    { geoLocationCountryCode: "AU" },
                    { urlContainsLangCodeInPath: false }
                ] }
            ]
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: { $toLower: '$geoLocationCountryCode' },
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
158
159
NZ:
/* Table 5, NZ row. Keeps websites with numPagesContainingMRI > 0 whose
   geoLocationCountryCode is "NZ" or whose domain matches /\.nz/, and folds
   them all into a single group with _id "nz".
   NOTE(review): /\.nz/ is unanchored, so it matches ".nz" anywhere in the
   domain; the Table 4 query uses /\.nz$/ -- confirm which is intended.
   NOTE(review): $unwind omits documents whose geoLocationCountryCode is
   missing, null or an empty array, so a .nz site without a geolocation
   would not be counted here -- confirm that is intended. */
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { numPagesContainingMRI: { $gt: 0 } },
                { $or: [
                    { geoLocationCountryCode: "NZ" },
                    { domain: /\.nz/ }
                ] }
            ]
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: "nz",
            count: { $sum: 1 },
            /*domain: { $addToSet: '$domain' },*/
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
182
183
To find NZ web pages in MRI the following may be BETTER,
as it looks for sites with positive numPagesInMRI rather than sites that only have positive numPagesContainingMRI:
186
/* Alternative NZ row. Keeps websites with numPagesInMRI > 0 whose
   geoLocationCountryCode is "NZ" or whose domain matches /\.nz/, and folds
   them all into a single group with _id "nz". Unlike the other queries in
   this file, the set of distinct domains is also collected in the output.
   NOTE(review): /\.nz/ is unanchored, so it matches ".nz" anywhere in the
   domain; the Table 4 query uses /\.nz$/ -- confirm which is intended.
   NOTE(review): $unwind omits documents whose geoLocationCountryCode is
   missing, null or an empty array, so a .nz site without a geolocation
   would not be counted here -- confirm that is intended. */
db.Websites.aggregate([
    {
        $match: {
            $and: [
                { numPagesInMRI: { $gt: 0 } },
                { $or: [
                    { geoLocationCountryCode: "NZ" },
                    { domain: /\.nz/ }
                ] }
            ]
        }
    },
    { $unwind: "$geoLocationCountryCode" },
    {
        $group: {
            _id: "nz",
            count: { $sum: 1 },
            domain: { $addToSet: '$domain' },
            numPagesInMRICount: { $sum: '$numPagesInMRI' },
            numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
        }
    },
    { $sort : { count : -1} }
]);
208
Note: See TracBrowser for help on using the repository browser.