source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java@ 33913

Last change on this file since 33913 was 33913, checked in by ak19, 4 years ago
  1. Adjusted table mongodb query statements to be more exact, but same results. 2. Adjusted code to not treat Australia specially, as the AU site with mi in URL path has now shifted to US. 3. Differences in geoLocation results from previous mongoDB ingest to present one documented for cases not dealing with mi in URL path of overseas domains. 4.
File size: 30.1 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7
8// to use collection.find() filters like eq(), regex() etc
9import static com.mongodb.client.model.Filters.*;
10// to use collection.find().projection() filters like include() etc
11import static com.mongodb.client.model.Projections.*;
12// to use aggregation functions like unwind(), match(), sort() etc
13import static com.mongodb.client.model.Aggregates.*;
14// to use functions like sum() and addToSet() within aggregation functions
15import static com.mongodb.client.model.Accumulators.*;
16
17//import org.bson.conversions.Bson;
18import com.mongodb.BasicDBObject;
19
20
21import com.mongodb.Block;
22
23import org.bson.BsonArray;
24import org.bson.BsonString;
25import org.bson.BsonValue;
26import org.bson.Document;
27import org.bson.conversions.Bson;
28import org.bson.json.JsonMode;
29import org.bson.json.JsonWriterSettings;
30
31import com.mongodb.util.JSON;
32//import com.mongodb.DBObject;
33
34
35import com.google.gson.*; // for pretty printing
36
37import java.io.BufferedReader;
38import java.io.BufferedWriter;
39import java.io.File;
40import java.io.FileReader;
41import java.io.FileWriter;
42import java.io.IOException;
43import java.io.UncheckedIOException;
44import java.io.Writer;
45import javax.xml.ws.Holder;
46
47
48import java.util.Arrays;
49import java.util.ArrayList;
50import java.util.List;
51import java.util.TreeSet;
52
53
54import org.apache.log4j.Logger;
55import org.apache.commons.csv.*;
56
57/**
58 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
59 *
60 * TO COMPILE:
61 * maori-lang-detection/src$
62 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBQueryer.java
63 *
64 * TO RUN:
65 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBQueryer
66 *
67 * Manually connecting to mongodb from client:
68 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
69 * Then after connecting with pwd, type:
70 * use DBNAME
71 *
72 * Or connect to mongodb and specify db in one statement:
73 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
74 *
75 * Some links:
76 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
77 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
78 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
79 * IMPORTANT LINK:
80 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
81 *
82 * API:
83 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
84 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
85 */
86public class MongoDBQueryer extends MongoDBAccess {
87
88 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBQueryer.class.getName());
89
90 public static final String NEWLINE = System.getProperty("line.separator");
91
92 /** mongodb filter types to execute */
93 public static final int IS_MRI = 0;
94 public static final int CONTAINS_MRI = 1;
95
96 /** Some reused fieldnames in the Websites collection */
97 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
98 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
99
100
101
102 public MongoDBQueryer() throws Exception {
103 super();
104 }
105
106
107 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
108 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
109 }
110 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
111 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
112 }
113
114 /**
115 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
116 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
117 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
118 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
119 *
120 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
121 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
122 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
123 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
124 */
125 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
126
127 final ArrayList<String> urlsList = new ArrayList<String>();
128
129 // remove any http(s)://(www.) from the start of URL first
130 // since it goes into a regex
131 domain = Utility.stripProtocolAndWWWFromURL(domain);
132
133 // load the "webpages" db table
134 // in mongodb, the equivalent of db tables are called 'collections'
135 MongoCollection<Document> collection = getWebpagesCollection();
136
137 // code we'll execute in Iterable.forEach() below
138 // see also https://www.baeldung.com/foreach-java
139 Block<Document> storeURL = new Block<Document>() {
140 @Override
141 public void apply(final Document document) {
142 //System.out.println(document.toJson());
143 String url = document.getString("URL");
144 // add to our urlsList
145 //System.out.println(url);
146 urlsList.add(url);
147 }
148 };
149
150
151 // Run the following mongodb query:
152 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
153
154 // 1. One way that works:
155 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
156
157 // 2. Another way:
158 //String query = "{URL: /DOMAIN/, isMRI: true}";
159 String query = "{URL: /DOMAIN/, ";
160 if(filterType == IS_MRI) {
161 query += "isMRI: true}";
162 } else if(filterType == CONTAINS_MRI) {
163 query += "containsMRI: true}";
164 }
165
166 domain = domain.replace(".", "\\."); // escape dots in domain for regex
167 query = query.replace("DOMAIN", domain);
168
169 //System.err.println("Executing find query: " + query);
170
171 BasicDBObject findObj = BasicDBObject.parse(query);
172 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
173
174
175 collection.find(findObj).projection(projectionObj).forEach(storeURL);
176
177 return urlsList;
178 }
179
180 /**
181 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
182 *
183 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
184 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
185 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
186 *
187 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
188 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
189 * On using group(TExpression) inside collection.aggregate().
190 *
191 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
192 * and https://www.javatpoint.com/java-8-foreach
193 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
194 *
195 * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
196 * listing all the base domain strings (unsorted with protocol and any www)
197 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
198 * matching sites.
199 *
200 * The mongodb aggregate() we want to run this time:
201 *
202 db.Websites.aggregate([
203 {
204 $match: {
205 $and: [
206 {numPagesContainingMRI: {$gt: 0}},
207 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
208 ]
209 }
210 },
211 { $unwind: "$geoLocationCountryCode" },
212 {
213 $group: {
214 _id: "nz",
215 count: { $sum: 1 },
216 domain: { $addToSet: '$domain' }
217 }
218 },
219 { $sort : { count : -1} }
220 ]);
221 */
222 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
223 // working with the WebSites collection, not WebPages collection!
224 MongoCollection<Document> collection = getWebsitesCollection();
225
226 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
227
228 // Want a counter,
229 // but lambda expressions can only take final variables and those can't be incremented
230 // But can use an array to store incrementable counter or Holder type
231 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
232 Holder<Integer> docNum = new Holder<>(0);
233
234 Bson orQuery = or(
235 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
236 BasicDBObject.parse("{domain: /\\.nz$/}")
237 );
238 Bson andQuery = and(
239 BasicDBObject.parse(mriFilterString),
240 orQuery);
241
242 // Hopefully the lambda expression (forEach()) at end means
243 // we write out each result Document as we get it
244 collection.aggregate(Arrays.asList(
245 match(andQuery),
246 unwind("$geoLocationCountryCode"),
247 group("NZ", Arrays.asList(sum("count", 1),
248 addToSet("domain", "$domain"))),
249 sort(BasicDBObject.parse("{count : -1}"))
250 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
251
252 // should only have one doc for NZ since it's a count by geolocation.
253
254 return;
255 }
256
257
258 /**
259 * Count of overseas (non-NZ and non-.nz TLD) websites
260 * containing a positive number of sentences in MRI,
261 * listing all the base domain strings (unsorted with protocol and any www)
262 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
263 * matching sites. Regardless of whether there's an mi in the URL path of any or not.
264 *
265 * The aggregate() we want to run this time:
266 *
267 db.Websites.aggregate([
268 {
269 $match: {
270 $and: [
271 {geoLocationCountryCode: {$ne: "NZ"}},
272 {domain: {$not: /\.nz/}},
273 {numPagesContainingMRI: {$gt: 0}}
274 ]
275 }
276 },
277 { $unwind: "$geoLocationCountryCode" },
278 {
279 $group: {
280 _id: {$toLower: '$geoLocationCountryCode'},
281 count: { $sum: 1 },
282 domain: { $addToSet: '$domain' }
283 }
284 },
285 { $sort : { count : -1} }
286 ]);
287 */
288 public void aggregateContainsMRIForOverseas(Writer writer, int filterType)
289 throws UncheckedIOException {
290 // working with the WebSites collection, not WebPages collection!
291 MongoCollection<Document> collection = getWebsitesCollection();
292
293 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
294
295 // Want a counter,
296 // but lambda expressions can only take final variables and those can't be incremented
297 // But can use an array to store incrementable counter or Holder
298 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
299 Holder<Integer> docNum = new Holder<>(1);
300
301 Bson andQuery = and(
302 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
303 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
304 BasicDBObject.parse(mriFilterString));
305
306 collection.aggregate(Arrays.asList(
307 match(andQuery),
308 unwind("$geoLocationCountryCode"),
309 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
310 addToSet("domain", "$domain"))),
311 sort(BasicDBObject.parse("{count : -1}"))
312 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
313
314 return;
315 }
316
317
318 /** Count by country code of overseas (non-NZ and non-nz TLD) websites
319 * containing a positive number of sentences in MRI,
320 * listing all the base domain strings (unordered and with protocol and any www)
321 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
322 * matching sites.
323 *
324 * The aggregate() we want to run this time:
325 *
326 db.Websites.aggregate([
327 {
328 $match: {
329 $and: [
330 {geoLocationCountryCode: {$ne: "NZ"}},
331 {domain: {$not: /\.nz/}},
332 {numPagesContainingMRI: {$gt: 0}},
333 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
334 ]
335 }
336 },
337 { $unwind: "$geoLocationCountryCode" },
338 {
339 $group: {
340 _id: {$toLower: '$geoLocationCountryCode'},
341 count: { $sum: 1 },
342 domain: { $addToSet: '$domain' }
343 }
344 },
345 { $sort : { count : -1} }
346 ]);
347 */
348 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
349 boolean isMiInURLPath) throws UncheckedIOException
350 {
351 // working with the WebSites collection, not WebPages collection!
352 MongoCollection<Document> collection = getWebsitesCollection();
353
354 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
355
356 // Want a counter,
357 // but lambda expressions can only take final variables and those can't be incremented
358 // But can use an array to store incrementable counter or Holder
359 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
360 Holder<Integer> docNum = new Holder<>(1);
361
362 /*
363 Bson orQuery = or(
364 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
365 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
366 // e.g. "{urlContainsLangCodeInPath: false}"
367 );
368 */
369 Bson andQuery = and(
370 BasicDBObject.parse(mriFilterString),
371 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
372 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
373 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery);
374
375 collection.aggregate(Arrays.asList(
376 match(andQuery), //match(BasicDBObject.parse(matchQuery))
377 // match((List<DBObject>)JSON.parse(matchQuery)),
378 unwind("$geoLocationCountryCode"),
379 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
380 addToSet("domain", "$domain"))),
381 sort(BasicDBObject.parse("{count : -1}"))
382 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
383
384 // casting to Block<Document> necessary because otherwise we see the error at
385 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
386
387 // Less efficient way is to keep all the results in memory and then
388 // write them out one at a time
389 /*
390 AggregateIterable<Document> output
391 = collection.aggregate(Arrays.asList(
392 match(andQuery), //match(BasicDBObject.parse(matchQuery))
393 // match((List<DBObject>)JSON.parse(matchQuery)),
394 unwind("$geoLocationCountryCode"),
395 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
396 sort(BasicDBObject.parse("{count : -1}"))
397 ));
398
399
400 for (Document doc : output) {
401 //System.out.println(doc);
402 System.out.println(doc.toJson());
403
404 }
405 */
406 return;
407 }
408
409 /** Perform the aggregates for writing out the summary tables. */
410 public void writeTables(File outFolder) {
411 // In this function, we're always dealing with the Websites mongodb collection.
412 MongoCollection<Document> collection = getWebsitesCollection();
413
414 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI",
415 "3table_sitesWithPagesContainingMRI", "4table_containsMRI_exclTentativeProductSites",
416 "5table_sitesWithPagesContainingMRI_allNZGrouped"
417 };
418 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
419 File outFile = new File(outFolder, tableNames[tableNum] + ".json");
420 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
421 try (
422 Writer writer = new BufferedWriter(new FileWriter(outFile));
423 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); // quote ALL vs MINIMAL vs NON_NUMERIC fields
424 ) {
425
426 // Write out the CSV column headings
427 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
428 csvWriter.printRecord("countryCode", "siteCount",
429 "numPagesInMRI count","numPagesContainingMRICount",
430 "totalPagesAcrossMatchingSites"/*, "domain"*/);
431
432 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
433
434
435 int docNum = 0;
436 // get any NZ specific row's data, if it exists for this table
437 // and add that as the docNum="0th" doc
438 Document nzDoc = getNZTableRowData(collection, tableNum);
439 if(nzDoc != null) {
440 writeDocAsJsonRecord(docNum, nzDoc, writer);
441 writeDocAsCSVRecord(docNum, nzDoc, csvWriter);
442 }
443 // all other table row data start at 1 for docNum
444 for (Document doc : output) {
445 //System.out.println(doc);
446 writeDocAsJsonRecord(++docNum, doc, writer);
447 writeDocAsCSVRecord(++docNum, doc, csvWriter);
448 }
449
450 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
451 } catch(UncheckedIOException ioe) {
452 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
453 }
454 catch(Exception e) {
455 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
456 }
457 }
458 }
459
460 public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
461
462 Document nzRowData = null;
463 switch(tableNum) {
464 case 1: case 2: case 3: case 4:
465
466 break;
467
468 case 5:
469 /* Get NZ only table data:
470 db.Websites.aggregate([
471 {
472 $match: {
473 $and: [
474 {numPagesContainingMRI: {$gt: 0}},
475 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
476 ]
477 }
478 },
479 { $unwind: "$geoLocationCountryCode" },
480 {
481 $group: {
482 _id: "NZ",
483 count: { $sum: 1 },
484 //domain: { $addToSet: '$domain' },
485 numPagesInMRICount: { $sum: '$numPagesInMRI' },
486 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
487 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
488 }
489 },
490 { $sort : { count : -1} }
491 ]);
492
493 */
494 Bson orQuery = or(
495 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
496 BasicDBObject.parse("{domain: /\\.nz$/}")
497 );
498 Bson andQuery = and(
499 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
500 orQuery
501 );
502 AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
503 match(andQuery),
504 unwind("$geoLocationCountryCode"),
505 group("NZ", Arrays.asList(
506 sum("count", 1),
507 /*addToSet("domain", "$domain"),*/
508 sum("numPagesInMRICount", "$numPagesInMRI"),
509 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
510 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
511 sort(BasicDBObject.parse("{count : -1}"))
512 ));
513
514 nzRowData = output.first(); // first and only document in result
515
516 break;
517
518 default: logger.error("Unknown table number: " + tableNum);
519 }
520
521 return nzRowData;
522 }
523
524 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum)
525 {
526
527 AggregateIterable<Document> output = null;
528 Bson orQuery = null;
529 Bson andQuery = null;
530
531 switch(tableNum) {
532
533 case 1:
534 /* 1table_allCrawledSites -
535
536 db.Websites.aggregate([
537 { $unwind: "$geoLocationCountryCode" },
538 {
539 $group: {
540 _id: "$geoLocationCountryCode",
541 count: { $sum: 1 },
542 //domain: { $addToSet: '$domain' },
543 numPagesInMRICount: { $sum: '$numPagesInMRI' },
544 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
545 totalPagesAcrossSites: { $sum: '$totalPages'}
546 }
547 },
548 { $sort : { count : -1} }
549 ]);
550 */
551 output = collection.aggregate(Arrays.asList(
552 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
553 unwind("$geoLocationCountryCode"),
554 group("$geoLocationCountryCode", Arrays.asList(
555 sum("count", 1),
556 /*addToSet("domain", "$domain"),*/
557 sum("numPagesInMRICount", "$numPagesInMRI"),
558 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
559 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
560 sort(BasicDBObject.parse("{count : -1}"))
561 ));
562 break;
563
564 case 2:
565 /*
566 db.Websites.aggregate([
567 { $match: { numPagesInMRI: {$gt: 0} } },
568 { $unwind: "$geoLocationCountryCode" },
569 {
570 $group: {
571 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower: _id:'$geoLocationCountryCode'
572 count: { $sum: 1 },
573 //domain: { $addToSet: '$domain' },
574 numPagesInMRICount: { $sum: '$numPagesInMRI' },
575 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
576 totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
577 }
578 },
579 { $sort : { count : -1} }
580 ]);
581 */
582 output = collection.aggregate(Arrays.asList(
583 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
584 unwind("$geoLocationCountryCode"),
585 group("$geoLocationCountryCode", Arrays.asList(
586 sum("count", 1),
587 /*addToSet("domain", "$domain"),*/
588 sum("numPagesInMRICount", "$numPagesInMRI"),
589 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
590 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
591 sort(BasicDBObject.parse("{count : -1}"))
592 ));
593 break;
594
595 case 3:
596 /*
597 db.Websites.aggregate([
598 {
599 $match: { numPagesContainingMRI: {$gt: 0} }
600 },
601 { $unwind: "$geoLocationCountryCode" },
602 {
603 $group: {
604 _id: '$geoLocationCountryCode',
605 count: { $sum: 1 },
606 //domain: { $addToSet: '$domain' },
607 numPagesInMRICount: { $sum: '$numPagesInMRI' },
608 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
609 totalPagesAcrossSitesWithPosContainsMRI: { $sum: '$totalPages'}
610 }
611 },
612 { $sort : { count : -1} }
613 ]);
614 */
615 output = collection.aggregate(Arrays.asList(
616 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
617 unwind("$geoLocationCountryCode"),
618 group("$geoLocationCountryCode", Arrays.asList(
619 sum("count", 1),
620 /*addToSet("domain", "$domain"),*/
621 sum("numPagesInMRICount", "$numPagesInMRI"),
622 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
623 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
624 sort(BasicDBObject.parse("{count : -1}"))
625 ));
626 break;
627
628 case 4:
629 /*
630 db.Websites.aggregate([
631 {
632 $match: {
633 $and: [
634 {numPagesContainingMRI: {$gt: 0}},
635 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
636 ]
637 }
638 },
639 { $unwind: "$geoLocationCountryCode" },
640 {
641 $group: {
642 _id: {$toLower: '$geoLocationCountryCode'},
643 count: { $sum: 1 },
644 //domain: { $addToSet: '$domain' },
645 numPagesInMRICount: { $sum: '$numPagesInMRI' },
646 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
647 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
648 }
649 },
650 { $sort : { count : -1} }
651 ]);
652 */
653
654 orQuery = or(
655 BasicDBObject.parse("{geoLocationCountryCode: /(NZ|AU)/}"),
656 //BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
657 BasicDBObject.parse("{domain: /\\.nz$/}"),
658 BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
659 );
660 andQuery = and(
661 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
662 orQuery);
663 output = collection.aggregate(Arrays.asList(
664 match(andQuery),
665 unwind("$geoLocationCountryCode"),
666 group("$geoLocationCountryCode", Arrays.asList(
667 sum("count", 1),
668 /*addToSet("domain", "$domain"),*/
669 sum("numPagesInMRICount", "$numPagesInMRI"),
670 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
671 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
672 sort(BasicDBObject.parse("{count : -1}"))
673 ));
674 break;
675
676 case 5:
677 /*
678 Table of count by countryCode of sites with numPagesContainingMRI > 0
679 Just do OVERSEAS here, NZ handled separately
680
681 db.Websites.aggregate([
682 {
683 $match: {
684 $and: [
685 {geoLocationCountryCode: {$ne: "NZ"}},
686 {domain: {$not: /\.nz$/}},
687 {numPagesContainingMRI: {$gt: 0}}
688 ]
689 }
690 },
691 { $unwind: "$geoLocationCountryCode" },
692 {
693 $group: {
694 _id: '$geoLocationCountryCode',
695 count: { $sum: 1 },
696 //domain: { $addToSet: '$domain' },
697 numPagesInMRICount: { $sum: '$numPagesInMRI' },
698 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
699 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
700 }
701 },
702 { $sort : { count : -1} }
703 ]);
704 */
705
706 andQuery = and(
707 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
708 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
709 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}")
710 );
711 output = collection.aggregate(Arrays.asList(
712 match(andQuery),
713 unwind("$geoLocationCountryCode"),
714 group("$geoLocationCountryCode", Arrays.asList(
715 sum("count", 1),
716 /*addToSet("domain", "$domain"),*/
717 sum("numPagesInMRICount", "$numPagesInMRI"),
718 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
719 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
720 sort(BasicDBObject.parse("{count : -1}"))
721 ));
722
723 break;
724 default: logger.error("Unknown table number: " + tableNum);
725 }
726
727 return output;
728
729 }
730
731
732
733 /**
734 * called by lambda forEach() call on Document objects to write them out to a file.
735 * Have to deal with unreported exceptions here that can't be dealt with when doing
736 * the actual forEach(). See
737 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
738 */
739 public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
740
741 // If there's a domain field in the json Doc, sort this domain listing alphabetically
742 Object domainList = doc.remove("domain");
743 ///logger.info("CLASS: " + domainList.getClass());
744 if(domainList != null) {
745 List sortedList = sortAlphabetically(domainList);
746 doc.put("uniqueCount", sortedList.size());
747 doc.put("domain", sortedList);
748 }
749
750 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
751 // Can't control json output to add newlines after each array element,
752 // no matter which JsonMode is used.
753
754 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
755 // Still can't control array element output,
756 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
757 //JsonWriterSettings writeSettings = new JsonWriterSettings();
758 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
759 //writer.write(doc.toJson(writeSettings) + NEWLINE);
760
761 // Not the JsonWriter of mongodb java driver:
762 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
763
764 // Have to use gson's pretty print to produce a json string that contains
765 // newlines after every array element in the json:
766
767 String jsonStr = prettyPrintJson(doc.toJson());
768 //System.err.println(jsonStr);
769 try {
770 writer.write("/* " + docNum + " */" + NEWLINE);
771 writer.write(jsonStr + NEWLINE + NEWLINE);
772 } catch (IOException ex) {
773 //throw ex;
774 throw new UncheckedIOException(ex);
775 }
776 }
777
778 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
779 String jsonStr = prettyPrintJson(doc.toJson());
780 //System.err.println(jsonStr);
781 try {
782 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
783 } catch (IOException ex) {
784 //throw ex;
785 throw new UncheckedIOException(ex);
786 }
787 }
788
789 // TODO
790 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
791 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
792 String jsonStr = doc.toJson();
793 JsonParser parser = new JsonParser();
794 JsonElement json = parser.parse(jsonStr);
795
796 JsonObject jsonObj = (JsonObject)json;
797
798 String countryCode = jsonObj.get("_id").getAsString();
799 int siteCount = jsonObj.get("count").getAsInt();
800 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
801 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
802 int totalPagesAcrossMatchingSites = jsonObj.get("totalPagesAcrossMatchingSites").getAsInt();
803
804 //System.err.println(jsonStr);
805 try {
806 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
807 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount, totalPagesAcrossMatchingSites);
808 } catch (IOException ex) {
809 //throw ex;
810 throw new UncheckedIOException(ex);
811 }
812 }
813
814 public String prettyPrintJson(String jsonStr) {
815 Gson gson = new GsonBuilder().setPrettyPrinting().create();
816 JsonParser jp = new JsonParser();
817 JsonElement je = jp.parse(jsonStr);
818 String prettyJsonString = gson.toJson(je);
819 return prettyJsonString;
820 }
821
822 private List sortAlphabetically(Object list) {
823 //BsonArray domainList = (BsonArray)list;
824 ArrayList<BsonValue> domainList = (ArrayList<BsonValue>)list;
825 /*
826 // for(String domain : domainList) {
827 for(int i = domainList.size() - 1; i >= 0; i--) {
828 BsonString domain = domainList.get(i).asString();
829 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
830 domainList.set(i, new BsonString(domainStr));
831 }
832 Collections.sort(domainList);
833 // still need to get rid of non-unique values...
834 */
835
836 TreeSet<String> set = new TreeSet<String>();
837 for(int i = domainList.size() - 1; i >= 0; i--) {
838 ///BsonValue val = domainList.get(i);
839 ///BsonString domain = val.asString();
840 //BsonString domain = domainList.get(i).asString();
841 //String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
842 Object domain = domainList.get(i);
843 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
844 set.add(domainStr);
845 //domainList.set(i, new BsonString(domainStr));
846 }
847
848 domainList = new ArrayList<BsonValue>(); //new BsonArray();
849 for(String s : set) {
850 domainList.add(new BsonString(s));
851 }
852 return domainList;
853 }
854
855}
Note: See TracBrowser for help on using the repository browser.