source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java@ 33919

Last change on this file since 33919 was 33919, checked in by ak19, 4 years ago

SummaryTool now uses the CountryCodeCountsMapData.java class to generate the geojson-features files from the tables it already created using MongoDB query results. Switched over from geojson.tools to geojson.io since the latter allows passing geojson mapdata in the URL. The firefox screenshotting is still not working. But I can't even get complex geojson features to work from the commandline yet, so then there's another possible layer of complexity when running firefox as a Java process. Added jna jar files used by Greenstone's SafeProcess for launching Firefox as a Java process.

File size: 31.9 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7
8// to use collection.find() filters like eq(), regex() etc
9import static com.mongodb.client.model.Filters.*;
10// to use collection.find().projection() filters like include() etc
11import static com.mongodb.client.model.Projections.*;
12// to use aggregation functions like unwind(), match(), sort() etc
13import static com.mongodb.client.model.Aggregates.*;
14// to use functions like sum() and addToSet() within aggregation functions
15import static com.mongodb.client.model.Accumulators.*;
16
17//import org.bson.conversions.Bson;
18import com.mongodb.BasicDBObject;
19
20
21import com.mongodb.Block;
22
23import org.bson.BsonArray;
24import org.bson.BsonString;
25import org.bson.BsonValue;
26import org.bson.Document;
27import org.bson.conversions.Bson;
28import org.bson.json.JsonMode;
29import org.bson.json.JsonWriterSettings;
30
31import com.mongodb.util.JSON;
32//import com.mongodb.DBObject;
33
34
35import com.google.gson.*; // for pretty printing
36
37import java.io.BufferedReader;
38import java.io.BufferedWriter;
39import java.io.File;
40import java.io.FileReader;
41import java.io.FileWriter;
42import java.io.IOException;
43import java.io.UncheckedIOException;
44import java.io.Writer;
45import javax.xml.ws.Holder;
46
47
48import java.util.Arrays;
49import java.util.ArrayList;
50import java.util.List;
51import java.util.TreeSet;
52
53
54import org.apache.log4j.Logger;
55import org.apache.commons.csv.*;
56
57/**
58 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
59 *
60 * TO COMPILE:
61 * maori-lang-detection/src$
62 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBQueryer.java
63 *
64 * TO RUN:
65 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBQueryer
66 *
67 * Manually connecting to mongodb from client:
68 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
69 * Then after connecting with pwd, type:
70 * use DBNAME
71 *
72 * Or connect to mongodb and specify db in one statement:
73 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
74 *
75 * Some links:
76 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
77 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
78 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
79 * IMPORTANT LINK:
80 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
81 *
82 * API:
83 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
84 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
85 */
86public class MongoDBQueryer extends MongoDBAccess {
87
88 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBQueryer.class.getName());
89
90 public static final String NEWLINE = System.getProperty("line.separator");
91
92 /** mongodb filter types to execute */
93 public static final int IS_MRI = 0;
94 public static final int CONTAINS_MRI = 1;
95
96 /** Some reused fieldnames in the Websites collection */
97 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
98 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
99
100
101
102 public MongoDBQueryer() throws Exception {
103 super();
104 }
105
106
107 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
108 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
109 }
110 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
111 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
112 }
113
114 /**
115 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
116 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
117 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
118 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
119 *
120 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
121 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
122 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
123 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
124 */
125 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
126
127 final ArrayList<String> urlsList = new ArrayList<String>();
128
129 // remove any http(s)://(www.) from the start of URL first
130 // since it goes into a regex
131 domain = Utility.stripProtocolAndWWWFromURL(domain);
132
133 // load the "webpages" db table
134 // in mongodb, the equivalent of db tables are called 'collections'
135 MongoCollection<Document> collection = getWebpagesCollection();
136
137 // code we'll execute in Iterable.forEach() below
138 // see also https://www.baeldung.com/foreach-java
139 Block<Document> storeURL = new Block<Document>() {
140 @Override
141 public void apply(final Document document) {
142 //System.out.println(document.toJson());
143 String url = document.getString("URL");
144 // add to our urlsList
145 //System.out.println(url);
146 urlsList.add(url);
147 }
148 };
149
150
151 // Run the following mongodb query:
152 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
153
154 // 1. One way that works:
155 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
156
157 // 2. Another way:
158 //String query = "{URL: /DOMAIN/, isMRI: true}";
159 String query = "{URL: /DOMAIN/, ";
160 if(filterType == IS_MRI) {
161 query += "isMRI: true}";
162 } else if(filterType == CONTAINS_MRI) {
163 query += "containsMRI: true}";
164 }
165
166 domain = domain.replace(".", "\\."); // escape dots in domain for regex
167 query = query.replace("DOMAIN", domain);
168
169 //System.err.println("Executing find query: " + query);
170
171 BasicDBObject findObj = BasicDBObject.parse(query);
172 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
173
174
175 collection.find(findObj).projection(projectionObj).forEach(storeURL);
176
177 return urlsList;
178 }
179
180 /**
181 * Does a mongoDB query like the following, depending on filter type:
182 * db.getCollection('Webpages').find({isMRI: true}).count()
183 * @param filterType can be either IS_MRI or CONTAINS_MRI.
184 * @return the number of webpages that matched the filterType setting.
185 */
186 public long countOfWebpagesMatching(int filterType) {
187 String query = (filterType == IS_MRI) ? "{isMRI: true}" : "{containsMRI: true}";
188 long result = -1;
189 MongoCollection<Document> collection = getWebpagesCollection();
190
191
192 try {
193 BasicDBObject queryObj = BasicDBObject.parse(query);
194 //result = collection.find(queryObj).count();
195 // https://stackoverflow.com/questions/32683458/how-to-call-count-operation-after-find-with-mongodb-java-driver
196 result = collection.countDocuments(queryObj);
197
198 } catch(Exception e) {
199 logger.error("MongoDB couldn't parse provided query " + query);
200 }
201
202 return result;
203 }
204
205 /**
206 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
207 *
208 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
209 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
210 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
211 *
212 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
213 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
214 * On using group(TExpression) inside collection.aggregate().
215 *
216 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
217 * and https://www.javatpoint.com/java-8-foreach
218 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
219 *
220 * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,
221 * listing all the base domain strings (unsorted with protocol and any www)
222 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
223 * matching sites.
224 *
225 * The mongodb aggregate() we want to run this time:
226 *
227 db.Websites.aggregate([
228 {
229 $match: {
230 $and: [
231 {numPagesContainingMRI: {$gt: 0}},
232 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
233 ]
234 }
235 },
236 { $unwind: "$geoLocationCountryCode" },
237 {
238 $group: {
239 _id: "nz",
240 count: { $sum: 1 },
241 domain: { $addToSet: '$domain' }
242 }
243 },
244 { $sort : { count : -1} }
245 ]);
246 */
247 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
248 // working with the WebSites collection, not WebPages collection!
249 MongoCollection<Document> collection = getWebsitesCollection();
250
251 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
252
253 // Want a counter,
254 // but lambda expressions can only take final variables and those can't be incremented
255 // But can use an array to store incrementable counter or Holder type
256 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
257 Holder<Integer> docNum = new Holder<>(0);
258
259 Bson orQuery = or(
260 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
261 BasicDBObject.parse("{domain: /\\.nz$/}")
262 );
263 Bson andQuery = and(
264 BasicDBObject.parse(mriFilterString),
265 orQuery);
266
267 // Hopefully the lambda expression (forEach()) at end means
268 // we write out each result Document as we get it
269 collection.aggregate(Arrays.asList(
270 match(andQuery),
271 unwind("$geoLocationCountryCode"),
272 group("NZ", Arrays.asList(sum("count", 1),
273 addToSet("domain", "$domain"))),
274 sort(BasicDBObject.parse("{count : -1}"))
275 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
276
277 // should only have one doc for NZ since it's a count by geolocation.
278
279 return;
280 }
281
282
283 /**
284 * Count of overseas (non-NZ and non-.nz TLD) websites
285 * containing a positive number of sentences in MRI,
286 * listing all the base domain strings (unsorted with protocol and any www)
287 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
288 * matching sites. Regardless of whether there's an mi in the URL path of any or not.
289 *
290 * The aggregate() we want to run this time:
291 *
292 db.Websites.aggregate([
293 {
294 $match: {
295 $and: [
296 {geoLocationCountryCode: {$ne: "NZ"}},
297 {domain: {$not: /\.nz/}},
298 {numPagesContainingMRI: {$gt: 0}}
299 ]
300 }
301 },
302 { $unwind: "$geoLocationCountryCode" },
303 {
304 $group: {
305 _id: {$toLower: '$geoLocationCountryCode'},
306 count: { $sum: 1 },
307 domain: { $addToSet: '$domain' }
308 }
309 },
310 { $sort : { count : -1} }
311 ]);
312 */
313 public void aggregateContainsMRIForOverseas(Writer writer, int filterType)
314 throws UncheckedIOException {
315 // working with the WebSites collection, not WebPages collection!
316 MongoCollection<Document> collection = getWebsitesCollection();
317
318 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
319
320 // Want a counter,
321 // but lambda expressions can only take final variables and those can't be incremented
322 // But can use an array to store incrementable counter or Holder
323 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
324 Holder<Integer> docNum = new Holder<>(1);
325
326 Bson andQuery = and(
327 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
328 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
329 BasicDBObject.parse(mriFilterString));
330
331 collection.aggregate(Arrays.asList(
332 match(andQuery),
333 unwind("$geoLocationCountryCode"),
334 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
335 addToSet("domain", "$domain"))),
336 sort(BasicDBObject.parse("{count : -1}"))
337 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
338
339 return;
340 }
341
342
343 /** Count by country code of overseas (non-NZ and non-nz TLD) websites
344 * containing a positive number of sentences in MRI,
345 * listing all the base domain strings (unordered and with protocol and any www)
346 * and total counts of numPagesInMRI and numPagesContainingMRI across all these
347 * matching sites.
348 *
349 * The aggregate() we want to run this time:
350 *
351 db.Websites.aggregate([
352 {
353 $match: {
354 $and: [
355 {geoLocationCountryCode: {$ne: "NZ"}},
356 {domain: {$not: /\.nz/}},
357 {numPagesContainingMRI: {$gt: 0}},
358 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
359 ]
360 }
361 },
362 { $unwind: "$geoLocationCountryCode" },
363 {
364 $group: {
365 _id: {$toLower: '$geoLocationCountryCode'},
366 count: { $sum: 1 },
367 domain: { $addToSet: '$domain' }
368 }
369 },
370 { $sort : { count : -1} }
371 ]);
372 */
373 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
374 boolean isMiInURLPath) throws UncheckedIOException
375 {
376 // working with the WebSites collection, not WebPages collection!
377 MongoCollection<Document> collection = getWebsitesCollection();
378
379 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
380
381 // Want a counter,
382 // but lambda expressions can only take final variables and those can't be incremented
383 // But can use an array to store incrementable counter or Holder
384 // https://stackoverflow.com/questions/28790784/java-8-preferred-way-to-count-iterations-of-a-lambda
385 Holder<Integer> docNum = new Holder<>(1);
386
387 /*
388 Bson orQuery = or(
389 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
390 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
391 // e.g. "{urlContainsLangCodeInPath: false}"
392 );
393 */
394 Bson andQuery = and(
395 BasicDBObject.parse(mriFilterString),
396 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
397 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
398 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}"));//orQuery);
399
400 collection.aggregate(Arrays.asList(
401 match(andQuery), //match(BasicDBObject.parse(matchQuery))
402 // match((List<DBObject>)JSON.parse(matchQuery)),
403 unwind("$geoLocationCountryCode"),
404 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
405 addToSet("domain", "$domain"))),
406 sort(BasicDBObject.parse("{count : -1}"))
407 )).forEach((Block<Document>)doc -> writeDoc(docNum.value++, doc, writer));
408
409 // casting to Block<Document> necessary because otherwise we see the error at
410 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
411
412 // Less efficient way is to keep all the results in memory and then
413 // write them out one at a time
414 /*
415 AggregateIterable<Document> output
416 = collection.aggregate(Arrays.asList(
417 match(andQuery), //match(BasicDBObject.parse(matchQuery))
418 // match((List<DBObject>)JSON.parse(matchQuery)),
419 unwind("$geoLocationCountryCode"),
420 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
421 sort(BasicDBObject.parse("{count : -1}"))
422 ));
423
424
425 for (Document doc : output) {
426 //System.out.println(doc);
427 System.out.println(doc.toJson());
428
429 }
430 */
431 return;
432 }
433
434 /** Perform the aggregates for writing out the summary tables. */
435 public String[] writeTables(File outFolder) {
436 // In this function, we're always dealing with the Websites mongodb collection.
437 MongoCollection<Document> collection = getWebsitesCollection();
438
439 String[] tableNames = {
440 "",
441 "1table_allCrawledSites",
442 "2table_sitesWithPagesInMRI",
443 "3table_sitesWithPagesContainingMRI",
444 "4table_containsMRI_exclTentativeProductSites",
445 "5table_sitesWithPagesContainingMRI_allNZGrouped",
446 "5table_sitesWithPagesInMRI_allNZGrouped"
447 };
448 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
449 File outFile = new File(outFolder, tableNames[tableNum] + ".json");
450 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
451 try (
452 Writer writer = new BufferedWriter(new FileWriter(outFile));
453 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); // quote ALL vs MINIMAL vs NON_NUMERIC fields
454 ) {
455
456 // Write out the CSV column headings
457 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
458 csvWriter.printRecord("countryCode", "siteCount",
459 "numPagesInMRI count","numPagesContainingMRICount",
460 "totalPagesAcrossMatchingSites"/*, "domain"*/);
461
462 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
463
464
465 int docNum = 0;
466 // get any NZ specific row's data, if it exists for this table
467 // and add that as the docNum="0th" doc
468 Document nzDoc = getNZTableRowData(collection, tableNum);
469 if(nzDoc != null) {
470 writeDocAsJsonRecord(docNum, nzDoc, writer);
471 writeDocAsCSVRecord(docNum, nzDoc, csvWriter);
472 }
473 // all other table row data start at 1 for docNum
474 for (Document doc : output) {
475 //System.out.println(doc);
476 writeDocAsJsonRecord(++docNum, doc, writer);
477 writeDocAsCSVRecord(++docNum, doc, csvWriter);
478 }
479
480 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
481 } catch(UncheckedIOException ioe) {
482 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
483 }
484 catch(Exception e) {
485 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
486 }
487 }
488
489 return tableNames;
490 }
491
492 public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
493
494 Document nzRowData = null;
495 switch(tableNum) {
496 case 1: case 2: case 3: case 4:
497
498 break;
499
500 //case 5:
501 //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
502 //case 6:
503 //filterQueryStr = "{numPagesInMRI: {$gt: 0}}";
504 case 5: case 6:
505 String filterQueryStr = (tableNum == 5) ?
506 "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
507
508 /* Get NZ only table data.
509 Can be numPagesContainingMRI or numPagesInMRI > 0 depending on filterQueryStr.
510
511 db.Websites.aggregate([
512 {
513 $match: {
514 $and: [
515 {numPagesContainingMRI: {$gt: 0}},
516 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz$/}]}
517 ]
518 }
519 },
520 { $unwind: "$geoLocationCountryCode" },
521 {
522 $group: {
523 _id: "NZ",
524 count: { $sum: 1 },
525 //domain: { $addToSet: '$domain' },
526 numPagesInMRICount: { $sum: '$numPagesInMRI' },
527 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
528 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
529 }
530 },
531 { $sort : { count : -1} }
532 ]);
533
534 */
535
536 Bson orQuery = or(
537 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
538 BasicDBObject.parse("{domain: /\\.nz$/}")
539 );
540 Bson andQuery = and(
541 BasicDBObject.parse(filterQueryStr), // e.g."{numPagesContainingMRI: {$gt: 0}}"
542 orQuery
543 );
544 AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
545 match(andQuery),
546 unwind("$geoLocationCountryCode"),
547 group("NZ", Arrays.asList(
548 sum("count", 1),
549 /*addToSet("domain", "$domain"),*/
550 sum("numPagesInMRICount", "$numPagesInMRI"),
551 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
552 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
553 sort(BasicDBObject.parse("{count : -1}"))
554 ));
555
556 nzRowData = output.first(); // first and only document in result
557
558 break;
559
560
561 default: logger.error("Unknown table number: " + tableNum);
562 }
563
564 return nzRowData;
565 }
566
567 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum)
568 {
569 //String filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; // only used if tableNum = 5|6
570
571 AggregateIterable<Document> output = null;
572 Bson orQuery = null;
573 Bson andQuery = null;
574
575 switch(tableNum) {
576
577 case 1:
578 /* 1table_allCrawledSites -
579
580 db.Websites.aggregate([
581 { $unwind: "$geoLocationCountryCode" },
582 {
583 $group: {
584 _id: "$geoLocationCountryCode",
585 count: { $sum: 1 },
586 //domain: { $addToSet: '$domain' },
587 numPagesInMRICount: { $sum: '$numPagesInMRI' },
588 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
589 totalPagesAcrossSites: { $sum: '$totalPages'}
590 }
591 },
592 { $sort : { count : -1} }
593 ]);
594 */
595 output = collection.aggregate(Arrays.asList(
596 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
597 unwind("$geoLocationCountryCode"),
598 group("$geoLocationCountryCode", Arrays.asList(
599 sum("count", 1),
600 /*addToSet("domain", "$domain"),*/
601 sum("numPagesInMRICount", "$numPagesInMRI"),
602 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
603 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
604 sort(BasicDBObject.parse("{count : -1}"))
605 ));
606 break;
607
608 case 2:
609 /*
610 db.Websites.aggregate([
611 { $match: { numPagesInMRI: {$gt: 0} } },
612 { $unwind: "$geoLocationCountryCode" },
613 {
614 $group: {
615 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower: _id:'$geoLocationCountryCode'
616 count: { $sum: 1 },
617 //domain: { $addToSet: '$domain' },
618 numPagesInMRICount: { $sum: '$numPagesInMRI' },
619 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
620 totalPagesAcrossSitesWithPositiveMRICount: { $sum: '$totalPages'}
621 }
622 },
623 { $sort : { count : -1} }
624 ]);
625 */
626 output = collection.aggregate(Arrays.asList(
627 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
628 unwind("$geoLocationCountryCode"),
629 group("$geoLocationCountryCode", Arrays.asList(
630 sum("count", 1),
631 /*addToSet("domain", "$domain"),*/
632 sum("numPagesInMRICount", "$numPagesInMRI"),
633 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
634 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
635 sort(BasicDBObject.parse("{count : -1}"))
636 ));
637 break;
638
639 case 3:
640 /*
641 db.Websites.aggregate([
642 {
643 $match: { numPagesContainingMRI: {$gt: 0} }
644 },
645 { $unwind: "$geoLocationCountryCode" },
646 {
647 $group: {
648 _id: '$geoLocationCountryCode',
649 count: { $sum: 1 },
650 //domain: { $addToSet: '$domain' },
651 numPagesInMRICount: { $sum: '$numPagesInMRI' },
652 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
653 totalPagesAcrossSitesWithPosContainsMRI: { $sum: '$totalPages'}
654 }
655 },
656 { $sort : { count : -1} }
657 ]);
658 */
659 output = collection.aggregate(Arrays.asList(
660 match(BasicDBObject.parse("{ numPagesContainingMRI: {$gt: 0} }")),
661 unwind("$geoLocationCountryCode"),
662 group("$geoLocationCountryCode", Arrays.asList(
663 sum("count", 1),
664 /*addToSet("domain", "$domain"),*/
665 sum("numPagesInMRICount", "$numPagesInMRI"),
666 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
667 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
668 sort(BasicDBObject.parse("{count : -1}"))
669 ));
670 break;
671
672 case 4:
673 /*
674 db.Websites.aggregate([
675 {
676 $match: {
677 $and: [
678 {numPagesContainingMRI: {$gt: 0}},
679 {$or: [{geoLocationCountryCode: /(NZ|AU)/}, {domain: /\.nz$/}, {urlContainsLangCodeInPath: false}]}
680 ]
681 }
682 },
683 { $unwind: "$geoLocationCountryCode" },
684 {
685 $group: {
686 _id: {$toLower: '$geoLocationCountryCode'},
687 count: { $sum: 1 },
688 //domain: { $addToSet: '$domain' },
689 numPagesInMRICount: { $sum: '$numPagesInMRI' },
690 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
691 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
692 }
693 },
694 { $sort : { count : -1} }
695 ]);
696 */
697
698 orQuery = or(
699 BasicDBObject.parse("{geoLocationCountryCode: /(NZ|AU)/}"),
700 //BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
701 BasicDBObject.parse("{domain: /\\.nz$/}"),
702 BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
703 );
704 andQuery = and(
705 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
706 orQuery);
707 output = collection.aggregate(Arrays.asList(
708 match(andQuery),
709 unwind("$geoLocationCountryCode"),
710 group("$geoLocationCountryCode", Arrays.asList(
711 sum("count", 1),
712 /*addToSet("domain", "$domain"),*/
713 sum("numPagesInMRICount", "$numPagesInMRI"),
714 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
715 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
716 sort(BasicDBObject.parse("{count : -1}"))
717 ));
718 break;
719 //case 5:
720 //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
721 //case 6:
722 //filterQueryStr = "{numPagesInMRI: {$gt: 0}}";
723 case 5: case 6:
724 String filterQueryStr = (tableNum == 5) ?
725 "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
726 /*
727 Table of count by countryCode of sites with numPagesContainingMRI > 0
728 (or numPagesInMRI > 0).
729 Just do OVERSEAS here, NZ handled separately
730
731 db.Websites.aggregate([
732 {
733 $match: {
734 $and: [
735 {geoLocationCountryCode: {$ne: "NZ"}},
736 {domain: {$not: /\.nz$/}},
737 {numPagesContainingMRI: {$gt: 0}}
738 ]
739 }
740 },
741 { $unwind: "$geoLocationCountryCode" },
742 {
743 $group: {
744 _id: '$geoLocationCountryCode',
745 count: { $sum: 1 },
746 //domain: { $addToSet: '$domain' },
747 numPagesInMRICount: { $sum: '$numPagesInMRI' },
748 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' },
749 totalPagesAcrossMatchingSites: { $sum: '$totalPages'}
750 }
751 },
752 { $sort : { count : -1} }
753 ]);
754 */
755
756 andQuery = and(
757 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
758 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
759 BasicDBObject.parse(filterQueryStr) // e.g. "{numPagesContainingMRI: {$gt: 0}}"
760 );
761 output = collection.aggregate(Arrays.asList(
762 match(andQuery),
763 unwind("$geoLocationCountryCode"),
764 group("$geoLocationCountryCode", Arrays.asList(
765 sum("count", 1),
766 /*addToSet("domain", "$domain"),*/
767 sum("numPagesInMRICount", "$numPagesInMRI"),
768 sum("numPagesContainingMRICount", "$numPagesContainingMRI"),
769 sum("totalPagesAcrossMatchingSites", "$totalPages"))),
770 sort(BasicDBObject.parse("{count : -1}"))
771 ));
772
773 break;
774 default: logger.error("Unknown table number: " + tableNum);
775 }
776
777 return output;
778
779 }
780
781
782
783 /**
784 * called by lambda forEach() call on Document objects to write them out to a file.
785 * Have to deal with unreported exceptions here that can't be dealt with when doing
786 * the actual forEach(). See
787 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
788 */
789 public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException {
790
791 // If there's a domain field in the json Doc, sort this domain listing alphabetically
792 Object domainList = doc.remove("domain");
793 ///logger.info("CLASS: " + domainList.getClass());
794 if(domainList != null) {
795 List sortedList = sortAlphabetically(domainList);
796 doc.put("uniqueCount", sortedList.size());
797 doc.put("domain", sortedList);
798 }
799
800 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
801 // Can't control json output to add newlines after each array element,
802 // no matter which JsonMode is used.
803
804 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
805 // Still can't control array element output,
806 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
807 //JsonWriterSettings writeSettings = new JsonWriterSettings();
808 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
809 //writer.write(doc.toJson(writeSettings) + NEWLINE);
810
811 // Not the JsonWriter of mongodb java driver:
812 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
813
814 // Have to use gson's pretty print to produce a json string that contains
815 // newlines after every array element in the json:
816
817 String jsonStr = prettyPrintJson(doc.toJson());
818 //System.err.println(jsonStr);
819 try {
820 writer.write("/* " + docNum + " */" + NEWLINE);
821 writer.write(jsonStr + NEWLINE + NEWLINE);
822 } catch (IOException ex) {
823 //throw ex;
824 throw new UncheckedIOException(ex);
825 }
826 }
827
828 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
829 String jsonStr = prettyPrintJson(doc.toJson());
830 //System.err.println(jsonStr);
831 try {
832 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
833 } catch (IOException ex) {
834 //throw ex;
835 throw new UncheckedIOException(ex);
836 }
837 }
838
839 // TODO
840 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
841 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
842 String jsonStr = doc.toJson();
843 JsonParser parser = new JsonParser();
844 JsonElement json = parser.parse(jsonStr);
845
846 JsonObject jsonObj = (JsonObject)json;
847
848 String countryCode = jsonObj.get("_id").getAsString();
849 int siteCount = jsonObj.get("count").getAsInt();
850 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
851 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
852 int totalPagesAcrossMatchingSites = jsonObj.get("totalPagesAcrossMatchingSites").getAsInt();
853
854 //System.err.println(jsonStr);
855 try {
856 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
857 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount, totalPagesAcrossMatchingSites);
858 } catch (IOException ex) {
859 //throw ex;
860 throw new UncheckedIOException(ex);
861 }
862 }
863
864 public String prettyPrintJson(String jsonStr) {
865 Gson gson = new GsonBuilder().setPrettyPrinting().create();
866 JsonParser jp = new JsonParser();
867 JsonElement je = jp.parse(jsonStr);
868 String prettyJsonString = gson.toJson(je);
869 return prettyJsonString;
870 }
871
872 private List sortAlphabetically(Object list) {
873 //BsonArray domainList = (BsonArray)list;
874 ArrayList<BsonValue> domainList = (ArrayList<BsonValue>)list;
875 /*
876 // for(String domain : domainList) {
877 for(int i = domainList.size() - 1; i >= 0; i--) {
878 BsonString domain = domainList.get(i).asString();
879 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
880 domainList.set(i, new BsonString(domainStr));
881 }
882 Collections.sort(domainList);
883 // still need to get rid of non-unique values...
884 */
885
886 TreeSet<String> set = new TreeSet<String>();
887 for(int i = domainList.size() - 1; i >= 0; i--) {
888 ///BsonValue val = domainList.get(i);
889 ///BsonString domain = val.asString();
890 //BsonString domain = domainList.get(i).asString();
891 //String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
892 Object domain = domainList.get(i);
893 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
894 set.add(domainStr);
895 //domainList.set(i, new BsonString(domainStr));
896 }
897
898 domainList = new ArrayList<BsonValue>(); //new BsonArray();
899 for(String s : set) {
900 domainList.add(new BsonString(s));
901 }
902 return domainList;
903 }
904
905}
Note: See TracBrowser for help on using the repository browser.