source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33887

Last change on this file since 33887 was 33887, checked in by ak19, 4 years ago
  1. Added support for writing out tables in csv format too. 2. Second table written out now. 3. Moved getFilePath() into Utility.
File size: 26.4 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7import com.mongodb.client.MongoDatabase;
8//import com.mongodb.client.MongoIterable;
9
10// to use collection.find() filters like eq(), regex() etc
11import static com.mongodb.client.model.Filters.*;
12// to use collection.find().projection() filters like include() etc
13import static com.mongodb.client.model.Projections.*;
14// to use aggregation functions like unwind(), match(), sort() etc
15import static com.mongodb.client.model.Aggregates.*;
16// to use functions like sum() and addToSet() within aggregation functions
17import static com.mongodb.client.model.Accumulators.*;
18
19//import org.bson.conversions.Bson;
20import com.mongodb.BasicDBObject;
21import com.mongodb.MongoClient;
22import com.mongodb.MongoCredential;
23import com.mongodb.ServerAddress;
24import com.mongodb.MongoClientOptions;
25
26import com.mongodb.Block;
27
28import org.bson.Document;
29import org.bson.conversions.Bson;
30import org.bson.json.JsonMode;
31import org.bson.json.JsonWriterSettings;
32
33import com.mongodb.util.JSON;
34//import com.mongodb.DBObject;
35
36
37import com.google.gson.*; // for pretty printing
38
39import java.io.BufferedReader;
40import java.io.BufferedWriter;
41import java.io.File;
42import java.io.FileReader;
43import java.io.FileWriter;
44import java.io.IOException;
45import java.io.UncheckedIOException;
46import java.io.Writer;
47
48import java.util.Arrays;
49import java.util.ArrayList;
50import java.util.List;
51import java.util.Properties;
52import java.util.regex.Pattern;
53
54import org.apache.log4j.Logger;
55
56import org.greenstone.atea.morphia.*;
57import dev.morphia.*;
58
59import org.apache.commons.csv.*;
60
61/**
62 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
63 *
64 * TO COMPILE:
65 * maori-lang-detection/src$
66 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
67 *
68 * TO RUN:
69 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
70 *
71 * Manually connecting to mongodb from client:
72 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
73 * Then after connecting with pwd, type:
74 * use DBNAME
75 *
76 * Or connect to mongodb and specify db in one statement:
77 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
78 *
79 * Some links:
80 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
81 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
82 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
83 * IMPORTANT LINK:
84 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
85 *
86 * API:
87 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
88 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
89 */
90public class MongoDBAccess implements AutoCloseable {
91
92 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
93
94 static final String PROPS_FILENAME = "config.properties";
95 public static final String WEBPAGES_COLLECTION = "Webpages";
96 public static final String WEBSITES_COLLECTION = "Websites";
97
98 public static final String NEWLINE = System.getProperty("line.separator");
99
100 /** mongodb filter types to execute */
101 public static final int IS_MRI = 0;
102 public static final int CONTAINS_MRI = 1;
103
104 /** Some reused fieldnames in the Websites collection */
105 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
106 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
107
108 // configuration details, some with fallback values
109 private String HOST = "localhost";
110 private int PORT = 27017; // mongodb port
111 private String USERNAME;
112 private String PASSWORD;
113 private String DB_NAME ="ateacrawldata";
114
115 private MongoClient mongo = null;
116 private MongoDatabase database = null;
117
118 /**
119 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
120 * for MongoDB
121 */
122 public Datastore datastore = null;
123
124 public MongoDBAccess() throws Exception {
125 boolean success = false;
126
127 // Read in the username and password from our props file
128 Properties props = new Properties();
129
130 //File propsFile = new File(PROPS_FILENAME);
131 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
132 try {
133 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
134 } catch(Exception e) {
135 logger.error(e);
136 }
137
138
139 USERNAME = props.getProperty("mongodb.user", "");
140 if(USERNAME.equals("")) {
141 USERNAME = "root";
142 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
143 }
144 PASSWORD = props.getProperty("mongodb.pwd");
145
146 logger.debug("Got pwd: " + PASSWORD);
147
148 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
149
150 success = false;
151 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
152 }
153
154 HOST = props.getProperty("mongodb.host", HOST);
155 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
156 PORT = Integer.parseInt(port);
157 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
158
159 logger.info("Connecting to mongodb with:");
160 logger.info(" - host: " + HOST);
161 logger.info(" - port: " + PORT);
162 logger.info(" - user: " + USERNAME);
163 logger.info(" - db name: " + DB_NAME);
164 }
165
166 /**
167 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
168 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
169 */
170 public void connectToDB() throws Exception {
171
172 // Creating a Mongo client
173 mongo = new MongoClient( HOST, PORT );
174
175 // Creating Credentials
176 MongoCredential credential;
177 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
178 System.out.println("Connected to the database successfully");
179
180 // Accessing the database
181 this.database = mongo.getDatabase(DB_NAME);
182 logger.info("Credentials: "+ credential);
183
184 /*
185 MongoCredential credential;
186 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
187 logger.info("Credentials: "+ credential);
188
189 // Create our Mongo client
190 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
191 System.out.println("Connected to the database successfully");
192
193 this.database = mongo.getDatabase(DB_NAME);
194 */
195
196 Morphia morphia = new Morphia();
197 morphia.mapPackage("com.greenstone.atea.morphia");
198 datastore = morphia.createDatastore(mongo, DB_NAME);
199 datastore.ensureIndexes();
200
201 }
202
203 // TODO: which fields should be indexed?
204
205 public void showCollections() {
206 //MongoIterable<String> colls = this.database.listCollectionNames();
207 for(String coll : this.database.listCollectionNames()) {
208 System.err.println("coll: " + coll);
209 }
210 }
211
212 /*
213 public void insertWebsiteInfo(WebsiteInfo website)
214 {
215 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
216 Document document = new Document("_id", website.id)
217 .append("siteFolderName", website.siteFolderName)
218 .append("domain", website.domain)
219 .append("totalPages", website.totalPages)
220 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
221 .append("numPagesInMRI", website.numPagesInMRI)
222 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
223 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
224 .append("redoCrawl", website.redoCrawl);
225
226 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
227 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
228 document.put("countryCode", website.geoLocationCountryCode);
229 }
230
231 collection.insertOne(document);
232 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
233 + " inserted successfully into " + WEBSITES_COLLECTION);
234 }
235 */
236
237 /**
238 * Inserts a web page into the mongodb. Besides page related metadata and full body text
239 * the language information per sentence and per 2 adjacent sentences also get stored
240 * into the mongodb.
241 */
242 /*
243 public void insertWebpageInfo(WebpageInfo webpage)
244 {
245 int mri_sentence_count = 0;
246
247 // load the webpages db 'table'
248 // in mongodb, the equivalent of db tables are called 'collections'
249 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
250
251 Document document = new Document("_id", webpage.webpageID)
252 .append("siteid", webpage.websiteID)
253 .append("url", webpage.URL)
254 .append("isMRI", webpage.isMRI)
255 .append("totalSentences", webpage.totalSentences)
256 .append("charEncoding", webpage.charEncoding)
257 .append("modTime", webpage.modifiedTime)
258 .append("fetchTime", webpage.fetchTime);
259
260 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
261 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
262 List<BasicDBObject> sentencesList = new ArrayList<>();
263 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
264
265 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
266
267 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
268 bsonRecord.put("sentence", sentenceInfo.sentence);
269
270 sentencesList.add(bsonRecord);
271
272 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
273 mri_sentence_count++;
274 }
275
276 }
277 document.put("singleSentences", sentencesList);
278
279 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
280 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
281
282 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
283 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
284 bsonRecord.put("sentence", sentenceInfo.sentence);
285
286 overlappingSentencesList.add(bsonRecord);
287 }
288 document.put("overlappingSentences", overlappingSentencesList);
289
290 // also put the full text in there
291 document.put("text", webpage.text);
292
293 // also store the count of sentences in MRI
294 webpage.setMRISentenceCount(mri_sentence_count);
295 document.put("mriSentenceCount", mri_sentence_count);
296
297
298 collection.insertOne(document);
299 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
300 }
301 */
302
303 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
304 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
305 }
306 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
307 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
308 }
309
310 /**
311 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
312 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
313 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
314 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
315 *
316 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
317 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
318 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
319 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
320 */
321 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
322
323 final ArrayList<String> urlsList = new ArrayList<String>();
324
325 // remove any http(s)://(www.) from the start of URL first
326 // since it goes into a regex
327 domain = Utility.stripProtocolAndWWWFromURL(domain);
328
329 // load the "webpages" db table
330 // in mongodb, the equivalent of db tables are called 'collections'
331 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
332
333 // code we'll execute in Iterable.forEach() below
334 // see also https://www.baeldung.com/foreach-java
335 Block<Document> storeURL = new Block<Document>() {
336 @Override
337 public void apply(final Document document) {
338 //System.out.println(document.toJson());
339 String url = document.getString("URL");
340 // add to our urlsList
341 //System.out.println(url);
342 urlsList.add(url);
343 }
344 };
345
346
347 // Run the following mongodb query:
348 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
349
350 // 1. One way that works:
351 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
352
353 // 2. Another way:
354 //String query = "{URL: /DOMAIN/, isMRI: true}";
355 String query = "{URL: /DOMAIN/, ";
356 if(filterType == IS_MRI) {
357 query += "isMRI: true}";
358 } else if(filterType == CONTAINS_MRI) {
359 query += "containsMRI: true}";
360 }
361
362 domain = domain.replace(".", "\\."); // escape dots in domain for regex
363 query = query.replace("DOMAIN", domain);
364
365 //System.err.println("Executing find query: " + query);
366
367 BasicDBObject findObj = BasicDBObject.parse(query);
368 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
369
370
371 collection.find(findObj).projection(projectionObj).forEach(storeURL);
372
373 return urlsList;
374 }
375
376 /**
377 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
378 *
379 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
380 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
381 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
382 *
383 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
384 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
385 * On using group(TExpression) inside collection.aggregate().
386 *
387 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
388 * and https://www.javatpoint.com/java-8-foreach
389 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
390 *
391 *
392 * The mongodb aggregate() we want to run this time:
393 *
394 db.Websites.aggregate([
395 {
396 $match: {
397 $and: [
398 {numPagesContainingMRI: {$gt: 0}},
399 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
400 ]
401 }
402 },
403 { $unwind: "$geoLocationCountryCode" },
404 {
405 $group: {
406 _id: "nz",
407 count: { $sum: 1 },
408 domain: { $addToSet: '$domain' }
409 }
410 },
411 { $sort : { count : -1} }
412 ]);
413 */
414 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
415 // working with the WebSites collection, not WebPages collection!
416 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
417
418 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
419
420 Bson orQuery = or(
421 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
422 BasicDBObject.parse("{domain: /\\.nz/}")
423 );
424 Bson andQuery = and(
425 BasicDBObject.parse(mriFilterString),
426 orQuery);
427
428 // Hopefully the lambda expression (forEach()) at end means
429 // we write out each result Document as we get it
430 collection.aggregate(Arrays.asList(
431 match(andQuery),
432 unwind("$geoLocationCountryCode"),
433 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
434 sort(BasicDBObject.parse("{count : -1}"))
435 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
436
437 // should only have one doc for NZ since it's a count by geolocation.
438
439 return;
440 }
441
442 /**
443 * The aggregate() we want to run this time:
444 *
445 db.Websites.aggregate([
446 {
447 $match: {
448 $and: [
449 {geoLocationCountryCode: {$ne: "NZ"}},
450 {domain: {$not: /\.nz/}},
451 {numPagesContainingMRI: {$gt: 0}},
452 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
453 ]
454 }
455 },
456 { $unwind: "$geoLocationCountryCode" },
457 {
458 $group: {
459 _id: {$toLower: '$geoLocationCountryCode'},
460 count: { $sum: 1 },
461 domain: { $addToSet: '$domain' }
462 }
463 },
464 { $sort : { count : -1} }
465 ]);
466 */
467 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
468 boolean isMiInURLPath) throws UncheckedIOException
469 {
470 // working with the WebSites collection, not WebPages collection!
471 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
472
473 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
474
475 Bson orQuery = or(
476 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
477 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
478 // e.g. "{urlContainsLangCodeInPath: false}"
479 );
480 Bson andQuery = and(
481 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
482 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
483 BasicDBObject.parse(mriFilterString),
484 orQuery);
485
486
487 collection.aggregate(Arrays.asList(
488 match(andQuery), //match(BasicDBObject.parse(matchQuery))
489 // match((List<DBObject>)JSON.parse(matchQuery)),
490 unwind("$geoLocationCountryCode"),
491 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
492 sort(BasicDBObject.parse("{count : -1}"))
493 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
494
495 // casting to Block<Document> necessary because otherwise we see the error at
496 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
497
498 // Less efficient way is to keep all the results in memory and then
499 // write them out one at a time
500 /*
501 AggregateIterable<Document> output
502 = collection.aggregate(Arrays.asList(
503 match(andQuery), //match(BasicDBObject.parse(matchQuery))
504 // match((List<DBObject>)JSON.parse(matchQuery)),
505 unwind("$geoLocationCountryCode"),
506 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
507 sort(BasicDBObject.parse("{count : -1}"))
508 ));
509
510
511 for (Document doc : output) {
512 //System.out.println(doc);
513 System.out.println(doc.toJson());
514
515 }
516 */
517 return;
518 }
519
520 /** Do the aggregates for writing out tables.
521 Table1:
522
523 */
524 public void writeTables(File outFolder) {
525 // In this function, we're always dealing with the Websites mongodb collection.
526 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
527
528 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};
529 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
530 File outFile = new File(outFolder, tableNames[tableNum] + ".json");
531 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
532 try (
533 Writer writer = new BufferedWriter(new FileWriter(outFile));
534 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);
535 ) {
536
537 // Write out the CSV column headings
538 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
539 csvWriter.printRecord("countryCode", "siteCount",
540 "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/);
541
542 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
543
544 int docNum = 0;
545 for (Document doc : output) {
546 //System.out.println(doc);
547 writeDocAsJsonRecord(++docNum, doc, writer);
548 writeDocAsCSVRecord(++docNum, doc, csvWriter);
549 }
550 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
551 } catch(UncheckedIOException ioe) {
552 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
553 }
554 catch(Exception e) {
555 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
556 }
557 }
558 }
559
560 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {
561
562 AggregateIterable<Document> output = null;
563
564 switch(tableNum) {
565
566 case 1:
567 /* 1table_allCrawledSites -
568
569 db.Websites.aggregate([
570 { $unwind: "$geoLocationCountryCode" },
571 {
572 $group: {
573 _id: "$geoLocationCountryCode",
574 count: { $sum: 1 },
575 //domain: { $addToSet: '$domain' },
576 numPagesInMRICount: { $sum: '$numPagesInMRI' },
577 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
578 }
579 },
580 { $sort : { count : -1} }
581 ]);
582 */
583 output = collection.aggregate(Arrays.asList(
584 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
585 unwind("$geoLocationCountryCode"),
586 group("$geoLocationCountryCode", Arrays.asList(
587 sum("count", 1),
588 /*addToSet("domain", "$domain"),*/
589 sum("numPagesInMRICount", "$numPagesInMRI"),
590 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
591 sort(BasicDBObject.parse("{count : -1}"))
592 ));
593 break;
594
595 case 2:
596 /*
597 db.Websites.aggregate([
598 { $match: { numPagesInMRI: {$gt: 0} } },
599 { $unwind: "$geoLocationCountryCode" },
600 {
601 $group: {
602 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower
603 count: { $sum: 1 },
604 //domain: { $addToSet: '$domain' },
605 numPagesInMRICount: { $sum: '$numPagesInMRI' },
606 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
607 }
608 },
609 { $sort : { count : -1} }
610 ]);
611 */
612 output = collection.aggregate(Arrays.asList(
613 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
614 unwind("$geoLocationCountryCode"),
615 group("$geoLocationCountryCode", Arrays.asList(
616 sum("count", 1),
617 /*addToSet("domain", "$domain"),*/
618 sum("numPagesInMRICount", "$numPagesInMRI"),
619 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
620 sort(BasicDBObject.parse("{count : -1}"))
621 ));
622 break;
623
624 default: logger.error("Unknown table number: " + tableNum);
625
626 }
627
628 return output;
629
630 }
631
632
633
634 /**
635 * called by lambda forEach() call on Document objects to write them out to a file.
636 * Have to deal with unreported exceptions here that can't be dealt with when doing
637 * the actual forEach(). See
638 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
639 */
640 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
641 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
642 // Can't control json output to add newlines after each array element,
643 // no matter which JsonMode is used.
644
645 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
646 // Still can't control array element output,
647 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
648 //JsonWriterSettings writeSettings = new JsonWriterSettings();
649 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
650 //writer.write(doc.toJson(writeSettings) + NEWLINE);
651
652 // Not the JsonWriter of mongodb java driver:
653 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
654
655 // Have to use gson's pretty print to produce a json string that contains
656 // newlines after every array element in the json:
657 String jsonStr = prettyPrintJson(doc.toJson());
658 //System.err.println(jsonStr);
659 try {
660 writer.write(jsonStr + NEWLINE);
661 } catch (IOException ex) {
662 //throw ex;
663 throw new UncheckedIOException(ex);
664 }
665 }
666
667 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
668 String jsonStr = prettyPrintJson(doc.toJson());
669 //System.err.println(jsonStr);
670 try {
671 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
672 } catch (IOException ex) {
673 //throw ex;
674 throw new UncheckedIOException(ex);
675 }
676 }
677
678 // TODO
679 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
680 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
681 String jsonStr = doc.toJson();
682 JsonParser parser = new JsonParser();
683 JsonElement json = parser.parse(jsonStr);
684
685 JsonObject jsonObj = (JsonObject)json;
686
687 String countryCode = jsonObj.get("_id").getAsString();
688 int siteCount = jsonObj.get("count").getAsInt();
689 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
690 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
691
692 //System.err.println(jsonStr);
693 try {
694 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
695 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);
696 } catch (IOException ex) {
697 //throw ex;
698 throw new UncheckedIOException(ex);
699 }
700 }
701
702 public String prettyPrintJson(String jsonStr) {
703 Gson gson = new GsonBuilder().setPrettyPrinting().create();
704 JsonParser jp = new JsonParser();
705 JsonElement je = jp.parse(jsonStr);
706 String prettyJsonString = gson.toJson(je);
707 return prettyJsonString;
708 }
709
710
711 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
712 public void close() {}
713
714
// TODO:
// In the database, need to ensure that the websites and webpages collections
// (a collection is the equivalent of a table in an RDBMS) exist, and create
// them if they don't.
// The webpages collection will have sentences embedded, based on my decisions from
// reading the series
// https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
// Then need functions:
// insertWebsiteDocument()
// insertWebpageDocument()
724
725 public static void main(String args[]) {
726 try {
727 MongoDBAccess mongodbCon = new MongoDBAccess();
728 mongodbCon.connectToDB();
729 mongodbCon.showCollections();
730
731 } catch(Exception e) {
732 e.printStackTrace();
733 }
734 }
735}
Note: See TracBrowser for help on using the repository browser.