source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33881

Last change on this file since 33881 was 33881, checked in by ak19, 4 years ago

Uses a lambda expression to process each doc in a mongodb aggregate result. Hopefully doing so means each result Document is processed as soon as it is obtained, which should be more efficient than first storing all the results and then processing them, even though that older code was more legible.

File size: 20.6 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7import com.mongodb.client.MongoDatabase;
8//import com.mongodb.client.MongoIterable;
9
10// to use collection.find() filters like eq(), regex() etc
11import static com.mongodb.client.model.Filters.*;
12// to use collection.find().projection() filters like include() etc
13import static com.mongodb.client.model.Projections.*;
14// to use aggregation functions like unwind(), match(), sort() etc
15import static com.mongodb.client.model.Aggregates.*;
16// to use functions like sum() and addToSet() within aggregation functions
17import static com.mongodb.client.model.Accumulators.*;
18
19
20//import org.bson.conversions.Bson;
21import com.mongodb.BasicDBObject;
22import com.mongodb.MongoClient;
23import com.mongodb.MongoCredential;
24import com.mongodb.ServerAddress;
25import com.mongodb.MongoClientOptions;
26
27import com.mongodb.Block;
28
29import org.bson.Document;
30import org.bson.conversions.Bson;
31import org.bson.json.JsonMode;
32import org.bson.json.JsonWriterSettings;
33
34import com.mongodb.util.JSON;
35//import com.mongodb.DBObject;
36
37
38import com.google.gson.*; // for pretty printing
39
40import java.io.BufferedReader;
41import java.io.File;
42import java.io.FileReader;
43import java.io.IOException;
44import java.io.UncheckedIOException;
45import java.io.Writer;
46
47import java.util.Arrays;
48import java.util.ArrayList;
49import java.util.List;
50import java.util.Properties;
51import java.util.regex.Pattern;
52
53import org.apache.log4j.Logger;
54
55import org.greenstone.atea.morphia.*;
56import dev.morphia.*;
57
58/**
59 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
60 *
61 * TO COMPILE:
62 * maori-lang-detection/src$
63 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
64 *
65 * TO RUN:
66 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
67 *
68 * Manually connecting to mongodb from client:
69 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
70 * Then after connecting with pwd, type:
71 * use DBNAME
72 *
73 * Or connect to mongodb and specify db in one statement:
74 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
75 *
76 * Some links:
77 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
78 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
79 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
80 * IMPORTANT LINK:
81 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
82 *
83 */
84public class MongoDBAccess implements AutoCloseable {
85
86 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
87
88 static final String PROPS_FILENAME = "config.properties";
89 public static final String WEBPAGES_COLLECTION = "Webpages";
90 public static final String WEBSITES_COLLECTION = "Websites";
91
92 public static final String NEWLINE = System.getProperty("line.separator");
93
94 /** mongodb filter types to execute */
95 public static final int IS_MRI = 0;
96 public static final int CONTAINS_MRI = 1;
97
98 // configuration details, some with fallback values
99 private String HOST = "localhost";
100 private int PORT = 27017; // mongodb port
101 private String USERNAME;
102 private String PASSWORD;
103 private String DB_NAME ="ateacrawldata";
104
105 private MongoClient mongo = null;
106 private MongoDatabase database = null;
107
108 /**
109 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
110 * for MongoDB
111 */
112 public Datastore datastore = null;
113
114 public MongoDBAccess() throws Exception {
115 boolean success = false;
116
117 // Read in the username and password from our props file
118 Properties props = new Properties();
119
120 //File propsFile = new File(PROPS_FILENAME);
121 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
122 try {
123 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
124 } catch(Exception e) {
125 logger.error(e);
126 }
127
128
129 USERNAME = props.getProperty("mongodb.user", "");
130 if(USERNAME.equals("")) {
131 USERNAME = "root";
132 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
133 }
134 PASSWORD = props.getProperty("mongodb.pwd");
135
136 logger.debug("Got pwd: " + PASSWORD);
137
138 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
139
140 success = false;
141 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
142 }
143
144 HOST = props.getProperty("mongodb.host", HOST);
145 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
146 PORT = Integer.parseInt(port);
147 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
148
149 logger.info("Connecting to mongodb with:");
150 logger.info(" - host: " + HOST);
151 logger.info(" - port: " + PORT);
152 logger.info(" - user: " + USERNAME);
153 logger.info(" - db name: " + DB_NAME);
154 }
155
156 /**
157 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
158 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
159 */
160 public void connectToDB() throws Exception {
161
162 // Creating a Mongo client
163 mongo = new MongoClient( HOST, PORT );
164
165 // Creating Credentials
166 MongoCredential credential;
167 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
168 System.out.println("Connected to the database successfully");
169
170 // Accessing the database
171 this.database = mongo.getDatabase(DB_NAME);
172 logger.info("Credentials: "+ credential);
173
174 /*
175 MongoCredential credential;
176 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
177 logger.info("Credentials: "+ credential);
178
179 // Create our Mongo client
180 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
181 System.out.println("Connected to the database successfully");
182
183 this.database = mongo.getDatabase(DB_NAME);
184 */
185
186 Morphia morphia = new Morphia();
187 morphia.mapPackage("com.greenstone.atea.morphia");
188 datastore = morphia.createDatastore(mongo, DB_NAME);
189 datastore.ensureIndexes();
190
191 }
192
193 // TODO: which fields should be indexed?
194
195 public void showCollections() {
196 //MongoIterable<String> colls = this.database.listCollectionNames();
197 for(String coll : this.database.listCollectionNames()) {
198 System.err.println("coll: " + coll);
199 }
200 }
201
202 /*
203 public void insertWebsiteInfo(WebsiteInfo website)
204 {
205 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
206 Document document = new Document("_id", website.id)
207 .append("siteFolderName", website.siteFolderName)
208 .append("domain", website.domain)
209 .append("totalPages", website.totalPages)
210 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
211 .append("numPagesInMRI", website.numPagesInMRI)
212 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
213 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
214 .append("redoCrawl", website.redoCrawl);
215
216 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
217 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
218 document.put("countryCode", website.geoLocationCountryCode);
219 }
220
221 collection.insertOne(document);
222 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
223 + " inserted successfully into " + WEBSITES_COLLECTION);
224 }
225 */
226
227 /**
228 * Inserts a web page into the mongodb. Besides page related metadata and full body text
229 * the language information per sentence and per 2 adjacent sentences also get stored
230 * into the mongodb.
231 */
232 /*
233 public void insertWebpageInfo(WebpageInfo webpage)
234 {
235 int mri_sentence_count = 0;
236
237 // load the webpages db 'table'
238 // in mongodb, the equivalent of db tables are called 'collections'
239 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
240
241 Document document = new Document("_id", webpage.webpageID)
242 .append("siteid", webpage.websiteID)
243 .append("url", webpage.URL)
244 .append("isMRI", webpage.isMRI)
245 .append("totalSentences", webpage.totalSentences)
246 .append("charEncoding", webpage.charEncoding)
247 .append("modTime", webpage.modifiedTime)
248 .append("fetchTime", webpage.fetchTime);
249
250 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
251 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
252 List<BasicDBObject> sentencesList = new ArrayList<>();
253 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
254
255 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
256
257 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
258 bsonRecord.put("sentence", sentenceInfo.sentence);
259
260 sentencesList.add(bsonRecord);
261
262 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
263 mri_sentence_count++;
264 }
265
266 }
267 document.put("singleSentences", sentencesList);
268
269 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
270 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
271
272 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
273 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
274 bsonRecord.put("sentence", sentenceInfo.sentence);
275
276 overlappingSentencesList.add(bsonRecord);
277 }
278 document.put("overlappingSentences", overlappingSentencesList);
279
280 // also put the full text in there
281 document.put("text", webpage.text);
282
283 // also store the count of sentences in MRI
284 webpage.setMRISentenceCount(mri_sentence_count);
285 document.put("mriSentenceCount", mri_sentence_count);
286
287
288 collection.insertOne(document);
289 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
290 }
291 */
292
293 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
294 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
295 }
296 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
297 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
298 }
299
300 /**
301 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
302 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
303 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
304 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
305 *
306 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
307 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
308 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
309 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
310 */
311 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
312
313 final ArrayList<String> urlsList = new ArrayList<String>();
314
315 // remove any http(s)://(www.) from the start of URL first
316 // since it goes into a regex
317 domain = Utility.stripProtocolAndWWWFromURL(domain);
318
319 // load the "webpages" db table
320 // in mongodb, the equivalent of db tables are called 'collections'
321 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
322
323 // code we'll execute in Iterable.forEach() below
324 // see also https://www.baeldung.com/foreach-java
325 Block<Document> storeURL = new Block<Document>() {
326 @Override
327 public void apply(final Document document) {
328 //System.out.println(document.toJson());
329 String url = document.getString("URL");
330 // add to our urlsList
331 //System.out.println(url);
332 urlsList.add(url);
333 }
334 };
335
336
337 // Run the following mongodb query:
338 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
339
340 // 1. One way that works:
341 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
342
343 // 2. Another way:
344 //String query = "{URL: /DOMAIN/, isMRI: true}";
345 String query = "{URL: /DOMAIN/, ";
346 if(filterType == IS_MRI) {
347 query += "isMRI: true}";
348 } else if(filterType == CONTAINS_MRI) {
349 query += "containsMRI: true}";
350 }
351
352 domain = domain.replace(".", "\\."); // escape dots in domain for regex
353 query = query.replace("DOMAIN", domain);
354
355 //System.err.println("Executing find query: " + query);
356
357 BasicDBObject findObj = BasicDBObject.parse(query);
358 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
359
360
361 collection.find(findObj).projection(projectionObj).forEach(storeURL);
362
363 return urlsList;
364 }
365
366 /**
367 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
368 *
369 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
370 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
371 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
372 *
373 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
374 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
375 * On using group(TExpression) inside collection.aggregate().
376 *
377 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
378 * and https://www.javatpoint.com/java-8-foreach
379 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
380 *
381 *
382 * The mongodb aggregate() we want to run this time:
383 *
384 db.Websites.aggregate([
385 {
386 $match: {
387 $and: [
388 {numPagesContainingMRI: {$gt: 0}},
389 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
390 ]
391 }
392 },
393 { $unwind: "$geoLocationCountryCode" },
394 {
395 $group: {
396 _id: "nz",
397 count: { $sum: 1 },
398 domain: { $addToSet: '$domain' }
399 }
400 },
401 { $sort : { count : -1} }
402 ]);
403 */
404 public void aggregateContainsMRIForNZ(Writer writer) throws IOException {
405 // working with the WebSites collection, not WebPages collection!
406 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
407
408
409 //String isMRI_filter =
410
411 Bson orQuery = or(
412 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
413 BasicDBObject.parse("{domain: /\\.nz/}")
414 );
415 Bson andQuery = and(
416 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
417 orQuery);
418
419 // Hopefully the lambda expression (forEach()) at end means
420 // we write out each result Document as we get it
421 collection.aggregate(Arrays.asList(
422 match(andQuery),
423 unwind("$geoLocationCountryCode"),
424 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
425 sort(BasicDBObject.parse("{count : -1}"))
426 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
427
428 // should only have one doc for NZ since it's a count by geolocation.
429
430 return;
431 }
432
433 /**
434 * The aggregate() we want to run this time:
435 *
436 db.Websites.aggregate([
437 {
438 $match: {
439 $and: [
440 {geoLocationCountryCode: {$ne: "NZ"}},
441 {domain: {$not: /\.nz/}},
442 {numPagesContainingMRI: {$gt: 0}},
443 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
444 ]
445 }
446 },
447 { $unwind: "$geoLocationCountryCode" },
448 {
449 $group: {
450 _id: {$toLower: '$geoLocationCountryCode'},
451 count: { $sum: 1 },
452 domain: { $addToSet: '$domain' }
453 }
454 },
455 { $sort : { count : -1} }
456 ]);
457 */
458 public void aggregateContainsMRIForOverseas(Writer writer) throws UncheckedIOException {
459 // working with the WebSites collection, not WebPages collection!
460 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
461
462
463 Bson orQuery = or(
464 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
465 BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
466 );
467 Bson andQuery = and(
468 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
469 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
470 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
471 orQuery);
472
473
474 collection.aggregate(Arrays.asList(
475 match(andQuery), //match(BasicDBObject.parse(matchQuery))
476 // match((List<DBObject>)JSON.parse(matchQuery)),
477 unwind("$geoLocationCountryCode"),
478 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
479 sort(BasicDBObject.parse("{count : -1}"))
480 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
481
482 // casting to Block<Document> necessary because otherwise we see the error at
483 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
484
485 // Less efficient way is to keep all the results in memory and then
486 // write them out one at a time
487 /*
488 AggregateIterable<Document> output
489 = collection.aggregate(Arrays.asList(
490 match(andQuery), //match(BasicDBObject.parse(matchQuery))
491 // match((List<DBObject>)JSON.parse(matchQuery)),
492 unwind("$geoLocationCountryCode"),
493 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
494 sort(BasicDBObject.parse("{count : -1}"))
495 ));
496
497
498 for (Document doc : output) {
499 //System.out.println(doc);
500 System.out.println(doc.toJson());
501
502 }
503 */
504 return;
505 }
506
507 /**
508 * called by lambda forEach() call on Document objects to write them out to a file.
509 * Have to deal with unreported exceptions here that can't be dealt with when doing
510 * the actual forEach(). See
511 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
512 */
513
514 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
515 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
516 // Can't control json output to add newlines after each array element,
517 // no matter which JsonMode is used.
518
519 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
520 // Still can't control array element output,
521 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
522 //JsonWriterSettings writeSettings = new JsonWriterSettings();
523 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
524 //writer.write(doc.toJson(writeSettings) + NEWLINE);
525
526 // Not the JsonWriter of mongodb java driver:
527 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
528
529 // Have to use gson's pretty print to produce a json string that contains
530 // newlines after every array element in the json:
531 String jsonStr = prettyPrintJson(doc.toJson());
532 System.err.println(jsonStr);
533 try {
534 writer.write(jsonStr + NEWLINE);
535 } catch (IOException ex) {
536 //throw ex;
537 throw new UncheckedIOException(ex);
538 }
539 }
540 public String prettyPrintJson(String jsonStr) {
541 Gson gson = new GsonBuilder().setPrettyPrinting().create();
542 JsonParser jp = new JsonParser();
543 JsonElement je = jp.parse(jsonStr);
544 String prettyJsonString = gson.toJson(je);
545 return prettyJsonString;
546 }
547
548
549 public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {
550
551 // should only have one doc
552 for (Document doc : output) {
553 //System.out.println(doc);
554 System.out.println(doc.toJson());
555 }
556 }
557
558
559 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
560 public void close() {}
561
562
// TODO:
// Need to ensure that the database has the collections (the equivalent of tables
// in an RDBMS) websites and webpages, creating them if they don't exist yet.
// The webpages collection will have sentences embedded, based on my decisions from
// reading the series
// https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
// Then need functions:
// insertWebsiteDocument()
// insertWebpageDocument()
572
573 public static void main(String args[]) {
574 try {
575 MongoDBAccess mongodbCon = new MongoDBAccess();
576 mongodbCon.connectToDB();
577 mongodbCon.showCollections();
578
579 } catch(Exception e) {
580 e.printStackTrace();
581 }
582 }
583}
Note: See TracBrowser for help on using the repository browser.