source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33879

Last change on this file since 33879 was 33879, checked in by ak19, 4 years ago

Have the 2 mongodb aggregate() calls working that

File size: 17.6 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7import com.mongodb.client.MongoDatabase;
8//import com.mongodb.client.MongoIterable;
9
10// to use collection.find() filters like eq(), regex() etc
11import static com.mongodb.client.model.Filters.*;
12// to use collection.find().projection() filters like include() etc
13import static com.mongodb.client.model.Projections.*;
14// to use aggregation functions like unwind(), match(), sort() etc
15import static com.mongodb.client.model.Aggregates.*;
16// to use functions like sum() and addToSet() within aggregation functions
17import static com.mongodb.client.model.Accumulators.*;
18
19//import org.bson.conversions.Bson;
20import com.mongodb.BasicDBObject;
21import com.mongodb.MongoClient;
22import com.mongodb.MongoCredential;
23import com.mongodb.ServerAddress;
24import com.mongodb.MongoClientOptions;
25
26import com.mongodb.Block;
27
28import org.bson.Document;
29import org.bson.conversions.Bson;
30
31import com.mongodb.util.JSON;
32//import com.mongodb.DBObject;
33
34import java.io.BufferedReader;
35import java.io.File;
36import java.io.FileReader;
37import java.util.Arrays;
38import java.util.ArrayList;
39import java.util.List;
40import java.util.Properties;
41import java.util.regex.Pattern;
42
43import org.apache.log4j.Logger;
44
45import org.greenstone.atea.morphia.*;
46import dev.morphia.*;
47
48/**
49 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
50 *
51 * TO COMPILE:
52 * maori-lang-detection/src$
53 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
54 *
55 * TO RUN:
56 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
57 *
58 * Manually connecting to mongodb from client:
59 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
60 * Then after connecting with pwd, type:
61 * use DBNAME
62 *
63 * Or connect to mongodb and specify db in one statement:
64 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
65 *
66 * Some links:
67 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
68 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
69 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
70 * IMPORTANT LINK:
71 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
72 *
73 */
74public class MongoDBAccess implements AutoCloseable {
75
76 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
77
78 static final String PROPS_FILENAME = "config.properties";
79 public static final String WEBPAGES_COLLECTION = "Webpages";
80 public static final String WEBSITES_COLLECTION = "Websites";
81
82 /** mongodb filter types to execute */
83 public static final int IS_MRI = 0;
84 public static final int CONTAINS_MRI = 1;
85
86 // configuration details, some with fallback values
87 private String HOST = "localhost";
88 private int PORT = 27017; // mongodb port
89 private String USERNAME;
90 private String PASSWORD;
91 private String DB_NAME ="ateacrawldata";
92
93 private MongoClient mongo = null;
94 private MongoDatabase database = null;
95
96 /**
97 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
98 * for MongoDB
99 */
100 public Datastore datastore = null;
101
102 public MongoDBAccess() throws Exception {
103 boolean success = false;
104
105 // Read in the username and password from our props file
106 Properties props = new Properties();
107
108 //File propsFile = new File(PROPS_FILENAME);
109 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
110 try {
111 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
112 } catch(Exception e) {
113 logger.error(e);
114 }
115
116
117 USERNAME = props.getProperty("mongodb.user", "");
118 if(USERNAME.equals("")) {
119 USERNAME = "root";
120 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
121 }
122 PASSWORD = props.getProperty("mongodb.pwd");
123
124 logger.debug("Got pwd: " + PASSWORD);
125
126 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
127
128 success = false;
129 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
130 }
131
132 HOST = props.getProperty("mongodb.host", HOST);
133 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
134 PORT = Integer.parseInt(port);
135 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
136
137 logger.info("Connecting to mongodb with:");
138 logger.info(" - host: " + HOST);
139 logger.info(" - port: " + PORT);
140 logger.info(" - user: " + USERNAME);
141 logger.info(" - db name: " + DB_NAME);
142 }
143
144 /**
145 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
146 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
147 */
148 public void connectToDB() throws Exception {
149
150 // Creating a Mongo client
151 mongo = new MongoClient( HOST, PORT );
152
153 // Creating Credentials
154 MongoCredential credential;
155 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
156 System.out.println("Connected to the database successfully");
157
158 // Accessing the database
159 this.database = mongo.getDatabase(DB_NAME);
160 logger.info("Credentials: "+ credential);
161
162 /*
163 MongoCredential credential;
164 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
165 logger.info("Credentials: "+ credential);
166
167 // Create our Mongo client
168 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
169 System.out.println("Connected to the database successfully");
170
171 this.database = mongo.getDatabase(DB_NAME);
172 */
173
174 Morphia morphia = new Morphia();
175 morphia.mapPackage("com.greenstone.atea.morphia");
176 datastore = morphia.createDatastore(mongo, DB_NAME);
177 datastore.ensureIndexes();
178
179 }
180
181 // TODO: which fields should be indexed?
182
183 public void showCollections() {
184 //MongoIterable<String> colls = this.database.listCollectionNames();
185 for(String coll : this.database.listCollectionNames()) {
186 System.err.println("coll: " + coll);
187 }
188 }
189
190 /*
191 public void insertWebsiteInfo(WebsiteInfo website)
192 {
193 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
194 Document document = new Document("_id", website.id)
195 .append("siteFolderName", website.siteFolderName)
196 .append("domain", website.domain)
197 .append("totalPages", website.totalPages)
198 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
199 .append("numPagesInMRI", website.numPagesInMRI)
200 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
201 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
202 .append("redoCrawl", website.redoCrawl);
203
204 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
205 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
206 document.put("countryCode", website.geoLocationCountryCode);
207 }
208
209 collection.insertOne(document);
210 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
211 + " inserted successfully into " + WEBSITES_COLLECTION);
212 }
213 */
214
215 /**
216 * Inserts a web page into the mongodb. Besides page related metadata and full body text
217 * the language information per sentence and per 2 adjacent sentences also get stored
218 * into the mongodb.
219 */
220 /*
221 public void insertWebpageInfo(WebpageInfo webpage)
222 {
223 int mri_sentence_count = 0;
224
225 // load the webpages db 'table'
226 // in mongodb, the equivalent of db tables are called 'collections'
227 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
228
229 Document document = new Document("_id", webpage.webpageID)
230 .append("siteid", webpage.websiteID)
231 .append("url", webpage.URL)
232 .append("isMRI", webpage.isMRI)
233 .append("totalSentences", webpage.totalSentences)
234 .append("charEncoding", webpage.charEncoding)
235 .append("modTime", webpage.modifiedTime)
236 .append("fetchTime", webpage.fetchTime);
237
238 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
239 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
240 List<BasicDBObject> sentencesList = new ArrayList<>();
241 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
242
243 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
244
245 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
246 bsonRecord.put("sentence", sentenceInfo.sentence);
247
248 sentencesList.add(bsonRecord);
249
250 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
251 mri_sentence_count++;
252 }
253
254 }
255 document.put("singleSentences", sentencesList);
256
257 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
258 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
259
260 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
261 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
262 bsonRecord.put("sentence", sentenceInfo.sentence);
263
264 overlappingSentencesList.add(bsonRecord);
265 }
266 document.put("overlappingSentences", overlappingSentencesList);
267
268 // also put the full text in there
269 document.put("text", webpage.text);
270
271 // also store the count of sentences in MRI
272 webpage.setMRISentenceCount(mri_sentence_count);
273 document.put("mriSentenceCount", mri_sentence_count);
274
275
276 collection.insertOne(document);
277 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
278 }
279 */
280
281 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
282 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
283 }
284 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
285 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
286 }
287
288 /**
289 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
290 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
291 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
292 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
293 *
294 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
295 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
296 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
297 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
298 */
299 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
300
301 final ArrayList<String> urlsList = new ArrayList<String>();
302
303 // remove any http(s)://(www.) from the start of URL first
304 // since it goes into a regex
305 domain = Utility.stripProtocolAndWWWFromURL(domain);
306
307 // load the "webpages" db table
308 // in mongodb, the equivalent of db tables are called 'collections'
309 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
310
311 // code we'll execute in Iterable.forEach() below
312 Block<Document> storeURL = new Block<Document>() {
313 @Override
314 public void apply(final Document document) {
315 //System.out.println(document.toJson());
316 String url = document.getString("URL");
317 // add to our urlsList
318 //System.out.println(url);
319 urlsList.add(url);
320 }
321 };
322
323
324 // Run the following mongodb query:
325 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
326
327 // 1. One way that works:
328 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
329
330 // 2. Another way:
331 //String query = "{URL: /DOMAIN/, isMRI: true}";
332 String query = "{URL: /DOMAIN/, ";
333 if(filterType == IS_MRI) {
334 query += "isMRI: true}";
335 } else if(filterType == CONTAINS_MRI) {
336 query += "containsMRI: true}";
337 }
338
339 domain = domain.replace(".", "\\."); // escape dots in domain for regex
340 query = query.replace("DOMAIN", domain);
341
342 //System.err.println("Executing find query: " + query);
343
344 BasicDBObject findObj = BasicDBObject.parse(query);
345 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
346
347
348 collection.find(findObj).projection(projectionObj).forEach(storeURL);
349
350 return urlsList;
351 }
352
353 /**
354 RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
355
356 https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
357 https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
358 Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
359
360 (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
361 https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
362 On using group(TExpression) inside collection.aggregate().
363
364
365 The aggregate() we want to run:
366
367 db.Websites.aggregate([
368 {
369 $match: {
370 $and: [
371 {geoLocationCountryCode: {$ne: "NZ"}},
372 {domain: {$not: /\.nz/}},
373 {numPagesContainingMRI: {$gt: 0}},
374 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
375 ]
376 }
377 },
378 { $unwind: "$geoLocationCountryCode" },
379 {
380 $group: {
381 _id: {$toLower: '$geoLocationCountryCode'},
382 count: { $sum: 1 },
383 domain: { $addToSet: '$domain' }
384 }
385 },
386 { $sort : { count : -1} }
387 ]);
388
389 */
390 public String aggregateContainsMRIForOverseas() {
391 // working with the WebSites collection, not WebPages collection!
392 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
393
394 /*String matchQuery =
395 "$and: ["
396 + "{geoLocationCountryCode: {$ne: \"NZ\"}},"
397 + "{domain: {$not: /\\.nz/}},"
398 + "{numPagesContainingMRI: {$gt: 0}},"
399 + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}"
400 + "]";*/
401
402
403
404
405 Bson orQuery = or(
406 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
407 BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
408 );
409 Bson andQuery = and(
410 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
411 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
412 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
413 orQuery);
414
415 AggregateIterable<Document> output
416 = collection.aggregate(Arrays.asList(
417 match(andQuery), //match(BasicDBObject.parse(matchQuery))
418 // match((List<DBObject>)JSON.parse(matchQuery)),
419 unwind("$geoLocationCountryCode"),
420 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
421 sort(BasicDBObject.parse("{count : -1}"))
422 ));
423
424 for (Document doc : output) {
425 //System.out.println(doc);
426 System.out.println(doc.toJson());
427 }
428
429 return "";
430 }
431
432
433 /**
434 The mongodb aggregate() we want to run this time:
435
436 db.Websites.aggregate([
437 {
438 $match: {
439 $and: [
440 {numPagesContainingMRI: {$gt: 0}},
441 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
442 ]
443 }
444 },
445 { $unwind: "$geoLocationCountryCode" },
446 {
447 $group: {
448 _id: "nz",
449 count: { $sum: 1 },
450 domain: { $addToSet: '$domain' }
451 }
452 },
453 { $sort : { count : -1} }
454 ]);
455 */
456 public String aggregateContainsMRIForNZ() {
457 // working with the WebSites collection, not WebPages collection!
458 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
459
460
461 //String isMRI_filter =
462
463 Bson orQuery = or(
464 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
465 BasicDBObject.parse("{domain: /\\.nz/}")
466 );
467 Bson andQuery = and(
468 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
469 orQuery);
470
471 AggregateIterable<Document> output
472 = collection.aggregate(Arrays.asList(
473 match(andQuery),
474 unwind("$geoLocationCountryCode"),
475 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
476 sort(BasicDBObject.parse("{count : -1}"))
477 ));
478
479 // should only have one doc
480 for (Document doc : output) {
481 //System.out.println(doc);
482 System.out.println(doc.toJson());
483 }
484
485 return "";
486 }
487
488 public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {
489
490 // should only have one doc
491 for (Document doc : output) {
492 //System.out.println(doc);
493 System.out.println(doc.toJson());
494 }
495 }
496
497
498 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
499 public void close() {}
500
501
502 // TODO:
// In the database, we need to ensure that the collections (the equivalent of
// tables in an RDBMS) "websites" and "webpages" exist, creating them if they don't.
// The webpages collection will have sentences embedded, based on my decisions from
506 // reading the series
507 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
508 // Then need functions:
509 // insertWebsiteDocument()
510 // insertWebpageDocument()
511
512 public static void main(String args[]) {
513 try {
514 MongoDBAccess mongodbCon = new MongoDBAccess();
515 mongodbCon.connectToDB();
516 mongodbCon.showCollections();
517
518 } catch(Exception e) {
519 e.printStackTrace();
520 }
521 }
522}
Note: See TracBrowser for help on using the repository browser.