source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33882

Last change on this file since 33882 was 33882, checked in by ak19, 4 years ago

Code now writes both a listing of all non-autotranslated websites and a listing of overseas autotranslated sites.

File size: 21.1 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7import com.mongodb.client.MongoDatabase;
8//import com.mongodb.client.MongoIterable;
9
10// to use collection.find() filters like eq(), regex() etc
11import static com.mongodb.client.model.Filters.*;
12// to use collection.find().projection() filters like include() etc
13import static com.mongodb.client.model.Projections.*;
14// to use aggregation functions like unwind(), match(), sort() etc
15import static com.mongodb.client.model.Aggregates.*;
16// to use functions like sum() and addToSet() within aggregation functions
17import static com.mongodb.client.model.Accumulators.*;
18
19
20//import org.bson.conversions.Bson;
21import com.mongodb.BasicDBObject;
22import com.mongodb.MongoClient;
23import com.mongodb.MongoCredential;
24import com.mongodb.ServerAddress;
25import com.mongodb.MongoClientOptions;
26
27import com.mongodb.Block;
28
29import org.bson.Document;
30import org.bson.conversions.Bson;
31import org.bson.json.JsonMode;
32import org.bson.json.JsonWriterSettings;
33
34import com.mongodb.util.JSON;
35//import com.mongodb.DBObject;
36
37
38import com.google.gson.*; // for pretty printing
39
40import java.io.BufferedReader;
41import java.io.File;
42import java.io.FileReader;
43import java.io.IOException;
44import java.io.UncheckedIOException;
45import java.io.Writer;
46
47import java.util.Arrays;
48import java.util.ArrayList;
49import java.util.List;
50import java.util.Properties;
51import java.util.regex.Pattern;
52
53import org.apache.log4j.Logger;
54
55import org.greenstone.atea.morphia.*;
56import dev.morphia.*;
57
58/**
59 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
60 *
61 * TO COMPILE:
62 * maori-lang-detection/src$
63 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
64 *
65 * TO RUN:
66 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
67 *
68 * Manually connecting to mongodb from client:
69 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
70 * Then after connecting with pwd, type:
71 * use DBNAME
72 *
73 * Or connect to mongodb and specify db in one statement:
74 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
75 *
76 * Some links:
77 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
78 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
79 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
80 * IMPORTANT LINK:
81 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
82 *
83 */
84public class MongoDBAccess implements AutoCloseable {
85
86 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
87
88 static final String PROPS_FILENAME = "config.properties";
89 public static final String WEBPAGES_COLLECTION = "Webpages";
90 public static final String WEBSITES_COLLECTION = "Websites";
91
92 public static final String NEWLINE = System.getProperty("line.separator");
93
94 /** mongodb filter types to execute */
95 public static final int IS_MRI = 0;
96 public static final int CONTAINS_MRI = 1;
97
98 /** Some reused fieldnames in the Websites collection */
99 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
100 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
101
102 // configuration details, some with fallback values
103 private String HOST = "localhost";
104 private int PORT = 27017; // mongodb port
105 private String USERNAME;
106 private String PASSWORD;
107 private String DB_NAME ="ateacrawldata";
108
109 private MongoClient mongo = null;
110 private MongoDatabase database = null;
111
112 /**
113 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
114 * for MongoDB
115 */
116 public Datastore datastore = null;
117
118 public MongoDBAccess() throws Exception {
119 boolean success = false;
120
121 // Read in the username and password from our props file
122 Properties props = new Properties();
123
124 //File propsFile = new File(PROPS_FILENAME);
125 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
126 try {
127 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
128 } catch(Exception e) {
129 logger.error(e);
130 }
131
132
133 USERNAME = props.getProperty("mongodb.user", "");
134 if(USERNAME.equals("")) {
135 USERNAME = "root";
136 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
137 }
138 PASSWORD = props.getProperty("mongodb.pwd");
139
140 logger.debug("Got pwd: " + PASSWORD);
141
142 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
143
144 success = false;
145 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
146 }
147
148 HOST = props.getProperty("mongodb.host", HOST);
149 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
150 PORT = Integer.parseInt(port);
151 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
152
153 logger.info("Connecting to mongodb with:");
154 logger.info(" - host: " + HOST);
155 logger.info(" - port: " + PORT);
156 logger.info(" - user: " + USERNAME);
157 logger.info(" - db name: " + DB_NAME);
158 }
159
160 /**
161 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
162 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
163 */
164 public void connectToDB() throws Exception {
165
166 // Creating a Mongo client
167 mongo = new MongoClient( HOST, PORT );
168
169 // Creating Credentials
170 MongoCredential credential;
171 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
172 System.out.println("Connected to the database successfully");
173
174 // Accessing the database
175 this.database = mongo.getDatabase(DB_NAME);
176 logger.info("Credentials: "+ credential);
177
178 /*
179 MongoCredential credential;
180 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
181 logger.info("Credentials: "+ credential);
182
183 // Create our Mongo client
184 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
185 System.out.println("Connected to the database successfully");
186
187 this.database = mongo.getDatabase(DB_NAME);
188 */
189
190 Morphia morphia = new Morphia();
191 morphia.mapPackage("com.greenstone.atea.morphia");
192 datastore = morphia.createDatastore(mongo, DB_NAME);
193 datastore.ensureIndexes();
194
195 }
196
197 // TODO: which fields should be indexed?
198
199 public void showCollections() {
200 //MongoIterable<String> colls = this.database.listCollectionNames();
201 for(String coll : this.database.listCollectionNames()) {
202 System.err.println("coll: " + coll);
203 }
204 }
205
206 /*
207 public void insertWebsiteInfo(WebsiteInfo website)
208 {
209 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
210 Document document = new Document("_id", website.id)
211 .append("siteFolderName", website.siteFolderName)
212 .append("domain", website.domain)
213 .append("totalPages", website.totalPages)
214 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
215 .append("numPagesInMRI", website.numPagesInMRI)
216 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
217 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
218 .append("redoCrawl", website.redoCrawl);
219
220 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
221 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
222 document.put("countryCode", website.geoLocationCountryCode);
223 }
224
225 collection.insertOne(document);
226 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
227 + " inserted successfully into " + WEBSITES_COLLECTION);
228 }
229 */
230
231 /**
232 * Inserts a web page into the mongodb. Besides page related metadata and full body text
233 * the language information per sentence and per 2 adjacent sentences also get stored
234 * into the mongodb.
235 */
236 /*
237 public void insertWebpageInfo(WebpageInfo webpage)
238 {
239 int mri_sentence_count = 0;
240
241 // load the webpages db 'table'
242 // in mongodb, the equivalent of db tables are called 'collections'
243 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
244
245 Document document = new Document("_id", webpage.webpageID)
246 .append("siteid", webpage.websiteID)
247 .append("url", webpage.URL)
248 .append("isMRI", webpage.isMRI)
249 .append("totalSentences", webpage.totalSentences)
250 .append("charEncoding", webpage.charEncoding)
251 .append("modTime", webpage.modifiedTime)
252 .append("fetchTime", webpage.fetchTime);
253
254 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
255 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
256 List<BasicDBObject> sentencesList = new ArrayList<>();
257 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
258
259 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
260
261 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
262 bsonRecord.put("sentence", sentenceInfo.sentence);
263
264 sentencesList.add(bsonRecord);
265
266 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
267 mri_sentence_count++;
268 }
269
270 }
271 document.put("singleSentences", sentencesList);
272
273 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
274 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
275
276 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
277 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
278 bsonRecord.put("sentence", sentenceInfo.sentence);
279
280 overlappingSentencesList.add(bsonRecord);
281 }
282 document.put("overlappingSentences", overlappingSentencesList);
283
284 // also put the full text in there
285 document.put("text", webpage.text);
286
287 // also store the count of sentences in MRI
288 webpage.setMRISentenceCount(mri_sentence_count);
289 document.put("mriSentenceCount", mri_sentence_count);
290
291
292 collection.insertOne(document);
293 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
294 }
295 */
296
297 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
298 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
299 }
300 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
301 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
302 }
303
304 /**
305 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
306 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
307 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
308 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
309 *
310 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
311 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
312 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
313 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
314 */
315 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
316
317 final ArrayList<String> urlsList = new ArrayList<String>();
318
319 // remove any http(s)://(www.) from the start of URL first
320 // since it goes into a regex
321 domain = Utility.stripProtocolAndWWWFromURL(domain);
322
323 // load the "webpages" db table
324 // in mongodb, the equivalent of db tables are called 'collections'
325 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
326
327 // code we'll execute in Iterable.forEach() below
328 // see also https://www.baeldung.com/foreach-java
329 Block<Document> storeURL = new Block<Document>() {
330 @Override
331 public void apply(final Document document) {
332 //System.out.println(document.toJson());
333 String url = document.getString("URL");
334 // add to our urlsList
335 //System.out.println(url);
336 urlsList.add(url);
337 }
338 };
339
340
341 // Run the following mongodb query:
342 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
343
344 // 1. One way that works:
345 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
346
347 // 2. Another way:
348 //String query = "{URL: /DOMAIN/, isMRI: true}";
349 String query = "{URL: /DOMAIN/, ";
350 if(filterType == IS_MRI) {
351 query += "isMRI: true}";
352 } else if(filterType == CONTAINS_MRI) {
353 query += "containsMRI: true}";
354 }
355
356 domain = domain.replace(".", "\\."); // escape dots in domain for regex
357 query = query.replace("DOMAIN", domain);
358
359 //System.err.println("Executing find query: " + query);
360
361 BasicDBObject findObj = BasicDBObject.parse(query);
362 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
363
364
365 collection.find(findObj).projection(projectionObj).forEach(storeURL);
366
367 return urlsList;
368 }
369
370 /**
371 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
372 *
373 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
374 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
375 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
376 *
377 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
378 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
379 * On using group(TExpression) inside collection.aggregate().
380 *
381 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
382 * and https://www.javatpoint.com/java-8-foreach
383 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
384 *
385 *
386 * The mongodb aggregate() we want to run this time:
387 *
388 db.Websites.aggregate([
389 {
390 $match: {
391 $and: [
392 {numPagesContainingMRI: {$gt: 0}},
393 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
394 ]
395 }
396 },
397 { $unwind: "$geoLocationCountryCode" },
398 {
399 $group: {
400 _id: "nz",
401 count: { $sum: 1 },
402 domain: { $addToSet: '$domain' }
403 }
404 },
405 { $sort : { count : -1} }
406 ]);
407 */
408 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
409 // working with the WebSites collection, not WebPages collection!
410 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
411
412 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
413
414 Bson orQuery = or(
415 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
416 BasicDBObject.parse("{domain: /\\.nz/}")
417 );
418 Bson andQuery = and(
419 BasicDBObject.parse(mriFilterString),
420 orQuery);
421
422 // Hopefully the lambda expression (forEach()) at end means
423 // we write out each result Document as we get it
424 collection.aggregate(Arrays.asList(
425 match(andQuery),
426 unwind("$geoLocationCountryCode"),
427 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
428 sort(BasicDBObject.parse("{count : -1}"))
429 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
430
431 // should only have one doc for NZ since it's a count by geolocation.
432
433 return;
434 }
435
436 /**
437 * The aggregate() we want to run this time:
438 *
439 db.Websites.aggregate([
440 {
441 $match: {
442 $and: [
443 {geoLocationCountryCode: {$ne: "NZ"}},
444 {domain: {$not: /\.nz/}},
445 {numPagesContainingMRI: {$gt: 0}},
446 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
447 ]
448 }
449 },
450 { $unwind: "$geoLocationCountryCode" },
451 {
452 $group: {
453 _id: {$toLower: '$geoLocationCountryCode'},
454 count: { $sum: 1 },
455 domain: { $addToSet: '$domain' }
456 }
457 },
458 { $sort : { count : -1} }
459 ]);
460 */
461 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
462 boolean isMiInURLPath) throws UncheckedIOException
463 {
464 // working with the WebSites collection, not WebPages collection!
465 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
466
467 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
468
469 Bson orQuery = or(
470 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
471 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
472 // e.g. "{urlContainsLangCodeInPath: false}"
473 );
474 Bson andQuery = and(
475 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
476 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
477 BasicDBObject.parse(mriFilterString),
478 orQuery);
479
480
481 collection.aggregate(Arrays.asList(
482 match(andQuery), //match(BasicDBObject.parse(matchQuery))
483 // match((List<DBObject>)JSON.parse(matchQuery)),
484 unwind("$geoLocationCountryCode"),
485 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
486 sort(BasicDBObject.parse("{count : -1}"))
487 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
488
489 // casting to Block<Document> necessary because otherwise we see the error at
490 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
491
492 // Less efficient way is to keep all the results in memory and then
493 // write them out one at a time
494 /*
495 AggregateIterable<Document> output
496 = collection.aggregate(Arrays.asList(
497 match(andQuery), //match(BasicDBObject.parse(matchQuery))
498 // match((List<DBObject>)JSON.parse(matchQuery)),
499 unwind("$geoLocationCountryCode"),
500 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
501 sort(BasicDBObject.parse("{count : -1}"))
502 ));
503
504
505 for (Document doc : output) {
506 //System.out.println(doc);
507 System.out.println(doc.toJson());
508
509 }
510 */
511 return;
512 }
513
514 /**
515 * called by lambda forEach() call on Document objects to write them out to a file.
516 * Have to deal with unreported exceptions here that can't be dealt with when doing
517 * the actual forEach(). See
518 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
519 */
520
521 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
522 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
523 // Can't control json output to add newlines after each array element,
524 // no matter which JsonMode is used.
525
526 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
527 // Still can't control array element output,
528 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
529 //JsonWriterSettings writeSettings = new JsonWriterSettings();
530 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
531 //writer.write(doc.toJson(writeSettings) + NEWLINE);
532
533 // Not the JsonWriter of mongodb java driver:
534 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
535
536 // Have to use gson's pretty print to produce a json string that contains
537 // newlines after every array element in the json:
538 String jsonStr = prettyPrintJson(doc.toJson());
539 System.err.println(jsonStr);
540 try {
541 writer.write(jsonStr + NEWLINE);
542 } catch (IOException ex) {
543 //throw ex;
544 throw new UncheckedIOException(ex);
545 }
546 }
547 public String prettyPrintJson(String jsonStr) {
548 Gson gson = new GsonBuilder().setPrettyPrinting().create();
549 JsonParser jp = new JsonParser();
550 JsonElement je = jp.parse(jsonStr);
551 String prettyJsonString = gson.toJson(je);
552 return prettyJsonString;
553 }
554
555
556 public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {
557
558 // should only have one doc
559 for (Document doc : output) {
560 //System.out.println(doc);
561 System.out.println(doc.toJson());
562 }
563 }
564
565
566 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
567 public void close() {}
568
569
570 // TODO:
571 // In the database, need to ensure we have else
572 // create collection (table in RDBMS) websites, create collection webpages.
573 // The webpages collection will have sentences embedded based on my decisions from
574 // reading the series
575 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
576 // Then need functions:
577 // insertWebsiteDocument()
578 // insertWebpageDocument()
579
580 public static void main(String args[]) {
581 try {
582 MongoDBAccess mongodbCon = new MongoDBAccess();
583 mongodbCon.connectToDB();
584 mongodbCon.showCollections();
585
586 } catch(Exception e) {
587 e.printStackTrace();
588 }
589 }
590}
Note: See TracBrowser for help on using the repository browser.