package org.greenstone.atea; //import org.bson.BSONObject; import com.mongodb.client.AggregateIterable; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; //import com.mongodb.client.MongoIterable; // to use collection.find() filters like eq(), regex() etc import static com.mongodb.client.model.Filters.*; // to use collection.find().projection() filters like include() etc import static com.mongodb.client.model.Projections.*; // to use aggregation functions like unwind(), match(), sort() etc import static com.mongodb.client.model.Aggregates.*; // to use functions like sum() and addToSet() within aggregation functions import static com.mongodb.client.model.Accumulators.*; //import org.bson.conversions.Bson; import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; import com.mongodb.MongoCredential; import com.mongodb.ServerAddress; import com.mongodb.MongoClientOptions; import com.mongodb.Block; import org.bson.Document; import org.bson.conversions.Bson; import org.bson.json.JsonMode; import org.bson.json.JsonWriterSettings; import com.mongodb.util.JSON; //import com.mongodb.DBObject; import com.google.gson.*; // for pretty printing import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.UncheckedIOException; import java.io.Writer; import java.util.Arrays; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.regex.Pattern; import org.apache.log4j.Logger; import org.greenstone.atea.morphia.*; import dev.morphia.*; /** * https://www.tutorialspoint.com/mongodb/mongodb_java.htm * * TO COMPILE: * maori-lang-detection/src$ * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java * * TO RUN: * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess * * Manually connecting to mongodb from client: * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p * Then after connecting with pwd, type: * use DBNAME * * Or connect to mongodb and specify db in one statement: * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p * * Some links: * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection") * - https://tecadmin.net/tutorial/mongodb/drop-collection/ * IMPORTANT LINK: * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1 * */ public class MongoDBAccess implements AutoCloseable { private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); static final String PROPS_FILENAME = "config.properties"; public static final String WEBPAGES_COLLECTION = "Webpages"; public static final String WEBSITES_COLLECTION = "Websites"; public static final String NEWLINE = System.getProperty("line.separator"); /** mongodb filter types to execute */ public static final int IS_MRI = 0; public static final int CONTAINS_MRI = 1; /** Some reused fieldnames in the Websites collection */ private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI"; private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI"; // configuration details, some with fallback values private String HOST = "localhost"; private int PORT = 27017; // mongodb port private String USERNAME; private String PASSWORD; private String DB_NAME ="ateacrawldata"; private MongoClient mongo = null; private MongoDatabase database = null; /** * Mongodb Client handle via morphia, which handles the ODM (object document mapper) * for MongoDB */ public Datastore datastore = null; public MongoDBAccess() throws Exception { boolean success = false; // Read in the username and password from our props file Properties props = new Properties(); //File propsFile = new File(PROPS_FILENAME); //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath()); try { props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME)); } catch(Exception e) { logger.error(e); } USERNAME = props.getProperty("mongodb.user", ""); if(USERNAME.equals("")) { USERNAME = "root"; logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME); } PASSWORD = props.getProperty("mongodb.pwd"); logger.debug("Got pwd: " + PASSWORD); if(PASSWORD != null && PASSWORD.equals("CHANGEME")) { success = false; throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME); } HOST = props.getProperty("mongodb.host", HOST); String port = props.getProperty("mongodb.port", Integer.toString(PORT)); PORT = Integer.parseInt(port); DB_NAME = props.getProperty("mongodb.dbname", DB_NAME); logger.info("Connecting to mongodb with:"); logger.info(" - host: " + HOST); logger.info(" - port: " + PORT); logger.info(" - user: " + USERNAME); logger.info(" - db name: " + DB_NAME); } /** * Since we have only a single MongoClient, don't need to call close/disconnect on it as per * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ public void connectToDB() throws Exception { // Creating a Mongo client mongo = new MongoClient( HOST, PORT ); // Creating Credentials MongoCredential credential; credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray()); System.out.println("Connected to the database successfully"); // Accessing the database this.database = mongo.getDatabase(DB_NAME); logger.info("Credentials: "+ credential); /* MongoCredential credential; credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray()); logger.info("Credentials: "+ credential); // Create our Mongo client mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build()); System.out.println("Connected to the database successfully"); this.database = mongo.getDatabase(DB_NAME); */ Morphia morphia = new Morphia(); morphia.mapPackage("com.greenstone.atea.morphia"); datastore = morphia.createDatastore(mongo, DB_NAME); datastore.ensureIndexes(); } // TODO: which fields should be indexed? public void showCollections() { //MongoIterable colls = this.database.listCollectionNames(); for(String coll : this.database.listCollectionNames()) { System.err.println("coll: " + coll); } } /* public void insertWebsiteInfo(WebsiteInfo website) { MongoCollection collection = this.database.getCollection(WEBSITES_COLLECTION); Document document = new Document("_id", website.id) .append("siteFolderName", website.siteFolderName) .append("domain", website.domain) .append("totalPages", website.totalPages) .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) .append("numPagesInMRI", website.numPagesInMRI) .append("siteCrawledTimestamp", website.siteCrawledTimestamp) .append("siteCrawlUnfinished", website.siteCrawlUnfinished) .append("redoCrawl", website.redoCrawl); document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath); if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) { document.put("countryCode", website.geoLocationCountryCode); } collection.insertOne(document); logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")" + " inserted successfully into " + WEBSITES_COLLECTION); } */ /** * Inserts a web page into the mongodb. Besides page related metadata and full body text * the language information per sentence and per 2 adjacent sentences also get stored * into the mongodb. */ /* public void insertWebpageInfo(WebpageInfo webpage) { int mri_sentence_count = 0; // load the webpages db 'table' // in mongodb, the equivalent of db tables are called 'collections' MongoCollection collection = this.database.getCollection(WEBPAGES_COLLECTION); Document document = new Document("_id", webpage.webpageID) .append("siteid", webpage.websiteID) .append("url", webpage.URL) .append("isMRI", webpage.isMRI) .append("totalSentences", webpage.totalSentences) .append("charEncoding", webpage.charEncoding) .append("modTime", webpage.modifiedTime) .append("fetchTime", webpage.fetchTime); // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java List sentencesList = new ArrayList<>(); for(SentenceInfo sentenceInfo : webpage.singleSentences) { BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); bsonRecord.put("confidence", sentenceInfo.confidenceLevel); bsonRecord.put("sentence", sentenceInfo.sentence); sentencesList.add(bsonRecord); if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { mri_sentence_count++; } } document.put("singleSentences", sentencesList); List overlappingSentencesList = new ArrayList<>(); for(SentenceInfo sentenceInfo : webpage.overlappingSentences) { BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); bsonRecord.put("confidence", sentenceInfo.confidenceLevel); bsonRecord.put("sentence", sentenceInfo.sentence); overlappingSentencesList.add(bsonRecord); } document.put("overlappingSentences", overlappingSentencesList); // also put the full text in there document.put("text", webpage.text); // also store the count of sentences in MRI webpage.setMRISentenceCount(mri_sentence_count); document.put("mriSentenceCount", mri_sentence_count); collection.insertOne(document); logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION); } */ public ArrayList queryAllMatchingIsMRIURLs(String domain) { return queryAllMatchingURLsFilteredBy(domain, IS_MRI); } public ArrayList queryAllMatchingcontainsMRIURLs(String domain) { return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI); } /** * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/ * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find * * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ */ public ArrayList queryAllMatchingURLsFilteredBy(String domain, int filterType) { final ArrayList urlsList = new ArrayList(); // remove any http(s)://(www.) from the start of URL first // since it goes into a regex domain = Utility.stripProtocolAndWWWFromURL(domain); // load the "webpages" db table // in mongodb, the equivalent of db tables are called 'collections' MongoCollection collection = this.database.getCollection(WEBPAGES_COLLECTION); // code we'll execute in Iterable.forEach() below // see also https://www.baeldung.com/foreach-java Block storeURL = new Block() { @Override public void apply(final Document document) { //System.out.println(document.toJson()); String url = document.getString("URL"); // add to our urlsList //System.out.println(url); urlsList.add(url); } }; // Run the following mongodb query: // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) // 1. One way that works: //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); // 2. Another way: //String query = "{URL: /DOMAIN/, isMRI: true}"; String query = "{URL: /DOMAIN/, "; if(filterType == IS_MRI) { query += "isMRI: true}"; } else if(filterType == CONTAINS_MRI) { query += "containsMRI: true}"; } domain = domain.replace(".", "\\."); // escape dots in domain for regex query = query.replace("DOMAIN", domain); //System.err.println("Executing find query: " + query); BasicDBObject findObj = BasicDBObject.parse(query); BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); collection.find(findObj).projection(projectionObj).forEach(storeURL); return urlsList; } /** * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: * * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo * * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates * On using group(TExpression) inside collection.aggregate(). * * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java * and https://www.javatpoint.com/java-8-foreach * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java * * * The mongodb aggregate() we want to run this time: * db.Websites.aggregate([ { $match: { $and: [ {numPagesContainingMRI: {$gt: 0}}, {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} ] } }, { $unwind: "$geoLocationCountryCode" }, { $group: { _id: "nz", count: { $sum: 1 }, domain: { $addToSet: '$domain' } } }, { $sort : { count : -1} } ]); */ public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException { // working with the WebSites collection, not WebPages collection! MongoCollection collection = this.database.getCollection(WEBSITES_COLLECTION); String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; Bson orQuery = or( BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), BasicDBObject.parse("{domain: /\\.nz/}") ); Bson andQuery = and( BasicDBObject.parse(mriFilterString), orQuery); // Hopefully the lambda expression (forEach()) at end means // we write out each result Document as we get it collection.aggregate(Arrays.asList( match(andQuery), unwind("$geoLocationCountryCode"), group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), sort(BasicDBObject.parse("{count : -1}")) )).forEach((Block)doc -> writeDoc(doc, writer)); // should only have one doc for NZ since it's a count by geolocation. return; } /** * The aggregate() we want to run this time: * db.Websites.aggregate([ { $match: { $and: [ {geoLocationCountryCode: {$ne: "NZ"}}, {domain: {$not: /\.nz/}}, {numPagesContainingMRI: {$gt: 0}}, {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} ] } }, { $unwind: "$geoLocationCountryCode" }, { $group: { _id: {$toLower: '$geoLocationCountryCode'}, count: { $sum: 1 }, domain: { $addToSet: '$domain' } } }, { $sort : { count : -1} } ]); */ public void aggregateContainsMRIForOverseas(Writer writer, int filterType, boolean isMiInURLPath) throws UncheckedIOException { // working with the WebSites collection, not WebPages collection! MongoCollection collection = this.database.getCollection(WEBSITES_COLLECTION); String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; Bson orQuery = or( BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}") // e.g. "{urlContainsLangCodeInPath: false}" ); Bson andQuery = and( BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), BasicDBObject.parse(mriFilterString), orQuery); collection.aggregate(Arrays.asList( match(andQuery), //match(BasicDBObject.parse(matchQuery)) // match((List)JSON.parse(matchQuery)), unwind("$geoLocationCountryCode"), group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), sort(BasicDBObject.parse("{count : -1}")) )).forEach((Block)doc -> writeDoc(doc, writer)); // casting to Block necessary because otherwise we see the error at // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java // Less efficient way is to keep all the results in memory and then // write them out one at a time /* AggregateIterable output = collection.aggregate(Arrays.asList( match(andQuery), //match(BasicDBObject.parse(matchQuery)) // match((List)JSON.parse(matchQuery)), unwind("$geoLocationCountryCode"), group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), sort(BasicDBObject.parse("{count : -1}")) )); for (Document doc : output) { //System.out.println(doc); System.out.println(doc.toJson()); } */ return; } /** * called by lambda forEach() call on Document objects to write them out to a file. * Have to deal with unreported exceptions here that can't be dealt with when doing * the actual forEach(). See * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach */ public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); // Can't control json output to add newlines after each array element, // no matter which JsonMode is used. // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html // Still can't control array element output, // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too: //JsonWriterSettings writeSettings = new JsonWriterSettings(); //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); //writer.write(doc.toJson(writeSettings) + NEWLINE); // Not the JsonWriter of mongodb java driver: // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line // Have to use gson's pretty print to produce a json string that contains // newlines after every array element in the json: String jsonStr = prettyPrintJson(doc.toJson()); //System.err.println(jsonStr); try { writer.write(jsonStr + NEWLINE); } catch (IOException ex) { //throw ex; throw new UncheckedIOException(ex); } } public String prettyPrintJson(String jsonStr) { Gson gson = new GsonBuilder().setPrettyPrinting().create(); JsonParser jp = new JsonParser(); JsonElement je = jp.parse(jsonStr); String prettyJsonString = gson.toJson(je); return prettyJsonString; } /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ public void close() {} // TODO: // In the database, need to ensure we have else // create collection (table in RDBMS) websites, create collection webpages. // The webpages collection will have sentences embedded based on my decisions from // reading the series // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1 // Then need functions: // insertWebsiteDocument() // insertWebpageDocument() public static void main(String args[]) { try { MongoDBAccess mongodbCon = new MongoDBAccess(); mongodbCon.connectToDB(); mongodbCon.showCollections(); } catch(Exception e) { e.printStackTrace(); } } }