source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33871

Last change on this file since 33871 was 33871, checked in by ak19, 4 years ago

Removed mostly duplicated older version of method but left the different parts commented out inside the new method. Improved regex in mongodb find query.

File size: 12.4 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.MongoCollection;
6import com.mongodb.client.MongoDatabase;
7//import com.mongodb.client.MongoIterable;
8
9// to use collection.find() filters like eq(), regex() etc
10import static com.mongodb.client.model.Filters.*;
11// to use collection.find().projection() filters like include() etc
12import static com.mongodb.client.model.Projections.*;
13
14//import org.bson.conversions.Bson;
15import com.mongodb.BasicDBObject;
16import com.mongodb.MongoClient;
17import com.mongodb.MongoCredential;
18import com.mongodb.ServerAddress;
19import com.mongodb.MongoClientOptions;
20
21import com.mongodb.Block;
22
23import org.bson.Document;
24
25import java.io.BufferedReader;
26import java.io.File;
27import java.io.FileReader;
28import java.util.ArrayList;
29import java.util.List;
30import java.util.Properties;
31import java.util.regex.Pattern;
32
33import org.apache.log4j.Logger;
34
35import org.greenstone.atea.morphia.*;
36import dev.morphia.*;
37
38/**
39 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
40 *
41 * TO COMPILE:
42 * maori-lang-detection/src$
43 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
44 *
45 * TO RUN:
46 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
47 *
48 * Manually connecting to mongodb from client:
49 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
50 * Then after connecting with pwd, type:
51 * use DBNAME
52 *
53 * Or connect to mongodb and specify db in one statement:
54 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
55 *
56 * Some links:
57 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
58 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
59 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
60 * IMPORTANT LINK:
61 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
62 *
63 */
64public class MongoDBAccess implements AutoCloseable {
65
66 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
67
68 static final String PROPS_FILENAME = "config.properties";
69 public static final String WEBPAGES_COLLECTION = "Webpages";
70 public static final String WEBSITES_COLLECTION = "Websites";
71
72 // configuration details, some with fallback values
73 private String HOST = "localhost";
74 private int PORT = 27017; // mongodb port
75 private String USERNAME;
76 private String PASSWORD;
77 private String DB_NAME ="ateacrawldata";
78
79 private MongoClient mongo = null;
80 private MongoDatabase database = null;
81
82 /**
83 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
84 * for MongoDB
85 */
86 public Datastore datastore = null;
87
88 public MongoDBAccess() throws Exception {
89 boolean success = false;
90
91 // Read in the username and password from our props file
92 Properties props = new Properties();
93
94 //File propsFile = new File(PROPS_FILENAME);
95 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
96 try {
97 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
98 } catch(Exception e) {
99 logger.error(e);
100 }
101
102
103 USERNAME = props.getProperty("mongodb.user", "");
104 if(USERNAME.equals("")) {
105 USERNAME = "root";
106 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
107 }
108 PASSWORD = props.getProperty("mongodb.pwd");
109
110 logger.debug("Got pwd: " + PASSWORD);
111
112 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
113
114 success = false;
115 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
116 }
117
118 HOST = props.getProperty("mongodb.host", HOST);
119 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
120 PORT = Integer.parseInt(port);
121 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
122
123 logger.info("Connecting to mongodb with:");
124 logger.info(" - host: " + HOST);
125 logger.info(" - port: " + PORT);
126 logger.info(" - user: " + USERNAME);
127 logger.info(" - db name: " + DB_NAME);
128 }
129
130 /**
131 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
132 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
133 */
134 public void connectToDB() throws Exception {
135
136 // Creating a Mongo client
137 mongo = new MongoClient( HOST, PORT );
138
139 // Creating Credentials
140 MongoCredential credential;
141 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
142 System.out.println("Connected to the database successfully");
143
144 // Accessing the database
145 this.database = mongo.getDatabase(DB_NAME);
146 logger.info("Credentials: "+ credential);
147
148 /*
149 MongoCredential credential;
150 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
151 logger.info("Credentials: "+ credential);
152
153 // Create our Mongo client
154 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
155 System.out.println("Connected to the database successfully");
156
157 this.database = mongo.getDatabase(DB_NAME);
158 */
159
160 Morphia morphia = new Morphia();
161 morphia.mapPackage("com.greenstone.atea.morphia");
162 datastore = morphia.createDatastore(mongo, DB_NAME);
163 datastore.ensureIndexes();
164
165 }
166
167 // TODO: which fields should be indexed?
168
169 public void showCollections() {
170 //MongoIterable<String> colls = this.database.listCollectionNames();
171 for(String coll : this.database.listCollectionNames()) {
172 System.err.println("coll: " + coll);
173 }
174 }
175
176 /*
177 public void insertWebsiteInfo(WebsiteInfo website)
178 {
179 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
180 Document document = new Document("_id", website.id)
181 .append("siteFolderName", website.siteFolderName)
182 .append("domain", website.domain)
183 .append("totalPages", website.totalPages)
184 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
185 .append("numPagesInMRI", website.numPagesInMRI)
186 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
187 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
188 .append("redoCrawl", website.redoCrawl);
189
190 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
191 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
192 document.put("countryCode", website.geoLocationCountryCode);
193 }
194
195 collection.insertOne(document);
196 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
197 + " inserted successfully into " + WEBSITES_COLLECTION);
198 }
199 */
200
201 /**
202 * Inserts a web page into the mongodb. Besides page related metadata and full body text
203 * the language information per sentence and per 2 adjacent sentences also get stored
204 * into the mongodb.
205 */
206 /*
207 public void insertWebpageInfo(WebpageInfo webpage)
208 {
209 int mri_sentence_count = 0;
210
211 // load the webpages db 'table'
212 // in mongodb, the equivalent of db tables are called 'collections'
213 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
214
215 Document document = new Document("_id", webpage.webpageID)
216 .append("siteid", webpage.websiteID)
217 .append("url", webpage.URL)
218 .append("isMRI", webpage.isMRI)
219 .append("totalSentences", webpage.totalSentences)
220 .append("charEncoding", webpage.charEncoding)
221 .append("modTime", webpage.modifiedTime)
222 .append("fetchTime", webpage.fetchTime);
223
224 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
225 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
226 List<BasicDBObject> sentencesList = new ArrayList<>();
227 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
228
229 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
230
231 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
232 bsonRecord.put("sentence", sentenceInfo.sentence);
233
234 sentencesList.add(bsonRecord);
235
236 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
237 mri_sentence_count++;
238 }
239
240 }
241 document.put("singleSentences", sentencesList);
242
243 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
244 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
245
246 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
247 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
248 bsonRecord.put("sentence", sentenceInfo.sentence);
249
250 overlappingSentencesList.add(bsonRecord);
251 }
252 document.put("overlappingSentences", overlappingSentencesList);
253
254 // also put the full text in there
255 document.put("text", webpage.text);
256
257 // also store the count of sentences in MRI
258 webpage.setMRISentenceCount(mri_sentence_count);
259 document.put("mriSentenceCount", mri_sentence_count);
260
261
262 collection.insertOne(document);
263 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
264 }
265 */
266
267 /**
268 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
269 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
270 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
271 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
272 *
273 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
274 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
275 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
276 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
277*/
278 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
279
280 final ArrayList<String> urlsList = new ArrayList<String>();
281
282 // remove any http(s)://(www.) from the start of URL first
283 // since it goes into a regex
284 domain = Utility.stripProtocolAndWWWFromURL(domain);
285
286 // load the "webpages" db table
287 // in mongodb, the equivalent of db tables are called 'collections'
288 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
289
290 // code we'll execute in Iterable.forEach() below
291 Block<Document> storeURL = new Block<Document>() {
292 @Override
293 public void apply(final Document document) {
294 //System.out.println(document.toJson());
295 String url = document.getString("URL");
296 // add to our urlsList
297 //System.out.println(url);
298 urlsList.add(url);
299 }
300 };
301
302
303
304 // Run the following mongodb query:
305 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
306
307 // 1. One way that works:
308 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
309
310 // 2. Another way:
311 String query = "{URL: /DOMAIN/, isMRI: true}";
312 domain = domain.replace(".", "\\."); // escape dots in domain for regex
313 query = query.replace("DOMAIN", domain);
314
315 //System.err.println("Executing find query: " + query);
316
317 BasicDBObject findObj = BasicDBObject.parse(query);
318 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
319
320
321 collection.find(findObj).projection(projectionObj).forEach(storeURL);
322
323 return urlsList;
324 }
325
326
327
328 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
329 public void close() {}
330
331
332 // TODO:
333 // In the database, need to ensure we have else
334 // create collection (table in RDBMS) websites, create collection webpages.
335 // The webpages collection will have sentences embedded based on my decisions from
336 // reading the series
337 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
338 // Then need functions:
339 // insertWebsiteDocument()
340 // insertWebpageDocument()
341
342 public static void main(String args[]) {
343 try {
344 MongoDBAccess mongodbCon = new MongoDBAccess();
345 mongodbCon.connectToDB();
346 mongodbCon.showCollections();
347
348 } catch(Exception e) {
349 e.printStackTrace();
350 }
351 }
352}
Note: See TracBrowser for help on using the repository browser.