source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33653

Last change on this file since 33653 was 33653, checked in by ak19, 4 years ago
  1. As suggested by Dr Bainbridge, made the code changes to use Morphia as ODM for MongoDB (Object Document Mapper, ODM for MongoDB is equivalent to what ORM is to RDBMS). 2. Adding jar files to get this to work. 3. Further changes to store site folder names of form ##### as primary key of Websites collection. However, may in a future commit decide to store a reference to a WebsiteInfo object (representing a JSON document in a Websites MongoDB collection) inside a WebpageInfo object. 4. The MongoDB collections are now called Websites and Webpages, not websites and webpages. 5. geolocation of site now stored as field in Websites mongodb collection. And containsMRI now stored as field in Webpages collection of mongoDB. 6. Tried out some mongodb query commands based on what Dr Bainbridge did yesterday.
File size: 9.5 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.MongoCollection;
6import com.mongodb.client.MongoDatabase;
7//import com.mongodb.client.MongoIterable;
8import com.mongodb.BasicDBObject;
9import com.mongodb.MongoClient;
10import com.mongodb.MongoCredential;
11import com.mongodb.ServerAddress;
12import com.mongodb.MongoClientOptions;
13
14import org.bson.Document;
15
16import java.io.BufferedReader;
17import java.io.File;
18import java.io.FileReader;
19import java.util.ArrayList;
20import java.util.List;
21import java.util.Properties;
22
23
24import org.apache.log4j.Logger;
25
26import org.greenstone.atea.morphia.*;
27import dev.morphia.*;
28
29/**
30 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
31 *
32 * TO COMPILE:
33 * maori-lang-detection/src$
34 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
35 *
36 * TO RUN:
37 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
38 *
39 * Manually connecting to mongodb from client:
40 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
41 * Then after connecting with pwd, type:
42 * use DBNAME
43 *
44 * Or connect to mongodb and specify db in one statement:
45 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
46 *
47 * Some links:
48 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
49 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
50 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
51 * IMPORTANT LINK:
52 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
53 *
54 */
55public class MongoDBAccess implements AutoCloseable {
56
57 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
58
59 static final String PROPS_FILENAME = "config.properties";
60 public static final String WEBPAGES_COLLECTION = "webpages";
61 public static final String WEBSITES_COLLECTION = "websites";
62
63 // configuration details, some with fallback values
64 private String HOST = "localhost";
65 private int PORT = 27017; // mongodb port
66 private String USERNAME;
67 private String PASSWORD;
68 private String DB_NAME ="ateacrawldata";
69
70 private MongoClient mongo = null;
71 private MongoDatabase database = null;
72
73 /**
74 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
75 * for MongoDB
76 */
77 public Datastore datastore = null;
78
79 public MongoDBAccess() throws Exception {
80 boolean success = false;
81
82 // Read in the username and password from our props file
83 Properties props = new Properties();
84
85 //File propsFile = new File(PROPS_FILENAME);
86 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
87 try {
88 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
89 } catch(Exception e) {
90 logger.error(e);
91 }
92
93
94 USERNAME = props.getProperty("mongodb.user", "");
95 if(USERNAME.equals("")) {
96 USERNAME = "root";
97 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
98 }
99 PASSWORD = props.getProperty("mongodb.pwd");
100
101 logger.debug("Got pwd: " + PASSWORD);
102
103 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
104
105 success = false;
106 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
107 }
108
109 HOST = props.getProperty("mongodb.host", HOST);
110 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
111 PORT = Integer.parseInt(port);
112 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
113
114 logger.info("Connecting to mongodb with:");
115 logger.info(" - host: " + HOST);
116 logger.info(" - port: " + PORT);
117 logger.info(" - user: " + USERNAME);
118 logger.info(" - db name: " + DB_NAME);
119 }
120
121 /**
122 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
123 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
124 */
125 public void connectToDB() throws Exception {
126
127 // Creating a Mongo client
128 mongo = new MongoClient( HOST, PORT );
129
130 // Creating Credentials
131 MongoCredential credential;
132 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
133 System.out.println("Connected to the database successfully");
134
135 // Accessing the database
136 this.database = mongo.getDatabase(DB_NAME);
137 logger.info("Credentials: "+ credential);
138
139 /*
140 MongoCredential credential;
141 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
142 logger.info("Credentials: "+ credential);
143
144 // Create our Mongo client
145 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
146 System.out.println("Connected to the database successfully");
147
148 this.database = mongo.getDatabase(DB_NAME);
149 */
150
151 Morphia morphia = new Morphia();
152 morphia.mapPackage("com.greenstone.atea.morphia");
153 datastore = morphia.createDatastore(mongo, DB_NAME);
154 datastore.ensureIndexes();
155
156 }
157
158 // TODO: which fields should be indexed?
159
160 public void showCollections() {
161 //MongoIterable<String> colls = this.database.listCollectionNames();
162 for(String coll : this.database.listCollectionNames()) {
163 System.err.println("coll: " + coll);
164 }
165 }
166
167 /*
168 public void insertWebsiteInfo(WebsiteInfo website)
169 {
170 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
171 Document document = new Document("_id", website.id)
172 .append("siteFolderName", website.siteFolderName)
173 .append("domain", website.domain)
174 .append("totalPages", website.totalPages)
175 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
176 .append("numPagesInMRI", website.numPagesInMRI)
177 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
178 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
179 .append("redoCrawl", website.redoCrawl);
180
181 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
182 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
183 document.put("countryCode", website.geoLocationCountryCode);
184 }
185
186 collection.insertOne(document);
187 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
188 + " inserted successfully into " + WEBSITES_COLLECTION);
189 }
190 */
191
192 /**
193 * Inserts a web page into the mongodb. Besides page related metadata and full body text
194 * the language information per sentence and per 2 adjacent sentences also get stored
195 * into the mongodb.
196 */
197 /*
198 public void insertWebpageInfo(WebpageInfo webpage)
199 {
200 int mri_sentence_count = 0;
201
202 // load the webpages db 'table'
203 // in mongodb, the equivalent of db tables are called 'collections'
204 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
205
206 Document document = new Document("_id", webpage.webpageID)
207 .append("siteid", webpage.websiteID)
208 .append("url", webpage.URL)
209 .append("isMRI", webpage.isMRI)
210 .append("totalSentences", webpage.totalSentences)
211 .append("charEncoding", webpage.charEncoding)
212 .append("modTime", webpage.modifiedTime)
213 .append("fetchTime", webpage.fetchTime);
214
215 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
216 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
217 List<BasicDBObject> sentencesList = new ArrayList<>();
218 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
219
220 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
221
222 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
223 bsonRecord.put("sentence", sentenceInfo.sentence);
224
225 sentencesList.add(bsonRecord);
226
227 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
228 mri_sentence_count++;
229 }
230
231 }
232 document.put("singleSentences", sentencesList);
233
234 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
235 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
236
237 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
238 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
239 bsonRecord.put("sentence", sentenceInfo.sentence);
240
241 overlappingSentencesList.add(bsonRecord);
242 }
243 document.put("overlappingSentences", overlappingSentencesList);
244
245 // also put the full text in there
246 document.put("text", webpage.text);
247
248 // also store the count of sentences in MRI
249 webpage.setMRISentenceCount(mri_sentence_count);
250 document.put("mriSentenceCount", mri_sentence_count);
251
252
253 collection.insertOne(document);
254 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
255 }
256 */
257
258 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
259 public void close() {}
260
261
262 // TODO:
263 // In the database, need to ensure we have else
264 // create collection (table in RDBMS) websites, create collection webpages.
265 // The webpages collection will have sentences embedded based on my decisions from
266 // reading the series
267 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
268 // Then need functions:
269 // insertWebsiteDocument()
270 // insertWebpageDocument()
271
272 public static void main(String args[]) {
273 try {
274 MongoDBAccess mongodbCon = new MongoDBAccess();
275 mongodbCon.connectToDB();
276 mongodbCon.showCollections();
277
278 } catch(Exception e) {
279 e.printStackTrace();
280 }
281 }
282}
Note: See TracBrowser for help on using the repository browser.