source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33652

Last change on this file since 33652 was 33652, checked in by ak19, 4 years ago

Introducing morphia subpackage

File size: 9.7 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.MongoCollection;
6import com.mongodb.client.MongoDatabase;
7//import com.mongodb.client.MongoIterable;
8import com.mongodb.BasicDBObject;
9import com.mongodb.MongoClient;
10import com.mongodb.MongoCredential;
11import com.mongodb.ServerAddress;
12import com.mongodb.MongoClientOptions;
13
14import org.bson.Document;
15
16import java.io.BufferedReader;
17import java.io.File;
18import java.io.FileReader;
19import java.util.ArrayList;
20import java.util.List;
21import java.util.Properties;
22
23
24import org.apache.log4j.Logger;
25
26import org.greenstone.atea.morphia.*;
27import dev.morphia.*;
28
29/**
30 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
31 *
32 * TO COMPILE:
33 * maori-lang-detection/src$
34 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
35 *
36 * TO RUN:
37 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
38 *
39 * Manually connecting to mongodb from client:
40 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
41 * Then after connecting with pwd, type:
42 * use DBNAME
43 *
44 * Or connect to mongodb and specify db in one statement:
45 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
46 *
47 * Some links:
48 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
49 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
50 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
51 * IMPORTANT LINK:
52 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
53 *
54 */
55public class MongoDBAccess implements AutoCloseable {
56
57 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
58
59 static final String PROPS_FILENAME = "config.properties";
60 public static final String WEBPAGES_COLLECTION = "webpages";
61 public static final String WEBSITES_COLLECTION = "websites";
62
63 // configuration details, some with fallback values
64 private String HOST = "localhost";
65 private int PORT = 27017; // mongodb port
66 private String USERNAME;
67 private String PASSWORD;
68 private String DB_NAME ="ateacrawldata";
69
70 private MongoClient mongo = null;
71 private MongoDatabase database = null;
72
73 /**
74 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
75 * for MongoDB
76 */
77 public Datastore datastore = null;
78
79 public MongoDBAccess() throws Exception {
80 boolean success = false;
81
82 // Read in the username and password from our props file
83 Properties props = new Properties();
84
85 //File propsFile = new File(PROPS_FILENAME);
86 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
87 try {
88 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
89 } catch(Exception e) {
90 logger.error(e);
91 }
92
93
94 USERNAME = props.getProperty("mongodb.user", "");
95 if(USERNAME.equals("")) {
96 USERNAME = "root";
97 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
98 }
99 PASSWORD = props.getProperty("mongodb.pwd");
100
101 logger.debug("Got pwd: " + PASSWORD);
102
103 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
104
105 success = false;
106 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
107 }
108
109 HOST = props.getProperty("mongodb.host", HOST);
110 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
111 PORT = Integer.parseInt(port);
112 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
113
114 logger.info("Connecting to mongodb with:");
115 logger.info(" - host: " + HOST);
116 logger.info(" - port: " + PORT);
117 logger.info(" - user: " + USERNAME);
118 logger.info(" - db name: " + DB_NAME);
119 }
120
121 /**
122 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
123 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
124 */
125 public void connectToDB() throws Exception {
126
127 // Creating a Mongo client
128 mongo = new MongoClient( HOST, PORT );
129
130 // Creating Credentials
131 MongoCredential credential;
132 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
133 System.out.println("Connected to the database successfully");
134
135 // Accessing the database
136 this.database = mongo.getDatabase(DB_NAME);
137 logger.info("Credentials: "+ credential);
138
139 /*
140 MongoCredential credential;
141 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
142 logger.info("Credentials: "+ credential);
143
144 // Create our Mongo client
145 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
146 System.out.println("Connected to the database successfully");
147
148 this.database = mongo.getDatabase(DB_NAME);
149 */
150
151 Morphia morphia = new Morphia();
152 morphia.mapPackage("com.greenstone.atea.morphia");
153 datastore = morphia.createDatastore(mongo, DB_NAME);
154 datastore.ensureIndexes();
155
156 }
157
158 // TODO: which fields should be indexed?
159
160 public void showCollections() {
161 //MongoIterable<String> colls = this.database.listCollectionNames();
162 for(String coll : this.database.listCollectionNames()) {
163 System.err.println("coll: " + coll);
164 }
165 }
166
167
168 public void insertWebsiteInfo(WebsiteInfo website)
169 {
170 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
171 Document document = new Document("_id", website.id)
172 .append("siteFolderName", website.siteFolderName)
173 .append("domain", website.domain)
174 .append("totalPages", website.totalPages)
175 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
176 .append("numPagesInMRI", website.numPagesInMRI)
177 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
178 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
179 .append("redoCrawl", website.redoCrawl);
180
181 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
182 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
183 document.put("countryCode", website.geoLocationCountryCode);
184 }
185
186 collection.insertOne(document);
187 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
188 + " inserted successfully into " + WEBSITES_COLLECTION);
189 }
190
191 /* TODO:
192 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
193 */
194 /**
195 * Inserts a web page into the mongodb. Besides page related metadata and full body text
196 * the language information per sentence and per 2 adjacent sentences also get stored
197 * into the mongodb.
198 */
199 public void insertWebpageInfo(WebpageInfo webpage)
200 {
201 int mri_sentence_count = 0;
202
203 // load the webpages db 'table'
204 // in mongodb, the equivalent of db tables are called 'collections'
205 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
206
207 Document document = new Document("_id", webpage.webpageID)
208 .append("siteid", webpage.websiteID)
209 .append("url", webpage.URL)
210 .append("isMRI", webpage.isMRI)
211 .append("totalSentences", webpage.totalSentences)
212 .append("charEncoding", webpage.charEncoding)
213 .append("modTime", webpage.modifiedTime)
214 .append("fetchTime", webpage.fetchTime);
215
216 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
217 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
218 List<BasicDBObject> sentencesList = new ArrayList<>();
219 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
220
221 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
222
223 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
224 bsonRecord.put("sentence", sentenceInfo.sentence);
225
226 sentencesList.add(bsonRecord);
227
228 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
229 mri_sentence_count++;
230 }
231
232 }
233 document.put("singleSentences", sentencesList);
234
235 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
236 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
237
238 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
239 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
240 bsonRecord.put("sentence", sentenceInfo.sentence);
241
242 overlappingSentencesList.add(bsonRecord);
243 }
244 document.put("overlappingSentences", overlappingSentencesList);
245
246 // also put the full text in there
247 document.put("text", webpage.text);
248
249 // also store the count of sentences in MRI
250 webpage.setMRISentenceCount(mri_sentence_count);
251 document.put("mriSentenceCount", mri_sentence_count);
252
253
254 collection.insertOne(document);
255 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
256 }
257
258 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
259 public void close() {}
260
261
262 // TODO:
263 // In the database, need to ensure we have else
264 // create collection (table in RDBMS) websites, create collection webpages.
265 // The webpages collection will have sentences embedded based on my decisions from
266 // reading the series
267 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
268 // Then need functions:
269 // insertWebsiteDocument()
270 // insertWebpageDocument()
271
272 public static void main(String args[]) {
273 try {
274 MongoDBAccess mongodbCon = new MongoDBAccess();
275 mongodbCon.connectToDB();
276 mongodbCon.showCollections();
277
278 } catch(Exception e) {
279 e.printStackTrace();
280 }
281 }
282}
Note: See TracBrowser for help on using the repository browser.