source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

Last change on this file was 33911, checked in by ak19, 4 years ago

Correct commit message for previous and current commit: 1. After refactoring MongoDBAccess class into additional subclass MongoDBQueryer, I split the import statements accordingly too. 2. Renamed WebPageURLsListing.java class to SummaryTool.java

File size: 9.8 KB
Line 
1package org.greenstone.atea;
2
3
4import com.mongodb.client.MongoCollection;
5import com.mongodb.client.MongoDatabase;
6
7import com.mongodb.MongoClient;
8import com.mongodb.MongoCredential;
9import com.mongodb.ServerAddress;
10import com.mongodb.MongoClientOptions;
11
12import org.bson.Document;
13
14import java.util.Properties;
15
16import org.apache.log4j.Logger;
17
18import org.greenstone.atea.morphia.*;
19import dev.morphia.*;
20
21
22/**
23 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
24 *
25 * TO COMPILE:
26 * maori-lang-detection/src$
27 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
28 *
29 * TO RUN:
30 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
31 *
32 * Manually connecting to mongodb from client:
33 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
34 * Then after connecting with pwd, type:
35 * use DBNAME
36 *
37 * Or connect to mongodb and specify db in one statement:
38 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
39 *
40 * Some links:
41 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
42 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
43 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
44 * IMPORTANT LINK:
45 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
46 *
47 * API:
48 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
49 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
50 */
51public class MongoDBAccess implements AutoCloseable {
52
53 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
54
55 static final String PROPS_FILENAME = "config.properties";
56 public static final String WEBPAGES_COLLECTION = "Webpages";
57 public static final String WEBSITES_COLLECTION = "Websites";
58
59
60 // configuration details, some with fallback values
61 protected String HOST = "localhost";
62 protected int PORT = 27017; // mongodb port
63 protected String USERNAME;
64 protected String PASSWORD;
65 protected String DB_NAME ="ateacrawldata";
66
67 protected MongoClient mongo = null;
68 protected MongoDatabase database = null;
69
70 /**
71 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
72 * for MongoDB
73 */
74 public Datastore datastore = null;
75
76 public MongoDBAccess() throws Exception {
77 boolean success = false;
78
79 // Read in the username and password from our props file
80 Properties props = new Properties();
81
82 //File propsFile = new File(PROPS_FILENAME);
83 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
84 try {
85 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
86 } catch(Exception e) {
87 logger.error(e);
88 }
89
90
91 USERNAME = props.getProperty("mongodb.user", "");
92 if(USERNAME.equals("")) {
93 USERNAME = "root";
94 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
95 }
96 PASSWORD = props.getProperty("mongodb.pwd");
97
98 logger.debug("Got pwd: " + PASSWORD);
99
100 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
101
102 success = false;
103 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
104 }
105
106 HOST = props.getProperty("mongodb.host", HOST);
107 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
108 PORT = Integer.parseInt(port);
109 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
110
111 logger.info("Connecting to mongodb with:");
112 logger.info(" - host: " + HOST);
113 logger.info(" - port: " + PORT);
114 logger.info(" - user: " + USERNAME);
115 logger.info(" - db name: " + DB_NAME);
116 }
117
118 /**
119 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
120 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
121 */
122 public void connectToDB() throws Exception {
123
124 // Creating a Mongo client
125 mongo = new MongoClient( HOST, PORT );
126
127 // Creating Credentials
128 MongoCredential credential;
129 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
130 System.out.println("Connected to the database successfully");
131
132 // Accessing the database
133 this.database = mongo.getDatabase(DB_NAME);
134 logger.info("Credentials: "+ credential);
135
136 /*
137 MongoCredential credential;
138 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
139 logger.info("Credentials: "+ credential);
140
141 // Create our Mongo client
142 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
143 System.out.println("Connected to the database successfully");
144
145 this.database = mongo.getDatabase(DB_NAME);
146 */
147
148 Morphia morphia = new Morphia();
149 morphia.mapPackage("com.greenstone.atea.morphia");
150 datastore = morphia.createDatastore(mongo, DB_NAME);
151 datastore.ensureIndexes();
152
153 }
154
155 // TODO: which fields should be indexed?
156
157 public void showCollections() {
158 //MongoIterable<String> colls = this.database.listCollectionNames();
159 for(String coll : this.database.listCollectionNames()) {
160 System.err.println("coll: " + coll);
161 }
162 }
163
164 protected MongoCollection<Document> getWebpagesCollection() {
165 return this.database.getCollection(WEBPAGES_COLLECTION);
166 }
167 protected MongoCollection<Document> getWebsitesCollection() {
168 return this.database.getCollection(WEBSITES_COLLECTION);
169 }
170
171 /*
172 public void insertWebsiteInfo(WebsiteInfo website)
173 {
174 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
175 Document document = new Document("_id", website.id)
176 .append("siteFolderName", website.siteFolderName)
177 .append("domain", website.domain)
178 .append("totalPages", website.totalPages)
179 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
180 .append("numPagesInMRI", website.numPagesInMRI)
181 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
182 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
183 .append("redoCrawl", website.redoCrawl);
184
185 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
186 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
187 document.put("countryCode", website.geoLocationCountryCode);
188 }
189
190 collection.insertOne(document);
191 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
192 + " inserted successfully into " + WEBSITES_COLLECTION);
193 }
194 */
195
196 /**
197 * Inserts a web page into the mongodb. Besides page related metadata and full body text
198 * the language information per sentence and per 2 adjacent sentences also get stored
199 * into the mongodb.
200 */
201 /*
202 public void insertWebpageInfo(WebpageInfo webpage)
203 {
204 int mri_sentence_count = 0;
205
206 // load the webpages db 'table'
207 // in mongodb, the equivalent of db tables are called 'collections'
208 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
209
210 Document document = new Document("_id", webpage.webpageID)
211 .append("siteid", webpage.websiteID)
212 .append("url", webpage.URL)
213 .append("isMRI", webpage.isMRI)
214 .append("totalSentences", webpage.totalSentences)
215 .append("charEncoding", webpage.charEncoding)
216 .append("modTime", webpage.modifiedTime)
217 .append("fetchTime", webpage.fetchTime);
218
219 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
220 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
221 List<BasicDBObject> sentencesList = new ArrayList<>();
222 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
223
224 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
225
226 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
227 bsonRecord.put("sentence", sentenceInfo.sentence);
228
229 sentencesList.add(bsonRecord);
230
231 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
232 mri_sentence_count++;
233 }
234
235 }
236 document.put("singleSentences", sentencesList);
237
238 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
239 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
240
241 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
242 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
243 bsonRecord.put("sentence", sentenceInfo.sentence);
244
245 overlappingSentencesList.add(bsonRecord);
246 }
247 document.put("overlappingSentences", overlappingSentencesList);
248
249 // also put the full text in there
250 document.put("text", webpage.text);
251
252 // also store the count of sentences in MRI
253 webpage.setMRISentenceCount(mri_sentence_count);
254 document.put("mriSentenceCount", mri_sentence_count);
255
256
257 collection.insertOne(document);
258 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
259 }
260 */
261
262
263 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
264 public void close() {}
265
266
267 // TODO:
268 // In the database, need to ensure we have else
269 // create collection (table in RDBMS) websites, create collection webpages.
270 // The webpages collection will have sentences embedded based on my decisions from
271 // reading the series
272 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
273 // Then need functions:
274 // insertWebsiteDocument()
275 // insertWebpageDocument()
276
277 public static void main(String args[]) {
278 try {
279 MongoDBAccess mongodbCon = new MongoDBAccess();
280 mongodbCon.connectToDB();
281 mongodbCon.showCollections();
282
283 } catch(Exception e) {
284 e.printStackTrace();
285 }
286 }
287}
Note: See TracBrowser for help on using the repository browser.