source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33645

Last change on this file since 33645 was 33645, checked in by ak19, 4 years ago

Fix for 2 bugs when sending data to MongoDB: 1. overlappingSentences was missing because it was overwritten by singleSentences. However, it is still empty at present because of another bug. 2. Need to store an array of tuples (one object per sentence), rather than storing each key-value pair of a tuple as a separate array element.

File size: 9.2 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.MongoCollection;
6import com.mongodb.client.MongoDatabase;
7//import com.mongodb.client.MongoIterable;
8import com.mongodb.BasicDBObject;
9import com.mongodb.MongoClient;
10import com.mongodb.MongoCredential;
11import com.mongodb.ServerAddress;
12import com.mongodb.MongoClientOptions;
13
14import org.bson.Document;
15
16import java.io.BufferedReader;
17import java.io.File;
18import java.io.FileReader;
19import java.util.ArrayList;
20import java.util.List;
21import java.util.Properties;
22
23
24import org.apache.log4j.Logger;
25
26
27/**
28 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
29 *
30 * TO COMPILE:
31 * maori-lang-detection/src$
32 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
33 *
34 * TO RUN:
35 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
36 *
37 * Manually connecting to mongodb from client:
38 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
39 * Then after connecting with pwd, type:
40 * use DBNAME
41 *
42 * Or connect to mongodb and specify db in one statement:
43 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
44 *
45 * Some links:
46 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
47 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
48 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
49 * IMPORTANT LINK:
50 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
51 *
52 */
53public class MongoDBAccess implements AutoCloseable {
54
55 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
56
57 static final String PROPS_FILENAME = "config.properties";
58 public static final String WEBPAGES_COLLECTION = "webpages";
59 public static final String WEBSITES_COLLECTION = "websites";
60
61 // configuration details, some with fallback values
62 private String HOST = "localhost";
63 private int PORT = 27017; // mongodb port
64 private String USERNAME;
65 private String PASSWORD;
66 private String DB_NAME ="ateacrawldata";
67
68 private MongoClient mongo = null;
69 private MongoDatabase database = null;
70
71
72 public MongoDBAccess() throws Exception {
73 boolean success = false;
74
75 // Read in the username and password from our props file
76 Properties props = new Properties();
77
78 //File propsFile = new File(PROPS_FILENAME);
79 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
80 try {
81 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
82 } catch(Exception e) {
83 logger.error(e);
84 }
85
86
87 USERNAME = props.getProperty("mongodb.user", "");
88 if(USERNAME.equals("")) {
89 USERNAME = "root";
90 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
91 }
92 PASSWORD = props.getProperty("mongodb.pwd");
93
94 logger.debug("Got pwd: " + PASSWORD);
95
96 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
97
98 success = false;
99 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
100 }
101
102 HOST = props.getProperty("mongodb.host", HOST);
103 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
104 PORT = Integer.parseInt(port);
105 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
106
107 logger.info("Connecting to mongodb with:");
108 logger.info(" - host: " + HOST);
109 logger.info(" - port: " + PORT);
110 logger.info(" - user: " + USERNAME);
111 logger.info(" - db name: " + DB_NAME);
112 }
113
114 /**
115 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
116 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
117 */
118 public void connectToDB() throws Exception {
119
120 // Creating a Mongo client
121 mongo = new MongoClient( HOST, PORT );
122
123 // Creating Credentials
124 MongoCredential credential;
125 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
126 System.out.println("Connected to the database successfully");
127
128 // Accessing the database
129 this.database = mongo.getDatabase(DB_NAME);
130 logger.info("Credentials: "+ credential);
131
132 /*
133 MongoCredential credential;
134 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
135 logger.info("Credentials: "+ credential);
136
137 // Create our Mongo client
138 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
139 System.out.println("Connected to the database successfully");
140
141 this.database = mongo.getDatabase(DB_NAME);
142 */
143
144 }
145
146 // TODO: which fields should be indexed?
147
148 public void showCollections() {
149 //MongoIterable<String> colls = this.database.listCollectionNames();
150 for(String coll : this.database.listCollectionNames()) {
151 System.err.println("coll: " + coll);
152 }
153 }
154
155
156 public void insertWebsiteInfo(WebsiteInfo website)
157 {
158 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
159 Document document = new Document("_id", website.id)
160 .append("siteFolderName", website.siteFolderName)
161 .append("domain", website.domain)
162 .append("totalPages", website.totalPages)
163 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
164 .append("numPagesInMRI", website.numPagesInMRI)
165 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
166 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
167 .append("redoCrawl", website.redoCrawl);
168
169 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
170 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
171 document.put("countryCode", website.geoLocationCountryCode);
172 }
173
174 collection.insertOne(document);
175 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
176 + " inserted successfully into " + WEBSITES_COLLECTION);
177 }
178
179 /* TODO:
180 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
181 */
182 public void insertWebpageInfo(WebpageInfo webpage)
183 {
184 // load the webpages db 'table'
185 // in mongodb, the equivalent of db tables are called 'collections'
186 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
187
188 Document document = new Document("_id", webpage.webpageID)
189 .append("siteid", webpage.websiteID)
190 .append("url", webpage.URL)
191 .append("isMRI", webpage.isMRI)
192 .append("totalSentences", webpage.totalSentences)
193 .append("charEncoding", webpage.charEncoding)
194 .append("modTime", webpage.modifiedTime)
195 .append("fetchTime", webpage.fetchTime);
196
197 // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT:
198 //document.put("singleSentences", webpage.singleSentences);
199 //document.put("overlappingSentences", webpage.overlappingSentences);
200
201 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
202 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
203 List<BasicDBObject> sentencesList = new ArrayList<>();
204 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
205 //sentencesList.add(new BasicDBObject("langCode", sentenceInfo.langCode));
206 //sentencesList.add(new BasicDBObject("confidence", sentenceInfo.confidenceLevel));
207 //sentencesList.add(new BasicDBObject("sentence", sentenceInfo.sentence));
208 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
209 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
210 bsonRecord.put("sentence", sentenceInfo.sentence);
211
212 sentencesList.add(bsonRecord);
213 }
214 document.put("singleSentences", sentencesList);
215
216 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
217 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
218 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
219 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
220 bsonRecord.put("sentence", sentenceInfo.sentence);
221
222 sentencesList.add(bsonRecord);
223 }
224 document.put("overlappingSentences", overlappingSentencesList);
225
226 // also put the full text in there
227 document.put("text", webpage.text);
228
229 collection.insertOne(document);
230 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
231 }
232
233 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
234 public void close() {}
235
236
237 // TODO:
238 // In the database, need to ensure we have else
239 // create collection (table in RDBMS) websites, create collection webpages.
240 // The webpages collection will have sentences embedded based on my decisions from
241 // reading the series
242 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
243 // Then need functions:
244 // insertWebsiteDocument()
245 // insertWebpageDocument()
246
247 public static void main(String args[]) {
248 try {
249 MongoDBAccess mongodbCon = new MongoDBAccess();
250 mongodbCon.connectToDB();
251 mongodbCon.showCollections();
252
253 } catch(Exception e) {
254 e.printStackTrace();
255 }
256 }
257}
Note: See TracBrowser for help on using the repository browser.