source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess, which now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the examples online to use the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error. Nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

File size: 8.9 KB
Line 
1package org.greenstone.atea;
2
3
4import com.mongodb.client.MongoCollection;
5import com.mongodb.client.MongoDatabase;
6//import com.mongodb.client.MongoIterable;
7import com.mongodb.BasicDBObject;
8import com.mongodb.MongoClient;
9import com.mongodb.MongoCredential;
10import com.mongodb.ServerAddress;
11import com.mongodb.MongoClientOptions;
12
13import org.bson.Document;
14
15import java.io.BufferedReader;
16import java.io.File;
17import java.io.FileReader;
18import java.util.ArrayList;
19import java.util.List;
20import java.util.Properties;
21
22
23import org.apache.log4j.Logger;
24
25
26/**
27 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
28 *
29 * TO COMPILE:
30 * maori-lang-detection/src$
31 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
32 *
33 * TO RUN:
34 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
35 *
36 * Manually connecting to mongodb from client:
37 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
38 * Then after connecting with pwd, type:
39 * use DBNAME
40 *
41 * Or connect to mongodb and specify db in one statement:
42 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
43 *
44 * Some links:
45 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
46 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
47 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
48 * IMPORTANT LINK:
49 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
50 *
51 */
52public class MongoDBAccess implements AutoCloseable {
53
54 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
55
56 static final String PROPS_FILENAME = "config.properties";
57 public static final String WEBPAGES_COLLECTION = "webpages";
58 public static final String WEBSITES_COLLECTION = "websites";
59
60 // configuration details, some with fallback values
61 private String HOST = "localhost";
62 private int PORT = 27017; // mongodb port
63 private String USERNAME;
64 private String PASSWORD;
65 private String DB_NAME ="ateacrawldata";
66
67 private MongoClient mongo = null;
68 private MongoDatabase database = null;
69
70
71 public MongoDBAccess() throws Exception {
72 boolean success = false;
73
74 // Read in the username and password from our props file
75 Properties props = new Properties();
76
77 //File propsFile = new File(PROPS_FILENAME);
78 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
79 try {
80 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
81 } catch(Exception e) {
82 logger.error(e);
83 }
84
85
86 USERNAME = props.getProperty("mongodb.user", "");
87 if(USERNAME.equals("")) {
88 USERNAME = "root";
89 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
90 }
91 PASSWORD = props.getProperty("mongodb.pwd");
92
93 logger.debug("Got pwd: " + PASSWORD);
94
95 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
96
97 success = false;
98 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
99 }
100
101 HOST = props.getProperty("mongodb.host", HOST);
102 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
103 PORT = Integer.parseInt(port);
104 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
105
106 logger.info("Connecting to mongodb with:");
107 logger.info(" - host: " + HOST);
108 logger.info(" - port: " + PORT);
109 logger.info(" - user: " + USERNAME);
110 logger.info(" - db name: " + DB_NAME);
111 }
112
113 /**
114 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
115 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
116 */
117 public void connectToDB() throws Exception {
118
119 // Creating a Mongo client
120 mongo = new MongoClient( HOST, PORT );
121
122 // Creating Credentials
123 MongoCredential credential;
124 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
125 System.out.println("Connected to the database successfully");
126
127 // Accessing the database
128 this.database = mongo.getDatabase(DB_NAME);
129 logger.info("Credentials: "+ credential);
130
131 /*
132 MongoCredential credential;
133 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
134 logger.info("Credentials: "+ credential);
135
136 // Create our Mongo client
137 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
138 System.out.println("Connected to the database successfully");
139
140 this.database = mongo.getDatabase(DB_NAME);
141 */
142
143 }
144
145 // TODO: which fields should be indexed?
146
147 public void showCollections() {
148 //MongoIterable<String> colls = this.database.listCollectionNames();
149 for(String coll : this.database.listCollectionNames()) {
150 System.err.println("coll: " + coll);
151 }
152 }
153
154
155 public void insertWebsiteInfo(WebsiteInfo website)
156 {
157 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
158 Document document = new Document("_id", website.id)
159 .append("siteFolderName", website.siteFolderName)
160 .append("domain", website.domain)
161 .append("totalPages", website.totalPages)
162 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
163 .append("numPagesInMRI", website.numPagesInMRI)
164 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
165 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
166 .append("redoCrawl", website.redoCrawl);
167
168 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
169 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
170 document.put("countryCode", website.geoLocationCountryCode);
171 }
172
173 collection.insertOne(document);
174 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
175 + " inserted successfully into " + WEBSITES_COLLECTION);
176 }
177
178 /* TODO:
179 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
180 */
181 public void insertWebpageInfo(WebpageInfo webpage)
182 {
183 // load the webpages db 'table'
184 // in mongodb, the equivalent of db tables are called 'collections'
185 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
186
187 Document document = new Document("_id", webpage.webpageID)
188 .append("siteid", webpage.websiteID)
189 .append("url", webpage.URL)
190 .append("isMRI", webpage.isMRI)
191 .append("totalSentences", webpage.totalSentences)
192 .append("charEncoding", webpage.charEncoding)
193 .append("modTime", webpage.modifiedTime)
194 .append("fetchTime", webpage.fetchTime);
195
196 // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT:
197 //document.put("singleSentences", webpage.singleSentences);
198 //document.put("overlappingSentences", webpage.overlappingSentences);
199
200 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
201 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
202 List<BasicDBObject> sentencesList = new ArrayList<>();
203 for(SentenceInfo sentence : webpage.singleSentences) {
204 sentencesList.add(new BasicDBObject("langCode", sentence.langCode));
205 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel));
206 sentencesList.add(new BasicDBObject("sentence", sentence));
207 }
208 document.put("singleSentences", sentencesList);
209
210 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
211 for(SentenceInfo sentence : webpage.overlappingSentences) {
212 sentencesList.add(new BasicDBObject("langCode", sentence.langCode));
213 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel));
214 sentencesList.add(new BasicDBObject("sentence", sentence));
215 }
216 document.put("singleSentences", overlappingSentencesList);
217
218 // also put the full text in there
219 document.put("text", webpage.text);
220
221 collection.insertOne(document);
222 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
223 }
224
225 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
226 public void close() {}
227
228
229 // TODO:
230 // In the database, need to ensure we have else
231 // create collection (table in RDBMS) websites, create collection webpages.
232 // The webpages collection will have sentences embedded based on my decisions from
233 // reading the series
234 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
235 // Then need functions:
236 // insertWebsiteDocument()
237 // insertWebpageDocument()
238
239 public static void main(String args[]) {
240 try {
241 MongoDBAccess mongodbCon = new MongoDBAccess();
242 mongodbCon.connectToDB();
243 mongodbCon.showCollections();
244
245 } catch(Exception e) {
246 e.printStackTrace();
247 }
248 }
249}
Note: See TracBrowser for help on using the repository browser.