- Timestamp: 2019-11-08T23:59:07+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33633 r33634 4 4 import com.mongodb.client.MongoCollection; 5 5 import com.mongodb.client.MongoDatabase; 6 //import com.mongodb.client.MongoIterable; 7 import com.mongodb.BasicDBObject; 6 8 import com.mongodb.MongoClient; 7 import com.mongodb.MongoCredential; 9 import com.mongodb.MongoCredential; 10 import com.mongodb.ServerAddress; 11 import com.mongodb.MongoClientOptions; 8 12 9 13 import org.bson.Document; … … 12 16 import java.io.File; 13 17 import java.io.FileReader; 18 import java.util.ArrayList; 19 import java.util.List; 14 20 import java.util.Properties; 21 15 22 16 23 import org.apache.log4j.Logger; … … 43 50 * 44 51 */ 45 public class MongoDBAccess {52 public class MongoDBAccess implements AutoCloseable { 46 53 47 54 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName()); 48 55 49 56 static final String PROPS_FILENAME = "config.properties"; 50 public static final String DB_NAME = "anupama"; //"ateacrawldata";51 57 public static final String WEBPAGES_COLLECTION = "webpages"; 52 58 public static final String WEBSITES_COLLECTION = "websites"; 53 59 60 // configuration details, some with fallback values 54 61 private String HOST = "localhost"; 55 62 private int PORT = 27017; // mongodb port 56 63 private String USERNAME; 57 64 private String PASSWORD; 58 65 private String DB_NAME ="ateacrawldata"; 66 59 67 private MongoClient mongo = null; 60 68 private MongoDatabase database = null; … … 108 116 */ 109 117 public void connectToDB() throws Exception { 118 110 119 // Creating a Mongo client 111 120 mongo = new MongoClient( HOST, PORT ); … … 117 126 118 127 // Accessing the database 119 database = mongo.getDatabase(DB_NAME);128 this.database = mongo.getDatabase(DB_NAME); 120 129 logger.info("Credentials: "+ credential); 121 } 122 123 124 public void insertWebSiteInfo(int SITE_COUNTER, int siteID, String domainOfSite, 125 int numPages, int numPagesInMRI, int numPagesContainingMRI, 126 /* TODO: String 
geoLocationCountryCode, boolean miURL */ 127 String siteCrawledTimestamp, String siteCrawlUnfinished, boolean redoCrawl) 130 131 /* 132 MongoCredential credential; 133 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray()); 134 logger.info("Credentials: "+ credential); 135 136 // Create our Mongo client 137 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build()); 138 System.out.println("Connected to the database successfully"); 139 140 this.database = mongo.getDatabase(DB_NAME); 141 */ 142 143 } 144 145 // TODO: which fields should be indexed? 146 147 public void showCollections() { 148 //MongoIterable<String> colls = this.database.listCollectionNames(); 149 for(String coll : this.database.listCollectionNames()) { 150 System.err.println("coll: " + coll); 151 } 152 } 153 154 155 public void insertWebsiteInfo(WebsiteInfo website) 128 156 { 129 157 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 130 Document document = new Document("id", SITE_COUNTER) 131 .append("siteFolderName", siteID) 132 .append("domain", domainOfSite) 133 .append("totalPages", numPages) 134 .append("numPagesInMRI", numPagesInMRI) 135 .append("numPagesContainingMRI", numPagesContainingMRI) 136 .append("siteCrawledTimestamp", siteCrawledTimestamp) 137 .append("siteCrawlUnfinished", siteCrawlUnfinished) 138 .append("redoCrawl", redoCrawl); 158 Document document = new Document("_id", website.id) 159 .append("siteFolderName", website.siteFolderName) 160 .append("domain", website.domain) 161 .append("totalPages", website.totalPages) 162 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) 163 .append("numPagesInMRI", website.numPagesInMRI) 164 .append("siteCrawledTimestamp", website.siteCrawledTimestamp) 165 .append("siteCrawlUnfinished", website.siteCrawlUnfinished) 166 .append("redoCrawl", website.redoCrawl); 167 168 
document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath); 169 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) { 170 document.put("countryCode", website.geoLocationCountryCode); 171 } 172 139 173 collection.insertOne(document); 140 System.out.println("website info inserted successfully into " + WEBSITES_COLLECTION); 141 } 142 143 144 public void insertWebPage(int WEBPAGE_COUNTER, int site_id, /* id of websites_collection*/ 145 String url, String charEncoding, String modTime, String fetchTime, 146 boolean isMRI, int totalSentences, int numSentencesInMRI, 147 ArrayList<SentenceInfo> singleSentences, 148 ArrayList<SentenceInfo> overlappingSentences) 174 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")" 175 + " inserted successfully into " + WEBSITES_COLLECTION); 176 } 177 178 /* TODO: 179 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 180 */ 181 public void insertWebpageInfo(WebpageInfo webpage) 149 182 { 150 183 // load the webpages db 'table' … … 152 185 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 153 186 154 Document document = new Document("id", WEBPAGE_COUNTER) 155 .append("siteid", site_id) 156 .append("url", url) 157 .append("charEncoding", charEncoding) 158 .append("modTime", modTime) 159 .append("fetchTime", fetchTime) 160 .append("isMRI", isMRI) 161 .append("totalSentences", totalSentences) 162 .append("numSentencesInMRI", numSentencesInMRI); 163 164 document.put("singleSentences", singleSentences); 165 document.put("overlappingSentences", overlappingSentences); 187 Document document = new Document("_id", webpage.webpageID) 188 .append("siteid", webpage.websiteID) 189 .append("url", webpage.URL) 190 .append("isMRI", webpage.isMRI) 191 .append("totalSentences", webpage.totalSentences) 192 .append("charEncoding", webpage.charEncoding) 193 
.append("modTime", webpage.modifiedTime) 194 .append("fetchTime", webpage.fetchTime); 195 196 // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT: 197 //document.put("singleSentences", webpage.singleSentences); 198 //document.put("overlappingSentences", webpage.overlappingSentences); 199 200 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: 201 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java 202 List<BasicDBObject> sentencesList = new ArrayList<>(); 203 for(SentenceInfo sentence : webpage.singleSentences) { 204 sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 205 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 206 sentencesList.add(new BasicDBObject("sentence", sentence)); 207 } 208 document.put("singleSentences", sentencesList); 209 210 List<BasicDBObject> overlappingSentencesList = new ArrayList<>(); 211 for(SentenceInfo sentence : webpage.overlappingSentences) { 212 sentencesList.add(new BasicDBObject("langCode", sentence.langCode)); 213 sentencesList.add(new BasicDBObject("confidence", sentence.confidenceLevel)); 214 sentencesList.add(new BasicDBObject("sentence", sentence)); 215 } 216 document.put("singleSentences", overlappingSentencesList); 217 218 // also put the full text in there 219 document.put("text", webpage.text); 166 220 167 221 collection.insertOne(document); 168 System.out.println("website info inserted successfully into " + WEBPAGES_COLLECTION); 169 } 222 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION); 223 } 224 225 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 226 public void close() {} 170 227 171 228 … … 184 241 MongoDBAccess mongodbCon = new MongoDBAccess(); 185 242 mongodbCon.connectToDB(); 186 //mongodbCon.insertDocument(); 187 }catch(Exception e) { 243 mongodbCon.showCollections(); 244 245 } catch(Exception e) { 188 246 
e.printStackTrace(); 189 247 }
Note: See TracChangeset for help on using the changeset viewer.