- Timestamp: 2020-01-29T21:48:52+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33871 r33876 3 3 //import org.bson.BSONObject; 4 4 5 import com.mongodb.client.AggregateIterable; 5 6 import com.mongodb.client.MongoCollection; 6 7 import com.mongodb.client.MongoDatabase; … … 11 12 // to use collection.find().projection() filters like include() etc 12 13 import static com.mongodb.client.model.Projections.*; 14 // to use aggregation functions like unwind(), match(), sort() etc 15 import static com.mongodb.client.model.Aggregates.*; 16 // to use functions like sum() and addToSet() within aggregation functions 17 import static com.mongodb.client.model.Accumulators.*; 13 18 14 19 //import org.bson.conversions.Bson; … … 22 27 23 28 import org.bson.Document; 29 import org.bson.conversions.Bson; 30 31 import com.mongodb.util.JSON; 32 //import com.mongodb.DBObject; 24 33 25 34 import java.io.BufferedReader; 26 35 import java.io.File; 27 36 import java.io.FileReader; 37 import java.util.Arrays; 28 38 import java.util.ArrayList; 29 39 import java.util.List; … … 68 78 static final String PROPS_FILENAME = "config.properties"; 69 79 public static final String WEBPAGES_COLLECTION = "Webpages"; 70 public static final String WEBSITES_COLLECTION = "Websites"; 80 public static final String WEBSITES_COLLECTION = "Websites"; 81 82 /** mongodb filter types to execute */ 83 public static final int IS_MRI = 0; 84 public static final int CONTAINS_MRI = 1; 71 85 72 86 // configuration details, some with fallback values … … 264 278 } 265 279 */ 266 280 281 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 282 return queryAllMatchingURLsFilteredBy(domain, IS_MRI); 283 } 284 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) { 285 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI); 286 } 287 267 288 /** 268 289 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/ … … 275 296 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 276 297 * 
http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 277 */278 public ArrayList<String> queryAllMatching IsMRIURLs(String domain) {298 */ 299 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) { 279 300 280 301 final ArrayList<String> urlsList = new ArrayList<String>(); … … 300 321 }; 301 322 302 303 323 304 324 // Run the following mongodb query: … … 309 329 310 330 // 2. Another way: 311 String query = "{URL: /DOMAIN/, isMRI: true}"; 331 //String query = "{URL: /DOMAIN/, isMRI: true}"; 332 String query = "{URL: /DOMAIN/, "; 333 if(filterType == IS_MRI) { 334 query += "isMRI: true}"; 335 } else if(filterType == CONTAINS_MRI) { 336 query += "containsMRI: true}"; 337 } 338 312 339 domain = domain.replace(".", "\\."); // escape dots in domain for regex 313 340 query = query.replace("DOMAIN", domain); … … 324 351 } 325 352 326 353 /** 354 355 db.Websites.aggregate([ 356 { 357 $match: { 358 $and: [ 359 {geoLocationCountryCode: {$ne: "NZ"}}, 360 {domain: {$not: /\.nz/}}, 361 {numPagesContainingMRI: {$gt: 0}}, 362 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 363 ] 364 } 365 }, 366 { $unwind: "$geoLocationCountryCode" }, 367 { 368 $group: { 369 _id: {$toLower: '$geoLocationCountryCode'}, 370 count: { $sum: 1 }, 371 domain: { $addToSet: '$domain' } 372 } 373 }, 374 { $sort : { count : -1} } 375 ]); 376 377 https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 378 https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 379 Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 380 381 (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 382 https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 383 On using group(TExpression) inside collection.aggregate(). 
384 */ 385 public String aggregateContainsMRIForOverseas() { 386 // working with the WebSites collection, not WebPages collection! 387 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 388 389 /*String matchQuery = 390 "$and: [" 391 + "{geoLocationCountryCode: {$ne: \"NZ\"}}," 392 + "{domain: {$not: /\\.nz/}}," 393 + "{numPagesContainingMRI: {$gt: 0}}," 394 + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}" 395 + "]";*/ 396 397 398 399 400 Bson orQuery = or( 401 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 402 BasicDBObject.parse("{urlContainsLangCodeInPath: false}") 403 ); 404 Bson andQuery = and( 405 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 406 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), 407 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 408 orQuery); 409 410 AggregateIterable<Document> output 411 = collection.aggregate(Arrays.asList( 412 match(andQuery), //match(BasicDBObject.parse(matchQuery)) 413 // match((List<DBObject>)JSON.parse(matchQuery)), 414 unwind("$geoLocationCountryCode"), 415 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 416 sort(BasicDBObject.parse("{count : -1}")) 417 )); 418 419 for (Document doc : output) { 420 //System.out.println(doc); 421 System.out.println(doc.toJson()); 422 } 423 424 return ""; 425 } 327 426 328 427 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
Note: See TracChangeset for help on using the changeset viewer.