Changeset 33876
- Timestamp:
- 2020-01-29T21:48:52+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33871 r33876 3 3 //import org.bson.BSONObject; 4 4 5 import com.mongodb.client.AggregateIterable; 5 6 import com.mongodb.client.MongoCollection; 6 7 import com.mongodb.client.MongoDatabase; … … 11 12 // to use collection.find().projection() filters like include() etc 12 13 import static com.mongodb.client.model.Projections.*; 14 // to use aggregation functions like unwind(), match(), sort() etc 15 import static com.mongodb.client.model.Aggregates.*; 16 // to use functions like sum() and addToSet() within aggregation functions 17 import static com.mongodb.client.model.Accumulators.*; 13 18 14 19 //import org.bson.conversions.Bson; … … 22 27 23 28 import org.bson.Document; 29 import org.bson.conversions.Bson; 30 31 import com.mongodb.util.JSON; 32 //import com.mongodb.DBObject; 24 33 25 34 import java.io.BufferedReader; 26 35 import java.io.File; 27 36 import java.io.FileReader; 37 import java.util.Arrays; 28 38 import java.util.ArrayList; 29 39 import java.util.List; … … 68 78 static final String PROPS_FILENAME = "config.properties"; 69 79 public static final String WEBPAGES_COLLECTION = "Webpages"; 70 public static final String WEBSITES_COLLECTION = "Websites"; 80 public static final String WEBSITES_COLLECTION = "Websites"; 81 82 /** mongodb filter types to execute */ 83 public static final int IS_MRI = 0; 84 public static final int CONTAINS_MRI = 1; 71 85 72 86 // configuration details, some with fallback values … … 264 278 } 265 279 */ 266 280 281 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 282 return queryAllMatchingURLsFilteredBy(domain, IS_MRI); 283 } 284 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) { 285 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI); 286 } 287 267 288 /** 268 289 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/ … … 275 296 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 276 297 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 277 */278 public ArrayList<String> queryAllMatching IsMRIURLs(String domain) {298 */ 299 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) { 279 300 280 301 final ArrayList<String> urlsList = new ArrayList<String>(); … … 300 321 }; 301 322 302 303 323 304 324 // Run the following mongodb query: … … 309 329 310 330 // 2. Another way: 311 String query = "{URL: /DOMAIN/, isMRI: true}"; 331 //String query = "{URL: /DOMAIN/, isMRI: true}"; 332 String query = "{URL: /DOMAIN/, "; 333 if(filterType == IS_MRI) { 334 query += "isMRI: true}"; 335 } else if(filterType == CONTAINS_MRI) { 336 query += "containsMRI: true}"; 337 } 338 312 339 domain = domain.replace(".", "\\."); // escape dots in domain for regex 313 340 query = query.replace("DOMAIN", domain); … … 324 351 } 325 352 326 353 /** 354 355 db.Websites.aggregate([ 356 { 357 $match: { 358 $and: [ 359 {geoLocationCountryCode: {$ne: "NZ"}}, 360 {domain: {$not: /\.nz/}}, 361 {numPagesContainingMRI: {$gt: 0}}, 362 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]} 363 ] 364 } 365 }, 366 { $unwind: "$geoLocationCountryCode" }, 367 { 368 $group: { 369 _id: {$toLower: '$geoLocationCountryCode'}, 370 count: { $sum: 1 }, 371 domain: { $addToSet: '$domain' } 372 } 373 }, 374 { $sort : { count : -1} } 375 ]); 376 377 https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 378 https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 379 Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 380 381 (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 382 https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 383 On using group(TExpression) inside collection.aggregate(). 384 */ 385 public String aggregateContainsMRIForOverseas() { 386 // working with the WebSites collection, not WebPages collection! 387 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 388 389 /*String matchQuery = 390 "$and: [" 391 + "{geoLocationCountryCode: {$ne: \"NZ\"}}," 392 + "{domain: {$not: /\\.nz/}}," 393 + "{numPagesContainingMRI: {$gt: 0}}," 394 + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}" 395 + "]";*/ 396 397 398 399 400 Bson orQuery = or( 401 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 402 BasicDBObject.parse("{urlContainsLangCodeInPath: false}") 403 ); 404 Bson andQuery = and( 405 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 406 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), 407 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 408 orQuery); 409 410 AggregateIterable<Document> output 411 = collection.aggregate(Arrays.asList( 412 match(andQuery), //match(BasicDBObject.parse(matchQuery)) 413 // match((List<DBObject>)JSON.parse(matchQuery)), 414 unwind("$geoLocationCountryCode"), 415 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 416 sort(BasicDBObject.parse("{count : -1}")) 417 )); 418 419 for (Document doc : output) { 420 //System.out.println(doc); 421 System.out.println(doc.toJson()); 422 } 423 424 return ""; 425 } 327 426 328 427 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33873 r33876 18 18 * TO RUN: 19 19 * maori-lang-detection/src$ 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255 21 21 * 22 22 */ … … 25 25 26 26 private final MongoDBAccess mongodbAccess; 27 private final int numURLs; 28 29 //private String[] urls; 30 ArrayList<String> urlsList = new ArrayList<String>(); 27 private int numURLs; 28 private File domainsFile; 29 30 public WebPageURLsListing(MongoDBAccess mongodbAccess, 31 File domainsFile) 32 { 33 this.mongodbAccess = mongodbAccess; 34 this.domainsFile = domainsFile; 35 } 31 36 32 37 public WebPageURLsListing(MongoDBAccess mongodbAccess, … … 34 39 int numURLs) 35 40 { 36 this .mongodbAccess = mongodbAccess;41 this(mongodbAccess, domainsFile); 37 42 this.numURLs = numURLs; 43 } 38 44 45 public String produceURLsForPagesInMRI() { 46 return writeFile(MongoDBAccess.IS_MRI); 47 } 48 49 public String produceURLsForPagesContainingMRI() { 50 return writeFile(MongoDBAccess.CONTAINS_MRI); 51 } 52 53 54 public String writeFile(int filterType) { 55 56 ArrayList<String> urlsList = new ArrayList<String>(); 57 39 58 // 1. read each url from the domainsFile 40 41 42 59 // 1a. do the query 60 // 1b. add the arraylist result to urls 61 43 62 try ( 44 63 BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); … … 51 70 domain = domain.trim(); 52 71 if(!domain.equals("")) { 53 ArrayList<String> moreURLs = mongodbAccess.queryAllMatching IsMRIURLs(domain);72 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 54 73 urlsList.addAll(moreURLs); 55 74 } … … 63 82 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 64 83 File parentFolder = domainsFile.getParentFile(); 65 String fileName = domainsFile.getName();66 //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);67 File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);84 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); 85 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 86 File outFile = new File(parentFolder, fileName+domainsFile.getName()); 68 87 69 88 // write out ALL the URLs 70 89 try ( 71 Writer writer = new BufferedWriter(new FileWriter( fullSetOutFile));90 Writer writer = new BufferedWriter(new FileWriter(outFile)); 72 91 ) { 73 92 … … 78 97 } 79 98 } catch(Exception e) { 80 logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath());99 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 81 100 logger.error(e.getMessage(), e); 82 101 } … … 98 117 } 99 118 */ 119 120 return outFile.getAbsolutePath(); 100 121 } 122 101 123 102 124 public static void printUsage() { … … 107 129 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ 108 130 109 110 131 111 132 public static void main(String args[]) { … … 124 145 } 125 146 126 int genNumURLs = Integer.parseInt(args[1]);147 //int genNumURLs = Integer.parseInt(args[1]); 127 148 128 149 mongodb.connectToDB(); 129 150 130 151 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile); 152 //String isMRIFile = listing.produceURLsForPagesInMRI(); 153 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 154 mongodb.aggregateContainsMRIForOverseas(); 131 155 132 156 } catch(Exception e) {
Note:
See TracChangeset
for help on using the changeset viewer.