Changeset 33880 for other-projects
- Timestamp:
- 2020-01-30T21:17:40+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33879 r33880 17 17 import static com.mongodb.client.model.Accumulators.*; 18 18 19 19 20 //import org.bson.conversions.Bson; 20 21 import com.mongodb.BasicDBObject; … … 28 29 import org.bson.Document; 29 30 import org.bson.conversions.Bson; 31 import org.bson.json.JsonMode; 32 import org.bson.json.JsonWriterSettings; 30 33 31 34 import com.mongodb.util.JSON; 32 35 //import com.mongodb.DBObject; 36 37 38 import com.google.gson.*; // for pretty printing 33 39 34 40 import java.io.BufferedReader; 35 41 import java.io.File; 36 42 import java.io.FileReader; 43 import java.io.IOException; 44 import java.io.Writer; 45 37 46 import java.util.Arrays; 38 47 import java.util.ArrayList; … … 79 88 public static final String WEBPAGES_COLLECTION = "Webpages"; 80 89 public static final String WEBSITES_COLLECTION = "Websites"; 90 91 public static final String NEWLINE = System.getProperty("line.separator"); 81 92 82 93 /** mongodb filter types to execute */ … … 351 362 } 352 363 364 /** 365 The mongodb aggregate() we want to run this time: 366 367 db.Websites.aggregate([ 368 { 369 $match: { 370 $and: [ 371 {numPagesContainingMRI: {$gt: 0}}, 372 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 373 ] 374 } 375 }, 376 { $unwind: "$geoLocationCountryCode" }, 377 { 378 $group: { 379 _id: "nz", 380 count: { $sum: 1 }, 381 domain: { $addToSet: '$domain' } 382 } 383 }, 384 { $sort : { count : -1} } 385 ]); 386 */ 387 public void aggregateContainsMRIForNZ(Writer writer) throws IOException { 388 // working with the WebSites collection, not WebPages collection! 
389 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 390 391 392 //String isMRI_filter = 393 394 Bson orQuery = or( 395 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 396 BasicDBObject.parse("{domain: /\\.nz/}") 397 ); 398 Bson andQuery = and( 399 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 400 orQuery); 401 402 AggregateIterable<Document> output 403 = collection.aggregate(Arrays.asList( 404 match(andQuery), 405 unwind("$geoLocationCountryCode"), 406 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 407 sort(BasicDBObject.parse("{count : -1}")) 408 )); 409 410 // should only have one doc 411 for (Document doc : output) { 412 //System.out.println(doc); 413 System.out.println(doc.toJson()); 414 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 415 //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 416 /* 417 JsonWriterSettings writeSettings = new JsonWriterSettings(); 418 writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 419 writer.write(doc.toJson(writeSettings) + NEWLINE); 420 */ 421 writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); 422 } 423 424 return; 425 } 426 353 427 /** 354 428 RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: … … 387 461 ]); 388 462 463 464 https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line 389 465 */ 390 public String aggregateContainsMRIForOverseas(){466 public void aggregateContainsMRIForOverseas(Writer writer) throws IOException { 391 467 // working with the WebSites collection, not WebPages collection! 
392 468 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); … … 425 501 //System.out.println(doc); 426 502 System.out.println(doc.toJson()); 427 } 428 429 return ""; 430 } 431 432 433 /** 434 The mongodb aggregate() we want to run this time: 435 436 db.Websites.aggregate([ 437 { 438 $match: { 439 $and: [ 440 {numPagesContainingMRI: {$gt: 0}}, 441 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]} 442 ] 443 } 444 }, 445 { $unwind: "$geoLocationCountryCode" }, 446 { 447 $group: { 448 _id: "nz", 449 count: { $sum: 1 }, 450 domain: { $addToSet: '$domain' } 451 } 452 }, 453 { $sort : { count : -1} } 454 ]); 455 */ 456 public String aggregateContainsMRIForNZ() { 457 // working with the WebSites collection, not WebPages collection! 458 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 459 460 461 //String isMRI_filter = 462 463 Bson orQuery = or( 464 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 465 BasicDBObject.parse("{domain: /\\.nz/}") 466 ); 467 Bson andQuery = and( 468 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 469 orQuery); 470 471 AggregateIterable<Document> output 472 = collection.aggregate(Arrays.asList( 473 match(andQuery), 474 unwind("$geoLocationCountryCode"), 475 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 476 sort(BasicDBObject.parse("{count : -1}")) 477 )); 478 479 // should only have one doc 480 for (Document doc : output) { 481 //System.out.println(doc); 482 System.out.println(doc.toJson()); 483 } 484 485 return ""; 486 } 503 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 504 //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 505 /* 506 JsonWriterSettings writeSettings = new JsonWriterSettings(); 507 writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 508 writer.write(doc.toJson(writeSettings) 
+ NEWLINE); 509 */ 510 writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); 511 } 512 513 return; 514 } 515 516 517 public String prettyPrintJson(String jsonStr) { 518 Gson gson = new GsonBuilder().setPrettyPrinting().create(); 519 JsonParser jp = new JsonParser(); 520 JsonElement je = jp.parse(jsonStr); 521 String prettyJsonString = gson.toJson(je); 522 return prettyJsonString; 523 } 524 487 525 488 526 public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) { -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33879 r33880 27 27 private int numURLs; 28 28 private File domainsFile; 29 30 29 31 30 32 public WebPageURLsListing(MongoDBAccess mongodbAccess, … … 120 122 return outFile.getAbsolutePath(); 121 123 } 124 125 /* ---------------------------------------- */ 126 127 /** 128 * Create the file 129 * @return full path of file generated 130 */ 131 public String writeTentativeNonAutotranslatedSites() { 132 File outFolder = new File("../mongodb-data/").getAbsoluteFile(); 133 File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json"); 134 135 String filename = outFile.getAbsolutePath(); 136 137 try ( 138 Writer writer = new BufferedWriter(new FileWriter(outFile)); 139 ) { 140 // first write out NZ sites and .nz TLD count and domains 141 mongodbAccess.aggregateContainsMRIForNZ(writer); 142 // next write out all overseas sites and .nz TLD count and domains 143 mongodbAccess.aggregateContainsMRIForOverseas(writer); 144 145 filename = outFile.getCanonicalPath(); 146 } catch(Exception e) { 147 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 148 logger.error(e.getMessage(), e); 149 } 150 151 return filename; 152 } 122 153 123 154 … … 152 183 //String isMRIFile = listing.produceURLsForPagesInMRI(); 153 184 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 154 mongodb.aggregateContainsMRIForNZ(); 155 mongodb.aggregateContainsMRIForOverseas(); 185 String filename = listing.writeTentativeNonAutotranslatedSites(); 186 System.err.println("Check file: " + filename); 187 156 188 157 189 } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.