- Timestamp:
- 2020-02-12T19:02:44+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33906 r33909 28 28 import org.bson.BsonArray; 29 29 import org.bson.BsonString; 30 import org.bson.BsonValue; 30 31 import org.bson.Document; 31 32 import org.bson.conversions.Bson; … … 52 53 import java.util.List; 53 54 import java.util.Properties; 55 import java.util.TreeSet; 54 56 import java.util.regex.Pattern; 55 57 … … 98 100 public static final String WEBSITES_COLLECTION = "Websites"; 99 101 100 public static final String NEWLINE = System.getProperty("line.separator"); 101 102 /** mongodb filter types to execute */ 103 public static final int IS_MRI = 0; 104 public static final int CONTAINS_MRI = 1; 105 106 /** Some reused fieldnames in the Websites collection */ 107 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI"; 108 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI"; 109 102 110 103 // configuration details, some with fallback values 111 private String HOST = "localhost";112 private int PORT = 27017; // mongodb port113 private String USERNAME;114 private String PASSWORD;115 private String DB_NAME ="ateacrawldata";116 117 private MongoClient mongo = null;118 private MongoDatabase database = null;104 protected String HOST = "localhost"; 105 protected int PORT = 27017; // mongodb port 106 protected String USERNAME; 107 protected String PASSWORD; 108 protected String DB_NAME ="ateacrawldata"; 109 110 protected MongoClient mongo = null; 111 protected MongoDatabase database = null; 119 112 120 113 /** … … 210 203 System.err.println("coll: " + coll); 211 204 } 205 } 206 207 protected MongoCollection<Document> getWebpagesCollection() { 208 return this.database.getCollection(WEBPAGES_COLLECTION); 209 } 210 protected MongoCollection<Document> getWebsitesCollection() { 211 return this.database.getCollection(WEBSITES_COLLECTION); 212 212 } 213 213 … … 219 219 .append("siteFolderName", website.siteFolderName) 220 220 .append("domain", website.domain) 221 .append("basicDomain", website.basicDomain)222 221 
.append("totalPages", website.totalPages) 223 222 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) … … 303 302 } 304 303 */ 305 306 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {307 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);308 }309 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {310 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);311 }312 313 /**314 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/315 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html316 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection317 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find318 *319 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java320 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria321 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java322 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/323 */324 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {325 326 final ArrayList<String> urlsList = new ArrayList<String>();327 328 // remove any http(s)://(www.) 
from the start of URL first329 // since it goes into a regex330 domain = Utility.stripProtocolAndWWWFromURL(domain);331 332 // load the "webpages" db table333 // in mongodb, the equivalent of db tables are called 'collections'334 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);335 336 // code we'll execute in Iterable.forEach() below337 // see also https://www.baeldung.com/foreach-java338 Block<Document> storeURL = new Block<Document>() {339 @Override340 public void apply(final Document document) {341 //System.out.println(document.toJson());342 String url = document.getString("URL");343 // add to our urlsList344 //System.out.println(url);345 urlsList.add(url);346 }347 };348 349 350 // Run the following mongodb query:351 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})352 353 // 1. One way that works:354 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);355 356 // 2. 
Another way:357 //String query = "{URL: /DOMAIN/, isMRI: true}";358 String query = "{URL: /DOMAIN/, ";359 if(filterType == IS_MRI) {360 query += "isMRI: true}";361 } else if(filterType == CONTAINS_MRI) {362 query += "containsMRI: true}";363 }364 365 domain = domain.replace(".", "\\."); // escape dots in domain for regex366 query = query.replace("DOMAIN", domain);367 368 //System.err.println("Executing find query: " + query);369 370 BasicDBObject findObj = BasicDBObject.parse(query);371 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");372 373 374 collection.find(findObj).projection(projectionObj).forEach(storeURL);375 376 return urlsList;377 }378 379 /**380 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:381 *382 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver383 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria384 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo385 *386 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)387 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates388 * On using group(TExpression) inside collection.aggregate().389 *390 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java391 * and https://www.javatpoint.com/java-8-foreach392 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java393 *394 * Count by country code of non-NZ websites containing a positive number of sentences in MRI,395 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER396 * and total counts of numPagesInMRI and numPagesContainingMRI across all these397 * matching sites.398 *399 * The mongodb aggregate() we want to run this time:400 *401 db.Websites.aggregate([402 {403 $match: {404 $and: [405 
{numPagesContainingMRI: {$gt: 0}},406 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}407 ]408 }409 },410 { $unwind: "$geoLocationCountryCode" },411 {412 $group: {413 _id: "nz",414 count: { $sum: 1 },415 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }416 }417 },418 { $sort : { count : -1} }419 ]);420 */421 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {422 // working with the WebSites collection, not WebPages collection!423 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);424 425 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";426 427 Bson orQuery = or(428 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),429 BasicDBObject.parse("{domain: /\\.nz/}")430 );431 Bson andQuery = and(432 BasicDBObject.parse(mriFilterString),433 orQuery);434 435 // Hopefully the lambda expression (forEach()) at end means436 // we write out each result Document as we get it437 collection.aggregate(Arrays.asList(438 match(andQuery),439 unwind("$geoLocationCountryCode"),440 group("NZ", Arrays.asList(sum("count", 1),441 addToSet("domain", "$basicDomain"))),442 sort(BasicDBObject.parse("{count : -1}"))443 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));444 445 // should only have one doc for NZ since it's a count by geolocation.446 447 return;448 }449 450 /**451 * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI,452 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER453 * and total counts of numPagesInMRI and numPagesContainingMRI across all these454 * matching sites.455 *456 * The aggregate() we want to run this time:457 *458 db.Websites.aggregate([459 {460 $match: {461 $and: [462 {geoLocationCountryCode: {$ne: "NZ"}},463 {domain: {$not: /\.nz/}},464 {numPagesContainingMRI: {$gt: 0}},465 {$or: [{geoLocationCountryCode: "AU"}, 
{urlContainsLangCodeInPath: false}]}466 ]467 }468 },469 { $unwind: "$geoLocationCountryCode" },470 {471 $group: {472 _id: {$toLower: '$geoLocationCountryCode'},473 count: { $sum: 1 },474 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }475 }476 },477 { $sort : { count : -1} }478 ]);479 */480 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,481 boolean isMiInURLPath) throws UncheckedIOException482 {483 // working with the WebSites collection, not WebPages collection!484 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);485 486 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";487 488 Bson orQuery = or(489 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),490 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")491 // e.g. "{urlContainsLangCodeInPath: false}"492 );493 Bson andQuery = and(494 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),495 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),496 BasicDBObject.parse(mriFilterString),497 orQuery);498 499 collection.aggregate(Arrays.asList(500 match(andQuery), //match(BasicDBObject.parse(matchQuery))501 // match((List<DBObject>)JSON.parse(matchQuery)),502 unwind("$geoLocationCountryCode"),503 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),504 addToSet("domain", "$basicDomain"))),505 sort(BasicDBObject.parse("{count : -1}"))506 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));507 508 // casting to Block<Document> necessary because otherwise we see the error at509 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java510 511 // Less efficient way is to keep all the results in memory and then512 // write them out one at a time513 /*514 AggregateIterable<Document> output515 = collection.aggregate(Arrays.asList(516 match(andQuery), 
//match(BasicDBObject.parse(matchQuery))517 // match((List<DBObject>)JSON.parse(matchQuery)),518 unwind("$geoLocationCountryCode"),519 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),520 sort(BasicDBObject.parse("{count : -1}"))521 ));522 523 524 for (Document doc : output) {525 //System.out.println(doc);526 System.out.println(doc.toJson());527 528 }529 */530 return;531 }532 533 /** Do the aggregates for writing out tables.534 Table1:535 536 */537 public void writeTables(File outFolder) {538 // In this function, we're always dealing with the Websites mongodb collection.539 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);540 541 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};542 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {543 File outFile = new File(outFolder, tableNames[tableNum] + ".json");544 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");545 try (546 Writer writer = new BufferedWriter(new FileWriter(outFile));547 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);548 ) {549 550 // Write out the CSV column headings551 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html552 csvWriter.printRecord("countryCode", "siteCount",553 "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/);554 555 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));556 557 int docNum = 0;558 for (Document doc : output) {559 //System.out.println(doc);560 writeDocAsJsonRecord(++docNum, doc, writer);561 writeDocAsCSVRecord(++docNum, doc, csvWriter);562 }563 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");564 } catch(UncheckedIOException ioe) {565 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);566 
}567 catch(Exception e) {568 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);569 }570 }571 }572 573 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {574 575 AggregateIterable<Document> output = null;576 577 switch(tableNum) {578 579 case 1:580 /* 1table_allCrawledSites -581 582 db.Websites.aggregate([583 { $unwind: "$geoLocationCountryCode" },584 {585 $group: {586 _id: "$geoLocationCountryCode",587 count: { $sum: 1 },588 //domain: { $addToSet: '$domain' },589 numPagesInMRICount: { $sum: '$numPagesInMRI' },590 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }591 }592 },593 { $sort : { count : -1} }594 ]);595 */596 output = collection.aggregate(Arrays.asList(597 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),598 unwind("$geoLocationCountryCode"),599 group("$geoLocationCountryCode", Arrays.asList(600 sum("count", 1),601 /*addToSet("domain", "$domain"),*/602 sum("numPagesInMRICount", "$numPagesInMRI"),603 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),604 sort(BasicDBObject.parse("{count : -1}"))605 ));606 break;607 608 case 2:609 /*610 db.Websites.aggregate([611 { $match: { numPagesInMRI: {$gt: 0} } },612 { $unwind: "$geoLocationCountryCode" },613 {614 $group: {615 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower616 count: { $sum: 1 },617 //domain: { $addToSet: '$domain' },618 numPagesInMRICount: { $sum: '$numPagesInMRI' },619 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }620 }621 },622 { $sort : { count : -1} }623 ]);624 */625 output = collection.aggregate(Arrays.asList(626 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),627 unwind("$geoLocationCountryCode"),628 group("$geoLocationCountryCode", Arrays.asList(629 sum("count", 1),630 /*addToSet("domain", "$domain"),*/631 sum("numPagesInMRICount", "$numPagesInMRI"),632 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),633 
sort(BasicDBObject.parse("{count : -1}"))634 ));635 break;636 637 default: logger.error("Unknown table number: " + tableNum);638 639 }640 641 return output;642 643 }644 645 646 647 /**648 * called by lambda forEach() call on Document objects to write them out to a file.649 * Have to deal with unreported exceptions here that can't be dealt with when doing650 * the actual forEach(). See651 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach652 */653 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {654 655 // If there's a domain field in the json Doc, sort this domain listing alphabetically656 Object domainList = doc.remove("domain");657 if(domainList != null) {658 doc.put("domain", sortAlphabetically(domainList));659 }660 661 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);662 // Can't control json output to add newlines after each array element,663 // no matter which JsonMode is used.664 665 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html666 // Still can't control array element output,667 // but this way uses newer mongo java driver 3.9(.1). 
Tried its various JsonModes too:668 //JsonWriterSettings writeSettings = new JsonWriterSettings();669 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();670 //writer.write(doc.toJson(writeSettings) + NEWLINE);671 672 // Not the JsonWriter of mongodb java driver:673 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line674 675 // Have to use gson's pretty print to produce a json string that contains676 // newlines after every array element in the json:677 678 String jsonStr = prettyPrintJson(doc.toJson());679 //System.err.println(jsonStr);680 try {681 writer.write(jsonStr + NEWLINE);682 } catch (IOException ex) {683 //throw ex;684 throw new UncheckedIOException(ex);685 }686 }687 688 private List sortAlphabetically(Object list) {689 BsonArray domainList = (BsonArray)list;690 //for(String domain : domainList) {691 for(int i = domainList.size() - 1; i >= 0; i--) {692 BsonString domain = domainList.get(i).asString();693 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());694 domainList.set(i, new BsonString(domainStr));695 }696 697 return domainList;698 }699 700 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {701 String jsonStr = prettyPrintJson(doc.toJson());702 //System.err.println(jsonStr);703 try {704 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);705 } catch (IOException ex) {706 //throw ex;707 throw new UncheckedIOException(ex);708 }709 }710 711 // TODO712 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {713 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {714 String jsonStr = doc.toJson();715 JsonParser parser = new JsonParser();716 JsonElement json = parser.parse(jsonStr);717 718 JsonObject jsonObj = (JsonObject)json;719 720 String countryCode = jsonObj.get("_id").getAsString();721 int siteCount = 
jsonObj.get("count").getAsInt();722 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();723 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();724 725 //System.err.println(jsonStr);726 try {727 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);728 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);729 } catch (IOException ex) {730 //throw ex;731 throw new UncheckedIOException(ex);732 }733 }734 735 public String prettyPrintJson(String jsonStr) {736 Gson gson = new GsonBuilder().setPrettyPrinting().create();737 JsonParser jp = new JsonParser();738 JsonElement je = jp.parse(jsonStr);739 String prettyJsonString = gson.toJson(je);740 return prettyJsonString;741 }742 304 743 305
Note: See TracChangeset for help on using the changeset viewer.