- Timestamp:
- 2020-01-31T23:49:11+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33885 r33887 17 17 import static com.mongodb.client.model.Accumulators.*; 18 18 19 20 19 //import org.bson.conversions.Bson; 21 20 import com.mongodb.BasicDBObject; … … 57 56 import org.greenstone.atea.morphia.*; 58 57 import dev.morphia.*; 58 59 import org.apache.commons.csv.*; 59 60 60 61 /** … … 83 84 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1 84 85 * 86 * API: 87 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find-- 88 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/ 85 89 */ 86 90 public class MongoDBAccess implements AutoCloseable { … … 516 520 /** Do the aggregates for writing out tables. 517 521 Table1: 518 db.Websites.aggregate([519 522 520 { $unwind: "$geoLocationCountryCode" },521 {522 $group: {523 _id: "$geoLocationCountryCode",524 count: { $sum: 1 },525 //domain: { $addToSet: '$domain' },526 numPagesInMRICount: { $sum: '$numPagesInMRI' },527 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }528 }529 },530 { $sort : { count : -1} }531 ]);532 523 */ 533 524 public void writeTables(File outFolder) { … … 535 526 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 536 527 537 // table 1 538 File outFile = new File(outFolder, "1table_allCrawledSites.csv"); 539 try ( 540 Writer writer = new BufferedWriter(new FileWriter(outFile)); 541 ) { 528 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"}; 529 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) { 530 File outFile = new File(outFolder, tableNames[tableNum] + ".json"); 531 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv"); 532 try ( 533 Writer writer = new BufferedWriter(new FileWriter(outFile)); 534 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT); 535 ) { 536 537 // Write out the CSV column headings 538 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html 539 csvWriter.printRecord("countryCode", "siteCount", 540 "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/); 541 542 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer)); 543 544 int docNum = 0; 545 for (Document doc : output) { 546 //System.out.println(doc); 547 writeDocAsJsonRecord(++docNum, doc, writer); 548 writeDocAsCSVRecord(++docNum, doc, csvWriter); 549 } 550 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv"); 551 } catch(UncheckedIOException ioe) { 552 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe); 553 } 554 catch(Exception e) { 555 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e); 556 } 557 } 558 } 559 560 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) { 561 562 AggregateIterable<Document> output = null; 563 564 switch(tableNum) { 542 565 543 544 AggregateIterable<Document> output = collection.aggregate(Arrays.asList( 566 case 1: 567 /* 1table_allCrawledSites - 568 569 db.Websites.aggregate([ 570 { $unwind: "$geoLocationCountryCode" }, 571 { 572 $group: { 573 _id: "$geoLocationCountryCode", 574 count: { $sum: 1 }, 575 //domain: { $addToSet: '$domain' }, 576 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 577 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 578 } 579 }, 580 { $sort : { count : -1} } 581 ]); 582 */ 583 output = collection.aggregate(Arrays.asList( 545 584 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")), 546 585 unwind("$geoLocationCountryCode"), … … 550 589 sum("numPagesInMRICount", "$numPagesInMRI"), 551 590 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 552 sort(BasicDBObject.parse("{count : -1}")) 553 ));//.forEach((Block<Document>)doc -> writeDoc(doc, writer)); 554 555 int docNum = 0; 556 for (Document doc : output) { 557 //System.out.println(doc); 558 writeDoc(++docNum, doc, writer); 559 } 591 sort(BasicDBObject.parse("{count : -1}")) 592 )); 593 break; 594 595 case 2: 596 /* 597 db.Websites.aggregate([ 598 { $match: { numPagesInMRI: {$gt: 0} } }, 599 { $unwind: "$geoLocationCountryCode" }, 600 { 601 $group: { 602 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower 603 count: { $sum: 1 }, 604 //domain: { $addToSet: '$domain' }, 605 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 606 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 607 } 608 }, 609 { $sort : { count : -1} } 610 ]); 611 */ 612 output = collection.aggregate(Arrays.asList( 613 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")), 614 unwind("$geoLocationCountryCode"), 615 group("$geoLocationCountryCode", Arrays.asList( 616 sum("count", 1), 617 /*addToSet("domain", "$domain"),*/ 618 sum("numPagesInMRICount", "$numPagesInMRI"), 619 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 620 sort(BasicDBObject.parse("{count : -1}")) 621 )); 622 break; 560 623 624 default: logger.error("Unknown table number: " + tableNum); 561 625 562 } catch(UncheckedIOException ioe) { 563 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe); 564 } 565 catch(Exception e) { 566 logger.error("Could not write table to file " + outFile, e); 567 } 568 } 569 570 public void doTable1() { 626 } 627 628 return output; 571 629 572 630 } … … 606 664 } 607 665 } 608 public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException { 666 667 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException { 609 668 String jsonStr = prettyPrintJson(doc.toJson()); 610 669 //System.err.println(jsonStr); … … 616 675 } 617 676 } 618 677 678 // TODO 679 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException { 680 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException { 681 String jsonStr = doc.toJson(); 682 JsonParser parser = new JsonParser(); 683 JsonElement json = parser.parse(jsonStr); 684 685 JsonObject jsonObj = (JsonObject)json; 686 687 String countryCode = jsonObj.get("_id").getAsString(); 688 int siteCount = jsonObj.get("count").getAsInt(); 689 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt(); 690 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt(); 691 692 //System.err.println(jsonStr); 693 try { 694 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE); 695 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount); 696 } catch (IOException ex) { 697 //throw ex; 698 throw new UncheckedIOException(ex); 699 } 700 } 619 701 620 702 public String prettyPrintJson(String jsonStr) {
Note:
See TracChangeset
for help on using the changeset viewer.