Changeset 33885 for other-projects
- Timestamp: 2020-01-31T22:54:15+13:00 (4 years ago)
- Location: other-projects/maori-lang-detection/src/org/greenstone/atea
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33883 r33885 39 39 40 40 import java.io.BufferedReader; 41 import java.io.BufferedWriter; 41 42 import java.io.File; 42 43 import java.io.FileReader; 44 import java.io.FileWriter; 43 45 import java.io.IOException; 44 46 import java.io.UncheckedIOException; … … 512 514 } 513 515 516 /** Do the aggregates for writing out tables. 517 Table1: 518 db.Websites.aggregate([ 519 520 { $unwind: "$geoLocationCountryCode" }, 521 { 522 $group: { 523 _id: "$geoLocationCountryCode", 524 count: { $sum: 1 }, 525 //domain: { $addToSet: '$domain' }, 526 numPagesInMRICount: { $sum: '$numPagesInMRI' }, 527 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' } 528 } 529 }, 530 { $sort : { count : -1} } 531 ]); 532 */ 533 public void writeTables(File outFolder) { 534 // In this function, we're always dealing with the Websites mongodb collection. 535 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 536 537 // table 1 538 File outFile = new File(outFolder, "1table_allCrawledSites.csv"); 539 try ( 540 Writer writer = new BufferedWriter(new FileWriter(outFile)); 541 ) { 542 543 544 AggregateIterable<Document> output = collection.aggregate(Arrays.asList( 545 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")), 546 unwind("$geoLocationCountryCode"), 547 group("$geoLocationCountryCode", Arrays.asList( 548 sum("count", 1), 549 /*addToSet("domain", "$domain"),*/ 550 sum("numPagesInMRICount", "$numPagesInMRI"), 551 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))), 552 sort(BasicDBObject.parse("{count : -1}")) 553 ));//.forEach((Block<Document>)doc -> writeDoc(doc, writer)); 554 555 int docNum = 0; 556 for (Document doc : output) { 557 //System.out.println(doc); 558 writeDoc(++docNum, doc, writer); 559 } 560 561 562 } catch(UncheckedIOException ioe) { 563 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe); 564 } 565 catch(Exception e) { 566 logger.error("Could not write table to file " + outFile, e); 567 
} 568 } 569 570 public void doTable1() { 571 572 } 573 574 575 514 576 /** 515 577 * called by lambda forEach() call on Document objects to write them out to a file. … … 517 579 * the actual forEach(). See 518 580 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach 519 */ 520 581 */ 521 582 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 522 583 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); … … 545 606 } 546 607 } 608 public void writeDoc(int docNum, Document doc, Writer writer) throws UncheckedIOException { 609 String jsonStr = prettyPrintJson(doc.toJson()); 610 //System.err.println(jsonStr); 611 try { 612 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE); 613 } catch (IOException ex) { 614 //throw ex; 615 throw new UncheckedIOException(ex); 616 } 617 } 618 619 547 620 public String prettyPrintJson(String jsonStr) { 548 621 Gson gson = new GsonBuilder().setPrettyPrinting().create(); -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33884 r33885 20 20 * TO RUN: 21 21 * maori-lang-detection/src$ 22 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing 23 * OR: 22 24 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 23 25 * … … 344 346 345 347 // TODO: generate the tables 348 349 mongodb.writeTables(outFolder); 346 350 } 347 351
Note: See TracChangeset for help on using the changeset viewer.