- Timestamp: 2020-02-05T23:36:37+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33887 r33906 26 26 import com.mongodb.Block; 27 27 28 import org.bson.BsonArray; 29 import org.bson.BsonString; 28 30 import org.bson.Document; 29 31 import org.bson.conversions.Bson; … … 217 219 .append("siteFolderName", website.siteFolderName) 218 220 .append("domain", website.domain) 221 .append("basicDomain", website.basicDomain) 219 222 .append("totalPages", website.totalPages) 220 223 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) … … 389 392 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 390 393 * 391 * 394 * Count by country code of non-NZ websites containing a positive number of sentences in MRI, 395 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 396 * and total counts of numPagesInMRI and numPagesContainingMRI across all these 397 * matching sites. 398 * 392 399 * The mongodb aggregate() we want to run this time: 393 400 * … … 405 412 $group: { 406 413 _id: "nz", 407 count: { $sum: 1 }, 408 domain: { $addToSet: '$domain'}414 count: { $sum: 1 }, 415 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 409 416 } 410 417 }, … … 431 438 match(andQuery), 432 439 unwind("$geoLocationCountryCode"), 433 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 440 group("NZ", Arrays.asList(sum("count", 1), 441 addToSet("domain", "$basicDomain"))), 434 442 sort(BasicDBObject.parse("{count : -1}")) 435 443 )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); … … 441 449 442 450 /** 451 * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI, 452 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 453 * and total counts of numPagesInMRI and numPagesContainingMRI across all these 454 * matching sites. 
455 * 443 456 * The aggregate() we want to run this time: 444 457 * … … 459 472 _id: {$toLower: '$geoLocationCountryCode'}, 460 473 count: { $sum: 1 }, 461 domain: { $addToSet: '$domain'}474 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 462 475 } 463 476 }, … … 484 497 orQuery); 485 498 486 487 499 collection.aggregate(Arrays.asList( 488 500 match(andQuery), //match(BasicDBObject.parse(matchQuery)) 489 501 // match((List<DBObject>)JSON.parse(matchQuery)), 490 502 unwind("$geoLocationCountryCode"), 491 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 503 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), 504 addToSet("domain", "$basicDomain"))), 492 505 sort(BasicDBObject.parse("{count : -1}")) 493 506 )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); … … 639 652 */ 640 653 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 654 655 // If there's a domain field in the json Doc, sort this domain listing alphabetically 656 Object domainList = doc.remove("domain"); 657 if(domainList != null) { 658 doc.put("domain", sortAlphabetically(domainList)); 659 } 660 641 661 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 642 662 // Can't control json output to add newlines after each array element, … … 655 675 // Have to use gson's pretty print to produce a json string that contains 656 676 // newlines after every array element in the json: 677 657 678 String jsonStr = prettyPrintJson(doc.toJson()); 658 679 //System.err.println(jsonStr); … … 663 684 throw new UncheckedIOException(ex); 664 685 } 686 } 687 688 private List sortAlphabetically(Object list) { 689 BsonArray domainList = (BsonArray)list; 690 //for(String domain : domainList) { 691 for(int i = domainList.size() - 1; i >= 0; i--) { 692 BsonString domain = domainList.get(i).asString(); 693 String domainStr = 
Utility.stripProtocolAndWWWFromURL(domain.toString()); 694 domainList.set(i, new BsonString(domainStr)); 695 } 696 697 return domainList; 665 698 } 666 699
Note: See TracChangeset for help on using the changeset viewer.