Changeset 33906 for other-projects
- Timestamp:
- 2020-02-05T23:36:37+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33887 r33906 26 26 import com.mongodb.Block; 27 27 28 import org.bson.BsonArray; 29 import org.bson.BsonString; 28 30 import org.bson.Document; 29 31 import org.bson.conversions.Bson; … … 217 219 .append("siteFolderName", website.siteFolderName) 218 220 .append("domain", website.domain) 221 .append("basicDomain", website.basicDomain) 219 222 .append("totalPages", website.totalPages) 220 223 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText) … … 389 392 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 390 393 * 391 * 394 * Count by country code of non-NZ websites containing a positive number of sentences in MRI, 395 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 396 * and total counts of numPagesInMRI and numPagesContainingMRI across all these 397 * matching sites. 398 * 392 399 * The mongodb aggregate() we want to run this time: 393 400 * … … 405 412 $group: { 406 413 _id: "nz", 407 count: { $sum: 1 }, 408 domain: { $addToSet: '$domain'}414 count: { $sum: 1 }, 415 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 409 416 } 410 417 }, … … 431 438 match(andQuery), 432 439 unwind("$geoLocationCountryCode"), 433 group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 440 group("NZ", Arrays.asList(sum("count", 1), 441 addToSet("domain", "$basicDomain"))), 434 442 sort(BasicDBObject.parse("{count : -1}")) 435 443 )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); … … 441 449 442 450 /** 451 * Count of NZ (incl .nz TLD) websites containing a positive number of sentences in MRI, 452 * listing all the base domain strings (no protocol or www) in ALPHABETICAL ORDER 453 * and total counts of numPagesInMRI and numPagesContainingMRI across all these 454 * matching sites. 
455 * 443 456 * The aggregate() we want to run this time: 444 457 * … … 459 472 _id: {$toLower: '$geoLocationCountryCode'}, 460 473 count: { $sum: 1 }, 461 domain: { $addToSet: '$domain'}474 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" } 462 475 } 463 476 }, … … 484 497 orQuery); 485 498 486 487 499 collection.aggregate(Arrays.asList( 488 500 match(andQuery), //match(BasicDBObject.parse(matchQuery)) 489 501 // match((List<DBObject>)JSON.parse(matchQuery)), 490 502 unwind("$geoLocationCountryCode"), 491 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 503 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), 504 addToSet("domain", "$basicDomain"))), 492 505 sort(BasicDBObject.parse("{count : -1}")) 493 506 )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); … … 639 652 */ 640 653 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 654 655 // If there's a domain field in the json Doc, sort this domain listing alphabetically 656 Object domainList = doc.remove("domain"); 657 if(domainList != null) { 658 doc.put("domain", sortAlphabetically(domainList)); 659 } 660 641 661 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 642 662 // Can't control json output to add newlines after each array element, … … 655 675 // Have to use gson's pretty print to produce a json string that contains 656 676 // newlines after every array element in the json: 677 657 678 String jsonStr = prettyPrintJson(doc.toJson()); 658 679 //System.err.println(jsonStr); … … 663 684 throw new UncheckedIOException(ex); 664 685 } 686 } 687 688 private List sortAlphabetically(Object list) { 689 BsonArray domainList = (BsonArray)list; 690 //for(String domain : domainList) { 691 for(int i = domainList.size() - 1; i >= 0; i--) { 692 BsonString domain = domainList.get(i).asString(); 693 String domainStr = 
Utility.stripProtocolAndWWWFromURL(domain.toString()); 694 domainList.set(i, new BsonString(domainStr)); 695 } 696 697 return domainList; 665 698 } 666 699 -
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33811 r33906 75 75 private String geoLocationCountryCode = null; /** 2 letter country code */ 76 76 private boolean urlContainsLangCodeInPath = false; /** If any URL on this site contains a /mi(/) or http(s)://mi.* in its URL path */ 77 77 78 78 private String domainOfSite; 79 private String baseSiteDomain; // domainOfSite stripped of any http(s)://www. 79 80 private int numPagesInMRI = 0; 80 81 private int numPagesContainingMRI = 0; … … 202 203 String url = firstPage.getPageURL(); 203 204 this.domainOfSite = Utility.getDomainForURL(url, true); 205 this.baseSiteDomain = Utility.stripProtocolAndWWWFromURL(this.domainOfSite); 204 206 } 205 207 else { 206 208 this.domainOfSite = "UNKNOWN"; 209 this.baseSiteDomain = "UNKNOWN"; 207 210 } 208 211 … … 339 342 int totalPages = pages.size(); 340 343 341 WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite, 344 WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, 345 this.domainOfSite, this.baseSiteDomain, 342 346 totalPages, this.countOfWebPagesWithBodyText, 343 347 this.numPagesInMRI, this.numPagesContainingMRI, -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33887 r33906 57 57 58 58 public void produceURLsForPagesInMRI(File domainsFile) { 59 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.IS_MRI, domainsFile);59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 60 60 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 61 61 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 66 66 67 67 public void produceURLsForPagesContainingMRI(File domainsFile) { 68 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile); 69 69 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 70 70 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 74 74 } 75 75 76 private ArrayList<Tuple> getURLsFor WebPages(int filterType, File domainsFile) {76 private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) { 77 77 ArrayList<Tuple> urlsList = new ArrayList<Tuple>(); 78 78 … … 120 120 } 121 121 122 /** Given a hand curated list of NZ sites with positive numPagesContainingMRI, 123 * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?). 124 * Total all these pages in MRI (N), then work out the correct sample size (n) 122 /** Given a hand curated list of all sites with positive numPagesContainingMRI 123 * determined by manual inspection, get a listing of all their web pages that 124 * are IN_MRI (or CONTAINS_MRI?). 125 * Total all these pages that are inMRI (N), then work out the correct sample size (n) 125 126 * at 90% confidence with 5% margin of error. Then generate a random listing 126 127 * of n of these pages in MRI of these trusted sites and output to a file 127 * for manual inspection . */128 * for manual inspection of the sample webpage URLs at page-level. 
*/ 128 129 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing 129 130 * of all their web pages IN_MRI (or CONTAINS_MRI). … … 138 139 139 140 // 0. get a list of all the web pages in the given domain listing where isMRI = true 140 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.IS_MRI, domainsFile);141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 141 142 // produceURLsForPagesInMRI(domainsFile); 142 143 -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java
r33811 r33906 9 9 public final String siteFolderName; 10 10 public final String domain; 11 public final String basicDomain; // domain without protocol and www. prefix 11 12 12 13 public final int totalPages; … … 23 24 public final boolean urlContainsLangCodeInPath; 24 25 25 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 26 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, 27 String domainOfSite, String baseSiteDomain, 26 28 int totalPages, int countOfWebPagesWithBodyText, 27 29 int numPagesInMRI, int numPagesContainingMRI, … … 32 34 this.siteFolderName = siteFolderName; 33 35 this.domain = domainOfSite; 36 this.basicDomain = baseSiteDomain; 34 37 35 38 this.totalPages = totalPages;
Note: See TracChangeset for help on using the changeset viewer.