Changeset 33919
- Timestamp:
- 2020-02-13T22:40:41+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection
- Files:
-
- 2 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/MoreReading/mongodb.txt
r33914 r33919 1638 1638 UNKNOWN: 1639 1639 X gained hitiaotera.com from IL 1640 (and lost viveipcl.com to CZ) 1640 1641 1641 1642 IL: -
other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java
r33869 r33919 7 7 import java.io.FileWriter; 8 8 import java.io.Writer; 9 10 import java.net.URLEncoder; 9 11 10 12 import java.util.HashMap; … … 33 35 import mil.nga.sf.geojson.Position; 34 36 37 38 import org.greenstone.util.SafeProcess; 35 39 36 40 /** … … 105 109 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0 106 110 */ 107 public class CountryCodeCountsMapData { 111 public class CountryCodeCountsMapData { 112 108 113 static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName()); 114 115 static public final String GEOJSON_MAP_TOOL_URL = "http://geojson.io/"; //"http://geojson.tools/"; 116 static private final String DATA_STR = "#data=data:application/json,"; 117 118 // "http://geojson.io" has a URL API to programmatically access 119 /* 120 See http://geojson.io/ -> Help 121 122 "I'm a coder 123 124 geojson.io has an array of cli tools that make it easy to go from a GeoJSON file on your computer to geojson.io." 125 126 http://geojson.io/#geojson-io-api 127 "Geojson.io API 128 129 You can interact with geojson.io programmatically in two ways: 130 131 => URL parameters 132 Browser console" 133 134 http://geojson.io/#url-api 135 "data=data:application/json, 136 137 Open the map and load a chunk of GeoJSON data from a URL segment directly onto the map. 138 The GeoJSON data should be encoded as per encodeURIComponent(JSON.stringify(geojson_data)). 139 Example: 140 141 http://geojson.io/#data=data:application/json,%7B%22type%22%3A%22LineString%22%2C%22coordinates%22%3A%5B%5B0%2C0%5D%2C%5B10%2C10%5D%5D%7D 142 */ 143 144 public static final int SUPPRESS_MAPDATA_DISPLAY = 0; 145 public static final int PRINT_MAPDATA_TO_SCREEN = 1; 109 146 110 147 //Map<String, JsonObject> countryToJsonMap; … … 121 158 private final String geoJsonFilenameWithSuffix; 122 159 private final File outputFolder; 160 161 123 162 124 163 public CountryCodeCountsMapData(String countryCountsJSONFilename) throws Exception { … … 185 224 logger.info("No geolocation info found for country code " + countryCode); 186 225 if(countryCode.equals("EU")) { 226 logger.info(" Adding lat,lng for somewhere around Europe"); 187 227 //logger.info("Unlisted country code: EU"); 188 228 // add lat and lng for Europe … … 192 232 } 193 233 else if(countryCode.equals("UNKNOWN")) { 234 logger.info(" Adding lat,lng for somewhere in Antarctica"); 194 235 //logger.info("Unlisted country code: UNKNOWN"); 195 236 // add lat and lng for Antarctica … … 515 556 } 516 557 558 // by default, display mapdata output on screen too 517 559 public String writeMultiPointGeoJsonToFile() { 560 return writeMultiPointGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN); 561 } 562 public String writeMultiPointGeoJsonToFile(int displayMapData) { 518 563 final String filename = "multipoint_" + this.geoJsonFilenameWithSuffix; 519 564 File outFile = new File(this.outputFolder, filename); … … 521 566 Geometry geometry = this.toMultiPointGeoJson(); 522 567 String multiPointGeojsonString = FeatureConverter.toStringValue(geometry); 523 System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n"); 568 if(displayMapData == PRINT_MAPDATA_TO_SCREEN) { 569 System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n"); 570 } 524 571 try ( 525 572 Writer writer = new BufferedWriter(new FileWriter(outFile)); … … 528 575 // Some basic re-formatting for some immediate legibility 529 576 // But pasting the contents of the file (or the System.err output above) 530 // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway. 577 // directly into http://geojson.tools/ or http://geojson.io/ 578 // will instantly reformat the json perfectly anyway. 531 579 multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t["); 532 580 multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t["); … … 544 592 545 593 } 546 594 595 // by default, display mapdata output on screen too 547 596 public String writeFeaturesGeoJsonToFile() { 597 return writeFeaturesGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN); 598 } 599 // write out geojson features to appropriately named file 600 // If displayMapData == PRINT_MAPDATA_TO_SCREEN, then it will also be printed to screen 601 public String writeFeaturesGeoJsonToFile(int displayMapData) { 548 602 final String filename = "geojson-features_" + this.geoJsonFilenameWithSuffix; 549 603 File outFile = new File(this.outputFolder, filename); … … 551 605 FeatureCollection featureColl = this.toFeatureCollection(); 552 606 String featuresGeojsonString = FeatureConverter.toStringValue(featureColl); 553 System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n"); 607 if(displayMapData == PRINT_MAPDATA_TO_SCREEN) { 608 System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n"); 609 } 554 610 try ( 555 611 Writer writer = new BufferedWriter(new FileWriter(outFile)); … … 564 620 } 565 621 566 return outFile.getAbsolutePath(); 567 568 } 569 622 return outFile.getAbsolutePath(); 623 } 624 625 626 public String getFeaturesGeoJsonString(boolean uriEncoded) { 627 String featuresGeojsonString = FeatureConverter.toStringValue(this.toFeatureCollection()); 628 if(uriEncoded) { 629 // Want to return encodeURIComponent(JSON.stringify(featuresGeojsonString)); 630 // https://stackoverflow.com/questions/607176/java-equivalent-to-javascripts-encodeuricomponent-that-produces-identical-outpu 631 URLEncoder.encode(featuresGeojsonString); 632 } 633 return featuresGeojsonString; 634 } 635 636 public String getAsMapURL() { 637 boolean uriEncoded = true; 638 String url = GEOJSON_MAP_TOOL_URL + DATA_STR + getFeaturesGeoJsonString(uriEncoded); 639 640 return url; 641 } 642 643 public String geoJsonMapScreenshot(File outputFolder, String fileNamePrefix) { 644 // https://stackoverflow.com/questions/49606051/how-to-take-a-screenshot-in-firefox-headless-selenium-in-java 645 646 // https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode 647 // /path/to/firefox -P my-profile --screenshot test.jpg https://developer.mozilla.org --window-size=800,1000 648 // https://stackoverflow.com/questions/15783701/which-characters-need-to-be-escaped-when-using-bash 649 650 String mapURL = this.getAsMapURL(); 651 652 String mapURLescapedForBash = mapURL.replace("\"", "\\\"");//.replace("[", "\\[").replace("]", "\\]"); 653 654 File outputFile = new File(outputFolder + File.separator + fileNamePrefix+".png"); 655 String outputFilePath = Utility.getFilePath(outputFile); 656 657 658 String[] cmdArgs = { 659 "firefox", 660 "--screenshot", 661 outputFilePath, 662 mapURLescapedForBash //"'" + mapURL + "'" 663 }; 664 665 System.err.print("Running:"); 666 for(String arg : cmdArgs) { 667 System.err.print(" " + arg); 668 } 669 System.err.println(); 670 671 672 //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + GEOJSON_MAP_TOOL_URL + DATA_STR; 673 //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + "'" + mapURL + "'"; 674 //System.err.println("Running: " + cmdArgs); 675 676 SafeProcess proc = new SafeProcess(cmdArgs); 677 678 int retVal = proc.runProcess(); 679 680 logger.info("Process out: " + proc.getStdOutput()); 681 logger.info("Process err: " + proc.getStdError()); 682 logger.info("Screenshot process returned with: " + retVal); 683 684 return outputFilePath; 685 686 } 570 687 571 688 public int getTotalCount() { … … 613 730 System.err.println("***********\nWrote mapdata to files " + multipointOutFileName 614 731 + " and " + featuresOutFileName); 615 System.err.println("You can paste the geojson contents of either of these files into the " 616 + "editor at http://geojson.tools/ to see the data arranged on a world map"); 732 System.err.println("You can paste the geojson contents of either of these files into " 733 + " the editor at " + GEOJSON_MAP_TOOL_URL 734 + " to see the data arranged on a world map"); 617 735 618 736 System.err.println("Total count for query: " + mapData.getTotalCount()); -
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java
r33917 r33919 433 433 434 434 /** Perform the aggregates for writing out the summary tables. */ 435 public voidwriteTables(File outFolder) {435 public String[] writeTables(File outFolder) { 436 436 // In this function, we're always dealing with the Websites mongodb collection. 437 437 MongoCollection<Document> collection = getWebsitesCollection(); 438 438 439 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI", 440 "3table_sitesWithPagesContainingMRI", "4table_containsMRI_exclTentativeProductSites", 441 "5table_sitesWithPagesContainingMRI_allNZGrouped" 439 String[] tableNames = { 440 "", 441 "1table_allCrawledSites", 442 "2table_sitesWithPagesInMRI", 443 "3table_sitesWithPagesContainingMRI", 444 "4table_containsMRI_exclTentativeProductSites", 445 "5table_sitesWithPagesContainingMRI_allNZGrouped", 446 "5table_sitesWithPagesInMRI_allNZGrouped" 442 447 }; 443 448 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) { … … 481 486 } 482 487 } 483 } 484 485 public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) { 488 489 return tableNames; 490 } 491 492 public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) { 486 493 487 494 Document nzRowData = null; … … 491 498 break; 492 499 493 case 5: 494 /* Get NZ only table data: 500 //case 5: 501 //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; 502 //case 6: 503 //filterQueryStr = "{numPagesInMRI: {$gt: 0}}"; 504 case 5: case 6: 505 String filterQueryStr = (tableNum == 5) ? 506 "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 507 508 /* Get NZ only table data. 509 Can be numPagesContainingMRI or numPagesInMRI > 0 depending on filterQueryStr. 510 495 511 db.Websites.aggregate([ 496 512 { … … 517 533 518 534 */ 535 519 536 Bson orQuery = or( 520 537 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), … … 522 539 ); 523 540 Bson andQuery = and( 524 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),525 526 541 BasicDBObject.parse(filterQueryStr), // e.g."{numPagesContainingMRI: {$gt: 0}}" 542 orQuery 543 ); 527 544 AggregateIterable<Document> output = collection.aggregate(Arrays.asList( 528 545 match(andQuery), … … 541 558 break; 542 559 543 default: logger.error("Unknown table number: " + tableNum); 560 561 default: logger.error("Unknown table number: " + tableNum); 544 562 } 545 563 … … 549 567 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) 550 568 { 569 //String filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; // only used if tableNum = 5|6 551 570 552 571 AggregateIterable<Document> output = null; … … 639 658 */ 640 659 output = collection.aggregate(Arrays.asList( 641 match(BasicDBObject.parse("{ numPages InMRI: {$gt: 0} }")),660 match(BasicDBObject.parse("{ numPagesContainingMRI: {$gt: 0} }")), 642 661 unwind("$geoLocationCountryCode"), 643 662 group("$geoLocationCountryCode", Arrays.asList( … … 698 717 )); 699 718 break; 700 701 case 5: 719 //case 5: 720 //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; 721 //case 6: 722 //filterQueryStr = "{numPagesInMRI: {$gt: 0}}"; 723 case 5: case 6: 724 String filterQueryStr = (tableNum == 5) ? 725 "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 702 726 /* 703 727 Table of count by countryCode of sites with numPagesContainingMRI > 0 728 (or numPagesInMRI > 0). 704 729 Just do OVERSEAS here, NZ handled separately 705 730 … … 730 755 731 756 andQuery = and( 732 733 734 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}")735 757 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 758 BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"), 759 BasicDBObject.parse(filterQueryStr) // e.g. "{numPagesContainingMRI: {$gt: 0}}" 760 ); 736 761 output = collection.aggregate(Arrays.asList( 737 762 match(andQuery), -
other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java
r33917 r33919 8 8 /** 9 9 * Runs some of the important mongoDB queries I ran. 10 * 11 * This program expects a folder ../mongo-data-auto to exist. 10 12 * 11 13 * TO COMPILE OR RUN, FIRST DO: … … 26 28 */ 27 29 public class SummaryTool { 30 28 31 static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName()); 32 33 34 static private final String GEOJSON_FEATURES_FILE_PREFIX = "geojson-features_"; 35 29 36 static private final long FIXED_SEED = 1000; 30 37 31 38 private final MongoDBQueryer mongodbQueryer; 32 39 private File outFolder; 33 34 40 35 41 … … 101 107 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know 102 108 if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) { 103 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages- only containsMRI.");109 System.out.println(" " + countryCode + " domain " + domain + " had no webpages where isMRI=true - only containsMRI."); 104 110 } 105 111 … … 149 155 //File outFolder = domainsFile.getParentFile(); 150 156 String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI"; 151 File outFile = new File(outFolder, filterName+"_ "+domainsFile.getName());157 File outFile = new File(outFolder, filterName+"_full_"+domainsFile.getName()); 152 158 153 159 writeURLsToFile(urlsList, outFile, N_totalNumPages); … … 158 164 int n_numSampleURLs = calcSampleSize(N_totalNumPages); 159 165 160 System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist : " + N_totalNumPages);166 System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist file: " + N_totalNumPages); 161 167 System.err.println(" (out of " + mongodbQueryer.countOfWebpagesMatching(filterType) 162 168 + " web pages across ALL sites for which " + filterName + " = true)"); … … 172 178 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs " 173 179 + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile)); 180 181 // For N = 6557, z-alpha-over-2 = 1.6449 and m = 0.05 (5%), 182 // n = (z-alpha-over-2^2 x N) / (z-alpha-over-2^2 + 4 x (N-1) x m^2) 183 // = (1.6449^2Ã6557) ÷ (1.6449^2 + 4 à 6556Ã0.05^2) = 259.88526851 => 260 rounded up. Check. 174 184 } 175 185 … … 384 394 // TODO: generate the tables 385 395 386 mongodb.writeTables(outFolder); 396 String[] tableFileNames = mongodb.writeTables(outFolder); 397 // for each table file name, generate the geojson-features .json file 398 // that GEOJSON_MAP_TOOL_URL takes as input to produce a map. 399 400 for(int i = 1; i < tableFileNames.length; i++) { // empty element at 0 401 String tablefilename = tableFileNames[i] + ".json"; // filenames have no suffix 402 403 File countsTableFile = new File(outFolder, tablefilename); 404 if(!countsTableFile.exists()) { 405 logger.error("@@@ File " + countsTableFile + " does not exist!"); 406 logger.error("@@@ Can't generate map date for this."); 407 continue; 408 } 409 String countsTableFilename = outFolder + File.separator + tablefilename; 410 CountryCodeCountsMapData mapData 411 = new CountryCodeCountsMapData(countsTableFilename); 412 String geoJsonFilename = mapData.writeFeaturesGeoJsonToFile(CountryCodeCountsMapData.SUPPRESS_MAPDATA_DISPLAY); 413 414 415 /* 416 // Ensure the geo-json file generated exists 417 //String geoJsonFilename = outFolder + File.separator 418 //+ GEOJSON_FEATURES_FILE_PREFIX + tablefilename; 419 420 File geoJsonFile = new File(geoJsonFilename); 421 if(!geoJsonFile.exists()) { 422 System.err.println("@@@ geoJson file " + geoJsonFilename + " not generated!"); 423 continue; 424 } 425 */ 426 System.err.println("**** Wrote mapdata to file " + geoJsonFilename); 427 //System.err.println(" Paste the file's geojson contents into " 428 //+ "the editor at " + CountryCodeCountsMapData.GEOJSON_MAP_TOOL_URL 429 //+ " to see the data arranged on a world map"); 430 System.err.println("Total count for query: " + mapData.getTotalCount()); 431 432 /*boolean uriEncoded = true; 433 String mapDataEncodedStr = mapData.getFeaturesGeoJsonString(uriEncoded); 434 System.err.println("Encoded string: " + mapDataEncodedStr); 435 */ 436 437 System.err.println("Data URL string: " + mapData.getAsMapURL()); 438 System.err.println(); 439 mapData.geoJsonMapScreenshot(outFolder, tablefilename); 440 System.err.println("---"); 441 442 // TODO: breaks after first table -> map conversion 443 break; 444 } 445 387 446 } 388 447
Note:
See TracChangeset
for help on using the changeset viewer.