Changeset 33919


Ignore:
Timestamp:
2020-02-13T22:40:41+13:00 (4 years ago)
Author:
ak19
Message:

SummaryTool now uses the CountryCodeCountsMapData.java class to generate the geojson-features files from the tables it has already created from MongoDB query results. Switched over from geojson.tools to geojson.io, since the latter allows passing geojson map data in the URL. The Firefox screenshotting is still not working: I can't yet get complex geojson features to work even from the command line, and running Firefox as a Java process may add a further layer of complexity on top of that. Added the jna jar files used by Greenstone's SafeProcess for launching Firefox as a Java process.

Location:
other-projects/maori-lang-detection
Files:
2 added
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33914 r33919  
    16381638UNKNOWN:
    16391639X gained hitiaotera.com from IL
     1640(and lost viveipcl.com to CZ)
    16401641
    16411642IL:
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33869 r33919  
    77import java.io.FileWriter;
    88import java.io.Writer;
     9
     10import java.net.URLEncoder;
    911
    1012import java.util.HashMap;
     
    3335import mil.nga.sf.geojson.Position;
    3436
     37
     38import org.greenstone.util.SafeProcess;
    3539
    3640/**
     
    105109 *   https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0
    106110 */
    107 public class CountryCodeCountsMapData {
     111public class CountryCodeCountsMapData {   
     112   
    108113    static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName());
     114   
     115    static public final String GEOJSON_MAP_TOOL_URL = "http://geojson.io/"; //"http://geojson.tools/";
     116    static private final String DATA_STR = "#data=data:application/json,";
     117   
     118    // "http://geojson.io" has a URL API for loading map data programmatically
     119    /*
     120       See http://geojson.io/ -> Help
     121
     122       "I'm a coder
     123
     124       geojson.io has an array of cli tools that make it easy to go from a GeoJSON file on your computer to geojson.io."
     125
     126       http://geojson.io/#geojson-io-api
     127       "Geojson.io API
     128
     129       You can interact with geojson.io programmatically in two ways:
     130
     131       => URL parameters
     132       Browser console"
     133
     134       http://geojson.io/#url-api
     135       "data=data:application/json,
     136
     137       Open the map and load a chunk of GeoJSON data from a URL segment directly onto the map.
     138       The GeoJSON data should be encoded as per encodeURIComponent(JSON.stringify(geojson_data)).
     139       Example:
     140
     141       http://geojson.io/#data=data:application/json,%7B%22type%22%3A%22LineString%22%2C%22coordinates%22%3A%5B%5B0%2C0%5D%2C%5B10%2C10%5D%5D%7D
     142    */
     143   
     144    public static final int SUPPRESS_MAPDATA_DISPLAY = 0;
     145    public static final int PRINT_MAPDATA_TO_SCREEN = 1;
    109146
    110147    //Map<String, JsonObject> countryToJsonMap;
     
    121158    private final String geoJsonFilenameWithSuffix;
    122159    private final File outputFolder;
     160
     161
    123162   
    124163    public CountryCodeCountsMapData(String countryCountsJSONFilename) throws Exception {
     
    185224        logger.info("No geolocation info found for country code " + countryCode);
    186225        if(countryCode.equals("EU")) {
     226            logger.info("   Adding lat,lng for somewhere around Europe");
    187227            //logger.info("Unlisted country code: EU");
    188228            // add lat and lng for Europe
     
    192232        }
    193233        else if(countryCode.equals("UNKNOWN")) {
     234            logger.info("   Adding lat,lng for somewhere in Antarctica");
    194235            //logger.info("Unlisted country code: UNKNOWN");
    195236            // add lat and lng for Antarctica
     
    515556    }
    516557   
     558    // Convenience overload: by default the map data is also printed to System.err (PRINT_MAPDATA_TO_SCREEN)
    517559    public String writeMultiPointGeoJsonToFile() {
     560    return writeMultiPointGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN);
     561    }
     562    public String writeMultiPointGeoJsonToFile(int displayMapData) {
    518563    final String filename = "multipoint_" + this.geoJsonFilenameWithSuffix;
    519564    File outFile = new File(this.outputFolder, filename);
     
    521566    Geometry geometry = this.toMultiPointGeoJson();
    522567    String multiPointGeojsonString = FeatureConverter.toStringValue(geometry);
    523     System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n");
     568    if(displayMapData == PRINT_MAPDATA_TO_SCREEN) {
     569        System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n");
     570    }
    524571    try (
    525572         Writer writer = new BufferedWriter(new FileWriter(outFile));
     
    528575        // Some basic re-formatting for some immediate legibility
    529576        // But pasting the contents of the file (or the System.err output above)
    530         // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway.
     577        // directly into http://geojson.tools/ or http://geojson.io/
     578        // will instantly reformat the json perfectly anyway.
    531579        multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t[");
    532580        multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t[");
     
    544592   
    545593    }
    546    
     594
     595    // Convenience overload: by default the map data is also printed to System.err (PRINT_MAPDATA_TO_SCREEN)
    547596    public String writeFeaturesGeoJsonToFile() {
     597    return writeFeaturesGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN);
     598    }
     599    // write out geojson features to appropriately named file
     600    // If displayMapData == PRINT_MAPDATA_TO_SCREEN, then it will also be printed to screen
     601    public String writeFeaturesGeoJsonToFile(int displayMapData) {
    548602    final String filename = "geojson-features_" + this.geoJsonFilenameWithSuffix;
    549603    File outFile = new File(this.outputFolder, filename);
     
    551605    FeatureCollection featureColl = this.toFeatureCollection();
    552606    String featuresGeojsonString = FeatureConverter.toStringValue(featureColl);
    553     System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n");
     607    if(displayMapData == PRINT_MAPDATA_TO_SCREEN) {
     608        System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n");
     609    }
    554610    try (
    555611         Writer writer = new BufferedWriter(new FileWriter(outFile));
     
    564620    }
    565621
    566     return outFile.getAbsolutePath();
    567    
    568     }
    569 
     622    return outFile.getAbsolutePath();   
     623    }
     624   
     625
     626    public String getFeaturesGeoJsonString(boolean uriEncoded) {
     627    String featuresGeojsonString = FeatureConverter.toStringValue(this.toFeatureCollection());
     628    if(uriEncoded) {
     629        // Want to return encodeURIComponent(JSON.stringify(featuresGeojsonString));
     630        // https://stackoverflow.com/questions/607176/java-equivalent-to-javascripts-encodeuricomponent-that-produces-identical-outpu
     631        URLEncoder.encode(featuresGeojsonString);
     632    }
     633    return featuresGeojsonString;
     634    }
     635
     636    public String getAsMapURL() {
     637    boolean uriEncoded = true;
     638    String url = GEOJSON_MAP_TOOL_URL + DATA_STR + getFeaturesGeoJsonString(uriEncoded);
     639
     640    return url;
     641    }
     642    // Runs "firefox --screenshot <outputFolder>/<fileNamePrefix>.png <map URL>" via SafeProcess; returns the path string of that png.
     643    public String geoJsonMapScreenshot(File outputFolder, String fileNamePrefix) {
     644    // https://stackoverflow.com/questions/49606051/how-to-take-a-screenshot-in-firefox-headless-selenium-in-java
     645   
     646    // https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode
     647    // /path/to/firefox -P my-profile --screenshot test.jpg  https://developer.mozilla.org --window-size=800,1000
     648    // https://stackoverflow.com/questions/15783701/which-characters-need-to-be-escaped-when-using-bash
     649    // Build the map URL, then backslash-escape every double quote in it.
     650    String mapURL = this.getAsMapURL();
     651    // NOTE(review): cmdArgs is handed to SafeProcess as an array; if no shell is involved the backslashes stay in the URL literally - confirm.
     652    String mapURLescapedForBash = mapURL.replace("\"", "\\\"");//.replace("[", "\\[").replace("]", "\\]");
     653   
     654    File outputFile = new File(outputFolder + File.separator + fileNamePrefix+".png");
     655    String outputFilePath = Utility.getFilePath(outputFile);
     656
     657    // Command line: program name, screenshot switch, target png, page to render.
     658    String[] cmdArgs = {
     659        "firefox",
     660        "--screenshot",
     661        outputFilePath,
     662        mapURLescapedForBash //"'" + mapURL + "'"
     663    };
     664    // Echo the full command to System.err for debugging.
     665    System.err.print("Running:");
     666    for(String arg : cmdArgs) {
     667        System.err.print(" " + arg);
     668    }
     669    System.err.println();
     670   
     671
     672    //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + GEOJSON_MAP_TOOL_URL + DATA_STR;
     673    //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + "'" + mapURL + "'";
     674    //System.err.println("Running: " + cmdArgs);
     675   
     676    SafeProcess proc = new SafeProcess(cmdArgs);
     677    // Run the process, then log its captured output, error stream and return value.
     678    int retVal = proc.runProcess();
     679
     680    logger.info("Process out: " + proc.getStdOutput());
     681    logger.info("Process err: " + proc.getStdError());
     682    logger.info("Screenshot process returned with: " + retVal);
     683    // NOTE(review): the path is returned whatever retVal is; whether the png was actually written is not checked here.
     684    return outputFilePath;
     685   
     686    }
    570687   
    571688    public int getTotalCount() {
     
    613730        System.err.println("***********\nWrote mapdata to files " + multipointOutFileName
    614731                   + " and " + featuresOutFileName);
    615         System.err.println("You can paste the geojson contents of either of these files into the "
    616                    + "editor at http://geojson.tools/ to see the data arranged on a world map");
     732        System.err.println("You can paste the geojson contents of either of these files into "
     733                   + " the editor at " + GEOJSON_MAP_TOOL_URL
     734                   + " to see the data arranged on a world map");
    617735
    618736        System.err.println("Total count for query: " + mapData.getTotalCount());
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33917 r33919  
    433433
    434434    /** Perform the aggregates for writing out the summary tables. */
    435     public void writeTables(File outFolder) {
     435    public String[] writeTables(File outFolder) {
    436436    // In this function, we're always dealing with the Websites mongodb collection.
    437437    MongoCollection<Document> collection = getWebsitesCollection();
    438438
    439     String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI",
    440     "3table_sitesWithPagesContainingMRI", "4table_containsMRI_exclTentativeProductSites",
    441     "5table_sitesWithPagesContainingMRI_allNZGrouped"
     439    String[] tableNames = {
     440        "",
     441        "1table_allCrawledSites",
     442        "2table_sitesWithPagesInMRI",
     443        "3table_sitesWithPagesContainingMRI",
     444        "4table_containsMRI_exclTentativeProductSites",
     445        "5table_sitesWithPagesContainingMRI_allNZGrouped",
     446        "5table_sitesWithPagesInMRI_allNZGrouped"
    442447    };
    443448    for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
     
    481486        }
    482487    }
    483     }
    484 
    485     public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
     488
     489    return tableNames;
     490    }
     491
     492    public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {
    486493   
    487494    Document nzRowData = null;
     
    491498        break;
    492499
    493     case 5:
    494         /* Get NZ only table data:
     500        //case 5:
     501        //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
     502        //case 6:
     503        //filterQueryStr = "{numPagesInMRI: {$gt: 0}}";
     504    case 5: case 6:
     505        String filterQueryStr = (tableNum == 5) ?
     506        "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
     507       
     508        /* Get NZ only table data.
     509           Can be numPagesContainingMRI or numPagesInMRI > 0 depending on filterQueryStr.
     510
    495511           db.Websites.aggregate([
    496512           {
     
    517533
    518534        */
     535
    519536        Bson orQuery = or(
    520537              BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
     
    522539              );
    523540        Bson andQuery = and(
    524                 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
    525                 orQuery
    526               );
     541        BasicDBObject.parse(filterQueryStr), // e.g."{numPagesContainingMRI: {$gt: 0}}"
     542        orQuery
     543        );
    527544        AggregateIterable<Document> output = collection.aggregate(Arrays.asList(
    528545         match(andQuery),
     
    541558        break;
    542559
    543      default: logger.error("Unknown table number: " + tableNum);   
     560
     561    default: logger.error("Unknown table number: " + tableNum);   
    544562    }
    545563
     
    549567    public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum)
    550568    {
     569    //String filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; // only used if tableNum = 5|6
    551570   
    552571    AggregateIterable<Document> output = null;
     
    639658         */
    640659        output = collection.aggregate(Arrays.asList(
    641          match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
     660         match(BasicDBObject.parse("{ numPagesContainingMRI: {$gt: 0} }")),
    642661         unwind("$geoLocationCountryCode"),
    643662         group("$geoLocationCountryCode", Arrays.asList(
     
    698717        ));
    699718        break;
    700        
    701     case 5:
     719        //case 5:
     720        //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}";
     721        //case 6:
     722        //filterQueryStr = "{numPagesInMRI: {$gt: 0}}";   
     723    case 5: case 6:
     724        String filterQueryStr = (tableNum == 5) ?
     725        "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
    702726        /*
    703727          Table of count by countryCode of sites with numPagesContainingMRI > 0
     728          (or numPagesInMRI > 0).
    704729          Just do OVERSEAS here, NZ handled separately
    705730
     
    730755       
    731756        andQuery = and(
    732               BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
    733               BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
    734               BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}")
    735               );
     757          BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
     758          BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"),
     759          BasicDBObject.parse(filterQueryStr) // e.g. "{numPagesContainingMRI: {$gt: 0}}"
     760        );
    736761        output = collection.aggregate(Arrays.asList(
    737762         match(andQuery),
  • other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java

    r33917 r33919  
    88/**
    99 * Runs some of the important mongoDB queries I ran.
     10 *
     11 * This program expects a folder ../mongo-data-auto to exist.
    1012 *
    1113 * TO COMPILE OR RUN, FIRST DO:
     
    2628*/
    2729public class SummaryTool {
     30   
    2831    static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName());
     32
     33   
     34    static private final String GEOJSON_FEATURES_FILE_PREFIX = "geojson-features_";
     35   
    2936    static private final long FIXED_SEED = 1000;
    3037   
    3138    private final MongoDBQueryer mongodbQueryer;
    3239    private File outFolder;
    33 
    3440
    3541   
     
    101107            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
    102108            if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
    103             System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
     109            System.out.println("   " + countryCode + " domain " + domain + " had no webpages where isMRI=true - only containsMRI.");
    104110            }
    105111
     
    149155    //File outFolder = domainsFile.getParentFile();
    150156    String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI";
    151     File outFile = new File(outFolder, filterName+"_"+domainsFile.getName());
     157    File outFile = new File(outFolder, filterName+"_full_"+domainsFile.getName());
    152158
    153159    writeURLsToFile(urlsList, outFile, N_totalNumPages);
     
    158164    int n_numSampleURLs = calcSampleSize(N_totalNumPages);
    159165
    160     System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist: " + N_totalNumPages);
     166    System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist file: " + N_totalNumPages);
    161167    System.err.println("    (out of " + mongodbQueryer.countOfWebpagesMatching(filterType)
    162168               + " web pages across ALL sites for which " + filterName + " = true)");
     
    172178    System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
    173179               + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
     180
     181    // For N = 6557, z-alpha-over-2 = 1.6449 and m = 0.05 (5%),
     182    // n = (z-alpha-over-2^2 x N) / (z-alpha-over-2^2 + 4 x (N-1) x m^2)
     183    // = (1.6449^2×6557) ÷ (1.6449^2 + 4 × 6556×0.05^2) = 259.88526851 => 260 rounded up. Check.
    174184    }
    175185
     
    384394        // TODO: generate the tables
    385395
    386         mongodb.writeTables(outFolder);
     396        String[] tableFileNames = mongodb.writeTables(outFolder);
     397        // for each table file name, generate the geojson-features .json file
     398        // that GEOJSON_MAP_TOOL_URL takes as input to produce a map.
     399
     400        for(int i = 1; i < tableFileNames.length; i++) { // empty element at 0
     401            String tablefilename = tableFileNames[i] + ".json"; // filenames have no suffix
     402           
     403            File countsTableFile = new File(outFolder, tablefilename);
     404            if(!countsTableFile.exists()) {
     405            logger.error("@@@ File " + countsTableFile + " does not exist!");
     406            logger.error("@@@ Can't generate map date for this.");
     407            continue;
     408            }
     409            String countsTableFilename = outFolder + File.separator + tablefilename;
     410            CountryCodeCountsMapData mapData
     411            = new CountryCodeCountsMapData(countsTableFilename);
     412            String geoJsonFilename = mapData.writeFeaturesGeoJsonToFile(CountryCodeCountsMapData.SUPPRESS_MAPDATA_DISPLAY);
     413           
     414
     415            /*
     416            // Ensure the geo-json file generated exists
     417            //String geoJsonFilename = outFolder + File.separator
     418            //+ GEOJSON_FEATURES_FILE_PREFIX + tablefilename;
     419           
     420            File geoJsonFile = new File(geoJsonFilename);
     421            if(!geoJsonFile.exists()) {
     422            System.err.println("@@@ geoJson file " + geoJsonFilename + " not generated!");
     423            continue;
     424            }
     425            */
     426            System.err.println("**** Wrote mapdata to file " + geoJsonFilename);
     427            //System.err.println("     Paste the file's geojson contents into "
     428            //+ "the editor at " + CountryCodeCountsMapData.GEOJSON_MAP_TOOL_URL
     429            //+ " to see the data arranged on a world map");           
     430            System.err.println("Total count for query: " + mapData.getTotalCount());
     431           
     432            /*boolean uriEncoded = true;
     433            String mapDataEncodedStr = mapData.getFeaturesGeoJsonString(uriEncoded);
     434            System.err.println("Encoded string: " + mapDataEncodedStr);
     435            */
     436           
     437            System.err.println("Data URL string: " + mapData.getAsMapURL());
     438            System.err.println();
     439            mapData.geoJsonMapScreenshot(outFolder, tablefilename);
     440            System.err.println("---");
     441
     442            // TODO: breaks after first table -> map conversion
     443            break;
     444        }
     445       
    387446        }
    388447
Note: See TracChangeset for help on using the changeset viewer.