Changeset 33919

Show
Ignore:
Timestamp:
13.02.2020 22:40:41 (5 days ago)
Author:
ak19
Message:

SummaryTool now uses the CountryCodeCountsMapData.java class to generate the geojson-features files from the tables it already created using MongoDB query results. Switched over from geojson.tools to geojson.io, since the latter allows passing geojson map data in the URL. The Firefox screenshotting is still not working: I can't even get complex geojson features to work from the command line yet, and running Firefox as a Java process adds another possible layer of complexity on top of that. Added the jna jar files used by Greenstone's SafeProcess for launching Firefox as a Java process.

Location:
other-projects/maori-lang-detection
Files:
2 added
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33914 r33919  
    16381638UNKNOWN: 
    16391639X gained hitiaotera.com from IL 
     1640(and lost viveipcl.com to CZ) 
    16401641 
    16411642IL: 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33869 r33919  
    77import java.io.FileWriter; 
    88import java.io.Writer; 
     9 
     10import java.net.URLEncoder; 
    911 
    1012import java.util.HashMap; 
     
    3335import mil.nga.sf.geojson.Position; 
    3436 
     37 
     38import org.greenstone.util.SafeProcess; 
    3539 
    3640/** 
     
    105109 *   https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0 
    106110 */ 
    107 public class CountryCodeCountsMapData { 
     111public class CountryCodeCountsMapData {    
     112     
    108113    static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName()); 
     114     
     115    static public final String GEOJSON_MAP_TOOL_URL = "http://geojson.io/"; //"http://geojson.tools/"; 
     116    static private final String DATA_STR = "#data=data:application/json,"; 
     117     
     118    // "http://geojson.io" has a URL API that allows map data to be passed in programmatically: 
     119    /*  
     120       See http://geojson.io/ -> Help 
     121 
     122       "I'm a coder 
     123 
     124       geojson.io has an array of cli tools that make it easy to go from a GeoJSON file on your computer to geojson.io." 
     125 
     126       http://geojson.io/#geojson-io-api 
     127       "Geojson.io API 
     128 
     129       You can interact with geojson.io programmatically in two ways: 
     130 
     131       => URL parameters 
     132       Browser console" 
     133 
     134       http://geojson.io/#url-api 
     135       "data=data:application/json, 
     136 
     137       Open the map and load a chunk of GeoJSON data from a URL segment directly onto the map. 
     138       The GeoJSON data should be encoded as per encodeURIComponent(JSON.stringify(geojson_data)). 
     139       Example: 
     140 
     141       http://geojson.io/#data=data:application/json,%7B%22type%22%3A%22LineString%22%2C%22coordinates%22%3A%5B%5B0%2C0%5D%2C%5B10%2C10%5D%5D%7D 
     142    */ 
     143     
     144    public static final int SUPPRESS_MAPDATA_DISPLAY = 0; 
     145    public static final int PRINT_MAPDATA_TO_SCREEN = 1; 
    109146 
    110147    //Map<String, JsonObject> countryToJsonMap; 
     
    121158    private final String geoJsonFilenameWithSuffix; 
    122159    private final File outputFolder; 
     160 
     161 
    123162     
    124163    public CountryCodeCountsMapData(String countryCountsJSONFilename) throws Exception { 
     
    185224        logger.info("No geolocation info found for country code " + countryCode); 
    186225        if(countryCode.equals("EU")) { 
     226            logger.info("   Adding lat,lng for somewhere around Europe"); 
    187227            //logger.info("Unlisted country code: EU"); 
    188228            // add lat and lng for Europe 
     
    192232        } 
    193233        else if(countryCode.equals("UNKNOWN")) { 
     234            logger.info("   Adding lat,lng for somewhere in Antarctica"); 
    194235            //logger.info("Unlisted country code: UNKNOWN"); 
    195236            // add lat and lng for Antarctica 
     
    515556    } 
    516557     
     558    // by default, display mapdata output on screen too 
    517559    public String writeMultiPointGeoJsonToFile() { 
     560    return writeMultiPointGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN); 
     561    } 
     562    public String writeMultiPointGeoJsonToFile(int displayMapData) { 
    518563    final String filename = "multipoint_" + this.geoJsonFilenameWithSuffix; 
    519564    File outFile = new File(this.outputFolder, filename); 
     
    521566    Geometry geometry = this.toMultiPointGeoJson(); 
    522567    String multiPointGeojsonString = FeatureConverter.toStringValue(geometry); 
    523     System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n"); 
     568    if(displayMapData == PRINT_MAPDATA_TO_SCREEN) { 
     569        System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n"); 
     570    } 
    524571    try ( 
    525572         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     
    528575        // Some basic re-formatting for some immediate legibility 
    529576        // But pasting the contents of the file (or the System.err output above) 
    530         // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway. 
     577        // directly into http://geojson.tools/ or http://geojson.io/ 
     578        // will instantly reformat the json perfectly anyway. 
    531579        multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t["); 
    532580        multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t["); 
     
    544592     
    545593    } 
    546      
     594 
     595    // by default, display mapdata output on screen too 
    547596    public String writeFeaturesGeoJsonToFile() { 
     597    return writeFeaturesGeoJsonToFile(PRINT_MAPDATA_TO_SCREEN); 
     598    } 
     599    // write out geojson features to appropriately named file 
     600    // If displayMapData == PRINT_MAPDATA_TO_SCREEN, then it will also be printed to screen 
     601    public String writeFeaturesGeoJsonToFile(int displayMapData) { 
    548602    final String filename = "geojson-features_" + this.geoJsonFilenameWithSuffix; 
    549603    File outFile = new File(this.outputFolder, filename); 
     
    551605    FeatureCollection featureColl = this.toFeatureCollection(); 
    552606    String featuresGeojsonString = FeatureConverter.toStringValue(featureColl); 
    553     System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n"); 
     607    if(displayMapData == PRINT_MAPDATA_TO_SCREEN) { 
     608        System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n"); 
     609    } 
    554610    try ( 
    555611         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     
    564620    } 
    565621 
    566     return outFile.getAbsolutePath(); 
    567      
    568     } 
    569  
     622    return outFile.getAbsolutePath();    
     623    } 
     624     
     625 
     626    public String getFeaturesGeoJsonString(boolean uriEncoded) { 
     627    String featuresGeojsonString = FeatureConverter.toStringValue(this.toFeatureCollection()); 
     628    if(uriEncoded) { 
     629        // Want to return encodeURIComponent(JSON.stringify(featuresGeojsonString)); 
     630        // https://stackoverflow.com/questions/607176/java-equivalent-to-javascripts-encodeuricomponent-that-produces-identical-outpu 
     631        URLEncoder.encode(featuresGeojsonString); 
     632    } 
     633    return featuresGeojsonString; 
     634    } 
     635 
     636    public String getAsMapURL() { 
     637    boolean uriEncoded = true; 
     638    String url = GEOJSON_MAP_TOOL_URL + DATA_STR + getFeaturesGeoJsonString(uriEncoded); 
     639 
     640    return url; 
     641    } 
     642 
     643    public String geoJsonMapScreenshot(File outputFolder, String fileNamePrefix) { 
     644    // https://stackoverflow.com/questions/49606051/how-to-take-a-screenshot-in-firefox-headless-selenium-in-java 
     645     
     646    // https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode 
     647    // /path/to/firefox -P my-profile --screenshot test.jpg  https://developer.mozilla.org --window-size=800,1000 
     648    // https://stackoverflow.com/questions/15783701/which-characters-need-to-be-escaped-when-using-bash 
     649     
     650    String mapURL = this.getAsMapURL(); 
     651 
     652    String mapURLescapedForBash = mapURL.replace("\"", "\\\"");//.replace("[", "\\[").replace("]", "\\]"); 
     653     
     654    File outputFile = new File(outputFolder + File.separator + fileNamePrefix+".png"); 
     655    String outputFilePath = Utility.getFilePath(outputFile); 
     656 
     657     
     658    String[] cmdArgs = { 
     659        "firefox", 
     660        "--screenshot", 
     661        outputFilePath, 
     662        mapURLescapedForBash //"'" + mapURL + "'" 
     663    }; 
     664     
     665    System.err.print("Running:"); 
     666    for(String arg : cmdArgs) { 
     667        System.err.print(" " + arg); 
     668    } 
     669    System.err.println(); 
     670     
     671 
     672    //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + GEOJSON_MAP_TOOL_URL + DATA_STR; 
     673    //String cmdArgs = "firefox --screenshot " + outputFilePath + " " + "'" + mapURL + "'"; 
     674    //System.err.println("Running: " + cmdArgs); 
     675     
     676    SafeProcess proc = new SafeProcess(cmdArgs); 
     677 
     678    int retVal = proc.runProcess(); 
     679 
     680    logger.info("Process out: " + proc.getStdOutput()); 
     681    logger.info("Process err: " + proc.getStdError()); 
     682    logger.info("Screenshot process returned with: " + retVal); 
     683     
     684    return outputFilePath; 
     685     
     686    } 
    570687     
    571688    public int getTotalCount() { 
     
    613730        System.err.println("***********\nWrote mapdata to files " + multipointOutFileName 
    614731                   + " and " + featuresOutFileName); 
    615         System.err.println("You can paste the geojson contents of either of these files into the " 
    616                    + "editor at http://geojson.tools/ to see the data arranged on a world map"); 
     732        System.err.println("You can paste the geojson contents of either of these files into " 
     733                   + " the editor at " + GEOJSON_MAP_TOOL_URL 
     734                   + " to see the data arranged on a world map"); 
    617735 
    618736        System.err.println("Total count for query: " + mapData.getTotalCount()); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33917 r33919  
    433433 
    434434    /** Perform the aggregates for writing out the summary tables. */ 
    435     public void writeTables(File outFolder) { 
     435    public String[] writeTables(File outFolder) { 
    436436    // In this function, we're always dealing with the Websites mongodb collection. 
    437437    MongoCollection<Document> collection = getWebsitesCollection(); 
    438438 
    439     String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI", 
    440     "3table_sitesWithPagesContainingMRI", "4table_containsMRI_exclTentativeProductSites", 
    441     "5table_sitesWithPagesContainingMRI_allNZGrouped" 
     439    String[] tableNames = { 
     440        "", 
     441        "1table_allCrawledSites", 
     442        "2table_sitesWithPagesInMRI", 
     443        "3table_sitesWithPagesContainingMRI", 
     444        "4table_containsMRI_exclTentativeProductSites", 
     445        "5table_sitesWithPagesContainingMRI_allNZGrouped", 
     446        "5table_sitesWithPagesInMRI_allNZGrouped" 
    442447    }; 
    443448    for (int tableNum = 1; tableNum < tableNames.length; tableNum++) { 
     
    481486        } 
    482487    } 
    483     } 
    484  
    485     public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) { 
     488 
     489    return tableNames; 
     490    } 
     491 
     492    public Document getNZTableRowData(MongoCollection<Document> collection, int tableNum) {  
    486493     
    487494    Document nzRowData = null; 
     
    491498        break; 
    492499 
    493     case 5: 
    494         /* Get NZ only table data: 
     500        //case 5: 
     501        //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; 
     502        //case 6: 
     503        //filterQueryStr = "{numPagesInMRI: {$gt: 0}}"; 
     504    case 5: case 6: 
     505        String filterQueryStr = (tableNum == 5) ? 
     506        "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
     507         
     508        /* Get NZ only table data. 
     509           Can be numPagesContainingMRI or numPagesInMRI > 0 depending on filterQueryStr. 
     510 
    495511           db.Websites.aggregate([ 
    496512           { 
     
    517533 
    518534        */ 
     535 
    519536        Bson orQuery = or( 
    520537              BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"), 
     
    522539              ); 
    523540        Bson andQuery = and( 
    524                 BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
    525                 orQuery 
    526               ); 
     541        BasicDBObject.parse(filterQueryStr), // e.g."{numPagesContainingMRI: {$gt: 0}}" 
     542        orQuery 
     543        ); 
    527544        AggregateIterable<Document> output = collection.aggregate(Arrays.asList( 
    528545         match(andQuery), 
     
    541558        break; 
    542559 
    543      default: logger.error("Unknown table number: " + tableNum);    
     560 
     561    default: logger.error("Unknown table number: " + tableNum);    
    544562    } 
    545563 
     
    549567    public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) 
    550568    { 
     569    //String filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; // only used if tableNum = 5|6 
    551570     
    552571    AggregateIterable<Document> output = null; 
     
    639658         */ 
    640659        output = collection.aggregate(Arrays.asList( 
    641          match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")), 
     660         match(BasicDBObject.parse("{ numPagesContainingMRI: {$gt: 0} }")), 
    642661         unwind("$geoLocationCountryCode"), 
    643662         group("$geoLocationCountryCode", Arrays.asList( 
     
    698717        )); 
    699718        break; 
    700          
    701     case 5: 
     719        //case 5: 
     720        //filterQueryStr = "{numPagesContainingMRI: {$gt: 0}}"; 
     721        //case 6: 
     722        //filterQueryStr = "{numPagesInMRI: {$gt: 0}}";     
     723    case 5: case 6: 
     724        String filterQueryStr = (tableNum == 5) ? 
     725        "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}"; 
    702726        /* 
    703727          Table of count by countryCode of sites with numPagesContainingMRI > 0 
     728          (or numPagesInMRI > 0). 
    704729          Just do OVERSEAS here, NZ handled separately 
    705730 
     
    730755         
    731756        andQuery = and( 
    732               BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
    733               BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"), 
    734               BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}") 
    735               ); 
     757          BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
     758          BasicDBObject.parse("{domain: {$not: /\\.nz$/}}"), 
     759          BasicDBObject.parse(filterQueryStr) // e.g. "{numPagesContainingMRI: {$gt: 0}}" 
     760        ); 
    736761        output = collection.aggregate(Arrays.asList( 
    737762         match(andQuery), 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java

    r33917 r33919  
    88/** 
    99 * Runs some of the important mongoDB queries I ran. 
     10 * 
     11 * This program expects a folder ../mongo-data-auto to exist. 
    1012 * 
    1113 * TO COMPILE OR RUN, FIRST DO: 
     
    2628*/ 
    2729public class SummaryTool { 
     30     
    2831    static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName()); 
     32 
     33     
     34    static private final String GEOJSON_FEATURES_FILE_PREFIX = "geojson-features_"; 
     35     
    2936    static private final long FIXED_SEED = 1000; 
    3037     
    3138    private final MongoDBQueryer mongodbQueryer; 
    3239    private File outFolder; 
    33  
    3440 
    3541     
     
    101107            // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know 
    102108            if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) { 
    103             System.out.println("   " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI."); 
     109            System.out.println("   " + countryCode + " domain " + domain + " had no webpages where isMRI=true - only containsMRI."); 
    104110            } 
    105111 
     
    149155    //File outFolder = domainsFile.getParentFile(); 
    150156    String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI"; 
    151     File outFile = new File(outFolder, filterName+"_"+domainsFile.getName()); 
     157    File outFile = new File(outFolder, filterName+"_full_"+domainsFile.getName()); 
    152158 
    153159    writeURLsToFile(urlsList, outFile, N_totalNumPages); 
     
    158164    int n_numSampleURLs = calcSampleSize(N_totalNumPages); 
    159165 
    160     System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist: " + N_totalNumPages); 
     166    System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist file: " + N_totalNumPages); 
    161167    System.err.println("    (out of " + mongodbQueryer.countOfWebpagesMatching(filterType) 
    162168               + " web pages across ALL sites for which " + filterName + " = true)"); 
     
    172178    System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs " 
    173179               + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile)); 
     180 
     181    // For N = 6557, z-alpha-over-2 = 1.6449 and m = 0.05 (5%), 
     182    // n = (z-alpha-over-2^2 x N) / (z-alpha-over-2^2 + 4 x (N-1) x m^2) 
     183    // = (1.6449^2×6557) ÷ (1.6449^2 + 4 × 6556×0.05^2) = 259.88526851 => 260 rounded up. Check. 
    174184    } 
    175185 
     
    384394        // TODO: generate the tables 
    385395 
    386         mongodb.writeTables(outFolder); 
     396        String[] tableFileNames = mongodb.writeTables(outFolder); 
     397        // for each table file name, generate the geojson-features .json file 
     398        // that GEOJSON_MAP_TOOL_URL takes as input to produce a map. 
     399 
     400        for(int i = 1; i < tableFileNames.length; i++) { // empty element at 0 
     401            String tablefilename = tableFileNames[i] + ".json"; // filenames have no suffix 
     402             
     403            File countsTableFile = new File(outFolder, tablefilename); 
     404            if(!countsTableFile.exists()) { 
     405            logger.error("@@@ File " + countsTableFile + " does not exist!"); 
     406            logger.error("@@@ Can't generate map date for this."); 
     407            continue; 
     408            } 
     409            String countsTableFilename = outFolder + File.separator + tablefilename; 
     410            CountryCodeCountsMapData mapData 
     411            = new CountryCodeCountsMapData(countsTableFilename); 
     412            String geoJsonFilename = mapData.writeFeaturesGeoJsonToFile(CountryCodeCountsMapData.SUPPRESS_MAPDATA_DISPLAY); 
     413             
     414 
     415            /* 
     416            // Ensure the geo-json file generated exists 
     417            //String geoJsonFilename = outFolder + File.separator 
     418            //+ GEOJSON_FEATURES_FILE_PREFIX + tablefilename; 
     419             
     420            File geoJsonFile = new File(geoJsonFilename); 
     421            if(!geoJsonFile.exists()) { 
     422            System.err.println("@@@ geoJson file " + geoJsonFilename + " not generated!"); 
     423            continue; 
     424            } 
     425            */ 
     426            System.err.println("**** Wrote mapdata to file " + geoJsonFilename); 
     427            //System.err.println("     Paste the file's geojson contents into " 
     428            //+ "the editor at " + CountryCodeCountsMapData.GEOJSON_MAP_TOOL_URL 
     429            //+ " to see the data arranged on a world map");             
     430            System.err.println("Total count for query: " + mapData.getTotalCount()); 
     431             
     432            /*boolean uriEncoded = true; 
     433            String mapDataEncodedStr = mapData.getFeaturesGeoJsonString(uriEncoded); 
     434            System.err.println("Encoded string: " + mapDataEncodedStr); 
     435            */ 
     436             
     437            System.err.println("Data URL string: " + mapData.getAsMapURL()); 
     438            System.err.println(); 
     439            mapData.geoJsonMapScreenshot(outFolder, tablefilename); 
     440            System.err.println("---"); 
     441 
     442            // TODO: breaks after first table -> map conversion 
     443            break; 
     444        } 
     445         
    387446        } 
    388447