source: other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java@ 33805

Last change on this file since 33805 was 33805, checked in by ak19, 4 years ago
  1. Moving the static countrycodes.json file to conf folder and updated CountryCodeCountsMapData.java to work with its new location. 2. CountryCodeCountsMapData.java further sensibly names output files based on input filenames instead of producing identical filenames on each run independent of (different) input files. 3. Adding the geojson and map for mongodb query results for counts by country codes of sites where at least 1 page is overall detected by OpenNLP as MRI.
File size: 16.9 KB
Line 
1package org.greenstone.atea;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.File;
6import java.io.FileReader;
7import java.io.FileWriter;
8import java.io.Writer;
9
10import java.util.HashMap;
11import java.util.LinkedList;
12import java.util.List;
13import java.util.Map;
14
15import org.apache.commons.csv.*;
16import org.apache.log4j.Logger;
17
18// Google's gson imports for parsing any kind of json
19import com.google.gson.JsonArray;
20import com.google.gson.JsonElement;
21import com.google.gson.JsonObject;
22import com.google.gson.JsonParser;
23
24// For working with GeoJSON's Simple Features in Java
25import mil.nga.sf.geojson.Feature;
26import mil.nga.sf.geojson.FeatureCollection;
27import mil.nga.sf.geojson.FeatureConverter;
28import mil.nga.sf.geojson.Geometry;
29import mil.nga.sf.geojson.MultiPoint;
30import mil.nga.sf.geojson.Polygon;
31import mil.nga.sf.geojson.Position;
32
33
34/**
35 * Run a mongodb query that produces counts per countrycode like in the following 2 examples:
36 *
37 * 1. count of country codes for all sites
38 * db.Websites.aggregate([
39 *
40 * { $unwind: "$geoLocationCountryCode" },
41 * {
42 * $group: {
43 * _id: "$geoLocationCountryCode",
44 * count: { $sum: 1 }
45 * }
46 * },
47 * { $sort : { count : -1} }
48 * ]);
49 *
50 * Then store the mongodb query result's JSON format output in a file called "counts.json".
51 * Then run this program with counts.json as parameter
52 * Copy the geojson output into http://geojson.tools/
53 *
54 * 2. count of country codes for sites that have at least one page detected as MRI
55 *
56 * db.Websites.aggregate([
57 * {
58 * $match: {
59 * numPagesInMRI: {$gt: 0}
60 * }
61 * },
62 * { $unwind: "$geoLocationCountryCode" },
63 * {
64 * $group: {
65 * _id: {$toLower: '$geoLocationCountryCode'},
66 * count: { $sum: 1 }
67 * }
68 * },
69 * { $sort : { count : -1} }
70 * ]);
71 *
72 * Store the mongodb query result's JSON format output in a file called "counts_sitesWithPagesInMRI.json".
73 * Then run this program with counts_sitesWithPagesInMRI.json as parameter.
74 * Copy the geojson output into http://geojson.tools/
75 *
76 * ##################
77 * TO COMPILE:
78 * maori-lang-detection/src$
79 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData.java
80 *
81 * TO RUN:
82 * maori-lang-detection/src$
83 * java -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData ../mongodb-data/counts.json
84 *###################
85 *
86 * This class needs the gson library, and now the sf-geojson(-2.02).jar and
87 * helper jars sf(-2.02).jar and 3 jackson jars too,
88 * to create and store Simple Features geo json objects with Java.
89 * I copied the gson jar file from GS3.
90 *
91 * Simple Features GeoJSON Java
92 * https://ngageoint.github.io/simple-features-geojson-java/ - links to API and more
93 *
94 * https://mvnrepository.com/artifact/mil.nga.sf/sf-geojson (https://github.com/ngageoint/simple-features-geojson-java/)
95 *
96 * Also need the basic data types used by the Geometry objects above:
97 * https://mvnrepository.com/artifact/mil.nga/sf (https://github.com/ngageoint/simple-features-java)
98 *
99 * Further helper jars needed (because of encountering the exception documented at
100 * stackoverflow.com/questions/36278293/java-lang-classnotfoundexception-com-fasterxml-jackson-core-jsonprocessingexcep/36279872)
101 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core/2.10.0
102 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind
103 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0
104 */
105public class CountryCodeCountsMapData {
106 static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName());
107
108 //Map<String, JsonObject> countryToJsonMap;
109 JsonArray countryCodesJsonArray;
110 JsonArray countryCountsJsonArray;
111
112 // North-central Antarctica coords
113 private final double ANTARCTICA_LNG = 57.0d;
114 private final double ANTARCTICA_LAT = -70.0d;
115 // For EU coords, spot in Atlantic Ocean close to western European coast.
116 private final double EU_LNG = -20.0d;
117 private final double EU_LAT = 50.0d;
118
119 private final String geoJsonFilenameWithSuffix;
120 private final File outputFolder;
121
122 public CountryCodeCountsMapData(String countryCountsJSONFilename) throws Exception {
123
124 // work out the unique filename we're going to save the geojson files under
125 // and the folder we're going to save them into
126 File countryCountsJSONFile = new File(countryCountsJSONFilename);
127 String tailname = countryCountsJSONFile.getName();
128 this.geoJsonFilenameWithSuffix = (tailname.startsWith("counts_")) ? tailname.substring("counts_".length()) : tailname;
129 this.outputFolder = countryCountsJSONFile.getParentFile().getCanonicalFile(); // canonical resolves any .. and . in path
130
131 // locate the countrycodes.json file
132 File countryCoordsJSONFile = new File(this.getClass().getClassLoader().getResource("countrycodes.json").getFile());
133
134 // Create a map of ALL country code names to ALL the country code json objects
135 // that contain the location (lat, lng) info for each country code
136 Map<String, JsonObject> countryToJsonMap = new HashMap<String, JsonObject>();
137
138 // Parse json file of country codes and put into a JsonArray.
139 // then put into map of each country code to its JsonObject.
140 countryCodesJsonArray = parseJSONFile(countryCoordsJSONFile);
141 for(JsonElement obj : countryCodesJsonArray) {
142 JsonObject countryCodeJson = obj.getAsJsonObject();
143 countryToJsonMap.put(countryCodeJson.get("country").getAsString(), countryCodeJson);
144 }
145
146 // Parse json file of country code counts
147 // Then for each JsonObject in this file,
148 // find a match on its country code in the map created above to get a country code JsonObject
149 // Get the longitude and latitude of the JsonObject that matched that country code.
150 // Add this lng,lat location information to the current JsonObject from the counts file.
151 countryCountsJsonArray = parseJSONFile(countryCountsJSONFile);
152
153 for(JsonElement obj : countryCountsJsonArray) {
154 JsonObject json = obj.getAsJsonObject();
155 String countryCode = json.get("_id").getAsString().toUpperCase();
156 // set the property back as uppercase and with property name "countrycode" instead of "_id"
157 json.remove("_id");
158 json.addProperty("countrycode", countryCode);
159
160 int count = (int)json.get("count").getAsDouble();
161
162 //logger.info("Got country code: " + countryCode);
163 //logger.info(" count: " + count);
164
165 // locate in countryCode map
166 JsonObject countryCodeJson = countryToJsonMap.get(countryCode);
167
168 if(countryCodeJson != null) {
169 //logger.info("Found in map: " + countryCodeJson.toString());
170
171 // for geojson, want longitude then latitude
172 Double lng = countryCodeJson.get("longitude").getAsDouble();
173 Double lat = countryCodeJson.get("latitude").getAsDouble();
174 //logger.info("long: " + Double.toString(lng) + ", lat: " + Double.toString(lat));
175 String countryName = countryCodeJson.get("name").getAsString();
176
177 // let's add lat and lng fields to countryCounts object
178 json.addProperty("lng", lng); // adds Number: https://javadoc.io/static/com.google.code.gson/gson/2.8.5/com/google/gson/JsonObject.html
179 json.addProperty("lat", lat);
180 json.addProperty("region", countryName);
181
182 } else {
183 logger.info("No geolocation info found for country code " + countryCode);
184 if(countryCode.equals("EU")) {
185 //logger.info("Unlisted country code: EU");
186 // add lat and lng for Europe
187 json.addProperty("lng", EU_LNG);
188 json.addProperty("lat", EU_LAT);
189 json.addProperty("region", "Europe");
190 }
191 else if(countryCode.equals("UNKNOWN")) {
192 //logger.info("Unlisted country code: UNKNOWN");
193 // add lat and lng for Antarctica
194 json.addProperty("lng", ANTARCTICA_LNG);
195 json.addProperty("lat", ANTARCTICA_LAT);
196 json.addProperty("region", "UNKNOWN");
197 } else {
198 logger.error("ERROR: entirely unknown country code: " + countryCode);
199 }
200 }
201 }
202
203 }
204
205 /** Convert mongodb tabular output of json records stored in the given file
206 * into a JsonArray.
207 */
208 public JsonArray parseJSONFile(File file) throws Exception {
209 JsonArray jsonArray = null;
210 // read into string
211 try (
212 BufferedReader reader = new BufferedReader(new FileReader(file));
213 ) {
214 StringBuilder str = //new StringBuilder();
215 new StringBuilder("[");
216 String line;
217 while((line = reader.readLine()) != null) {
218 line = line.replaceAll("/\\* [^\\/]* \\*/", ""); // get rid of any multiline comments symbols on a single line
219 str.append(line);
220 if(line.endsWith("}")) {
221 str.append(",\n");
222 }
223 }
224 // replace last comma with closing bracket
225 String fileContents = str.substring(0, str.length()-2) + "]";
226
227 //System.err.println("Got file:\n" + fileContents);
228
229 // https://stackoverflow.com/questions/2591098/how-to-parse-json-in-java
230 jsonArray = new JsonParser().parse(fileContents).getAsJsonArray();
231
232 } catch(Exception e) {
233 throw e;
234 }
235
236
237 return jsonArray;
238 }
239
240 /**
241 * Reading
242 * https://www.here.xyz/api/concepts/geojsonbasics/
243 * https://ngageoint.github.io/simple-features-geojson-java/docs/api/
244 *
245 * https://stackoverflow.com/questions/55621480/cant-access-coordinates-member-of-geojson-feature-collection
246 *
247 * Downloaded geojson simple features' jar file from maven, but it didn't work:
248 * a more private version of MultiPoint.java is not included in the jar file (there's only
249 * mil.nga.sf.geojson.MultiPoint , whereas
250 * mil.nga.sf.MultiPoint is missing
251 *
252 * This seems to have gone wrong at
253 * https://github.com/ngageoint/simple-features-geojson-java/tree/master/src/main/java/mil/nga/sf
254 * but the one at
255 * https://github.com/ngageoint/simple-features-java/tree/master/src/main/java/mil/nga/sf
256 * has it. So I've been trying to build that, but don't have the correct version of maven.
257 */
258 public Geometry toMultiPointGeoJson() {
259 //System.err.println("toGeoJSON() is not yet implemented.");
260
261 List<Position> points = new LinkedList<Position>();
262
263 for(JsonElement obj : this.countryCountsJsonArray) {
264 JsonObject json = obj.getAsJsonObject();
265 Double lng = json.get("lng").getAsDouble();
266 Double lat = json.get("lat").getAsDouble();
267
268 Position point = new Position(lng, lat);
269 points.add(point);
270 }
271
272 Geometry multiPoint = new MultiPoint(points);
273
274 return multiPoint;
275 }
276
277 // https://javadoc.io/static/com.google.code.gson/gson/2.8.5/index.html
278 public FeatureCollection toFeatureCollection() {
279 final int HISTOGRAM_WIDTH = 4;
280
281 FeatureCollection featureCollection = new FeatureCollection();
282
283 for(JsonElement obj : this.countryCountsJsonArray) {
284 JsonObject json = obj.getAsJsonObject();
285
286 String countryCode = json.get("countrycode").getAsString();
287 String region = json.get("region").getAsString();
288 int count = json.get("count").getAsInt();
289
290 // make a histogram for each country
291 Geometry rectangle = this.toPolygon(json, count, HISTOGRAM_WIDTH);
292
293 Feature countryFeature = new Feature(rectangle);
294 Map<String, Object> featureProperties = new HashMap<String, Object>();
295 featureProperties.put("count", new Integer(count));
296 featureProperties.put("code", countryCode);
297 featureProperties.put("region", region);
298 countryFeature.setProperties(featureProperties);
299
300 featureCollection.addFeature(countryFeature);
301 }
302
303 return featureCollection;
304 }
305
306 // create rectangular "histogram" for each country code
307 private Geometry toPolygon(JsonObject json, int count, int HISTOGRAM_WIDTH) {
308 int half_width = HISTOGRAM_WIDTH/2;
309 double vertical_factor = 1.0;
310
311 Double lng = json.get("lng").getAsDouble();
312 Double lat = json.get("lat").getAsDouble();
313
314 String countryCode = json.get("countrycode").getAsString();
315
316
317 //create the 4 corners of the rectangle
318 // West is negative, east is positive, south is negative, north is positive
319 // See http://www.learnz.org.nz/sites/learnz.org.nz/files/lat-long-geo-data-01_0.jpg
320 // But since the histograms grow vertically/northwards and we can't go past a latitude of 90,
321 // to compensate, we increase the width of the histograms by the same factor as our inability
322 // to grow northwards.
323 Double north = lat + (vertical_factor * count);
324 while (north > 90) {
325 // recalculate north after decreasing histogram's vertical growth
326 // by the same factor as we increase its width
327 vertical_factor = vertical_factor/2.0;
328 half_width = 2 * half_width;
329 north = lat + (vertical_factor * count);
330 }
331 Double east = lng + half_width;
332 Double west = lng - half_width;
333 Double south = lat;
334
335 List<List<Position>> outerList = new LinkedList<List<Position>>();
336 List<Position> points = new LinkedList<Position>();
337 outerList.add(points);
338
339
340 points.add(new Position(west, south)); // Position(lng, lat) not Position(lat, lng)
341 points.add(new Position(west, north));
342 points.add(new Position(east, north));
343 points.add(new Position(east, south));
344
345
346 Geometry rectangle = new Polygon(outerList);
347
348 // Coords: a List of List of Positions, see https://ngageoint.github.io/simple-features-geojson-java/docs/api/
349 // https://www.here.xyz/api/concepts/geojsonbasics/#polygon
350
351 return rectangle;
352 }
353
354 public String writeMultiPointGeoJsonToFile() {
355 final String filename = "multipoint_" + this.geoJsonFilenameWithSuffix;
356 File outFile = new File(this.outputFolder, filename);
357
358 Geometry geometry = this.toMultiPointGeoJson();
359 String multiPointGeojsonString = FeatureConverter.toStringValue(geometry);
360 System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n");
361 try (
362 Writer writer = new BufferedWriter(new FileWriter(outFile));
363 ) {
364
365 // Some basic re-formatting for some immediate legibility
366 // But pasting the contents of the file (or the System.err output above)
367 // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway.
368 multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t[");
369 multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t[");
370 multiPointGeojsonString = multiPointGeojsonString.replace("]]", "]\n]");
371
372 writer.write(multiPointGeojsonString + "\n");
373 } catch(Exception e) {
374 logger.error("Unable to write multipoint geojson:\n**********************");
375 logger.error(multiPointGeojsonString);
376 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
377 logger.error(e.getMessage(), e);
378 }
379
380 return outFile.getAbsolutePath();
381
382 }
383
384 public String writeFeaturesGeoJsonToFile() {
385 final String filename = "geojson-features_" + this.geoJsonFilenameWithSuffix;
386 File outFile = new File(this.outputFolder, filename);
387
388 FeatureCollection featureColl = this.toFeatureCollection();
389 String featuresGeojsonString = FeatureConverter.toStringValue(featureColl);
390 System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n");
391 try (
392 Writer writer = new BufferedWriter(new FileWriter(outFile));
393 ) {
394
395 writer.write(featuresGeojsonString + "\n");
396 } catch(Exception e) {
397 logger.error("Unable to write multipoint geojson:\n**********************");
398 logger.error(featuresGeojsonString);
399 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
400 logger.error(e.getMessage(), e);
401 }
402
403 return outFile.getAbsolutePath();
404
405 }
406
407
408 public int getTotalCount() {
409 int total = 0;
410 for(JsonElement obj : this.countryCountsJsonArray) {
411 JsonObject json = obj.getAsJsonObject();
412 int count = json.get("count").getAsInt();
413 total += count;
414 }
415 return total;
416 }
417
418
419 // Unfinished and unused
420 public void parseCSVFile(String filename) throws Exception {
421 File csvData = new File(filename);
422 CSVParser parser = CSVParser.parse(csvData, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
423 for (CSVRecord csvRecord : parser) {
424 logger.info("Got record: " + csvRecord.toString());
425 }
426 }
427
428 public static void printUsage() {
429 System.err.println("CountryCodeCountsMapData <counts-by-countrycode-file>.json");
430 }
431
432 public static void main(String args[]) {
433 if(args.length != 1) {
434 printUsage();
435 System.exit(-1);
436 }
437
438 try {
439 File countsFile = new File(args[0]);
440
441 CountryCodeCountsMapData mapData = new CountryCodeCountsMapData(args[0]);
442
443 String multipointOutFileName = mapData.writeMultiPointGeoJsonToFile();
444 String featuresOutFileName = mapData.writeFeaturesGeoJsonToFile();
445
446 System.err.println("***********\nWrote mapdata to files " + multipointOutFileName
447 + " and " + featuresOutFileName);
448 System.err.println("You can paste the geojson contents of either of these files into the "
449 + "editor at http://geojson.tools/ to see the data arranged on a world map");
450
451 System.err.println("Total count for query: " + mapData.getTotalCount());
452
453 } catch(Exception e) {
454 logger.error(e.getMessage(), e);
455 }
456 }
457}
Note: See TracBrowser for help on using the repository browser.