source: other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java@ 33869

Last change on this file since 33869 was 33869, checked in by ak19, 4 years ago

First cut at the RandomURLsForDomainGenerator.java class and the mongodb method it needs added to MongoDBAccess. Still need to generate a domainURLs file to start testing whether the code even works. But at least it finally compiles.

File size: 22.7 KB
Line 
1package org.greenstone.atea;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.File;
6import java.io.FileReader;
7import java.io.FileWriter;
8import java.io.Writer;
9
10import java.util.HashMap;
11import java.util.LinkedList;
12import java.util.List;
13import java.util.Map;
14
15//import java.lang.Math; //automatically imported apparently
16
17import org.apache.commons.csv.*;
18import org.apache.log4j.Logger;
19
20// Google's gson imports for parsing any kind of json
21import com.google.gson.JsonArray;
22import com.google.gson.JsonElement;
23import com.google.gson.JsonObject;
24import com.google.gson.JsonParser;
25
26// For working with GeoJSON's Simple Features in Java
27import mil.nga.sf.geojson.Feature;
28import mil.nga.sf.geojson.FeatureCollection;
29import mil.nga.sf.geojson.FeatureConverter;
30import mil.nga.sf.geojson.Geometry;
31import mil.nga.sf.geojson.MultiPoint;
32import mil.nga.sf.geojson.Polygon;
33import mil.nga.sf.geojson.Position;
34
35
36/**
37 * Run a mongodb query that produces counts per countrycode like in the following 2 examples:
38 *
39 * 1. count of country codes for all sites
40 * db.Websites.aggregate([
41 *
42 * { $unwind: "$geoLocationCountryCode" },
43 * {
44 * $group: {
45 * _id: "$geoLocationCountryCode",
46 * count: { $sum: 1 }
47 * }
48 * },
49 * { $sort : { count : -1} }
50 * ]);
51 *
52 * Then store the mongodb query result's JSON format output in a file called "counts.json".
53 * Then run this program with counts.json as parameter
54 * Copy the geojson output into http://geojson.tools/
55 *
56 * 2. count of country codes for sites that have at least one page detected as MRI
57 *
58 * db.Websites.aggregate([
59 * {
60 * $match: {
61 * numPagesInMRI: {$gt: 0}
62 * }
63 * },
64 * { $unwind: "$geoLocationCountryCode" },
65 * {
66 * $group: {
67 * _id: {$toLower: '$geoLocationCountryCode'},
68 * count: { $sum: 1 }
69 * }
70 * },
71 * { $sort : { count : -1} }
72 * ]);
73 *
74 * Store the mongodb query result's JSON format output in a file called "counts_sitesWithPagesInMRI.json".
75 * Then run this program with counts_sitesWithPagesInMRI.json as parameter.
76 * Copy the geojson output into http://geojson.tools/
77 *
78 * ##################
79 * TO COMPILE:
80 * maori-lang-detection/src$
81 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData.java
82 *
83 * TO RUN:
84 * maori-lang-detection/src$
85 * java -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData ../mongodb-data/counts.json
86 *###################
87 *
88 * This class needs the gson library, and now the sf-geojson(-2.02).jar and
89 * helper jars sf(-2.02).jar and 3 jackson jars too,
90 * to create and store Simple Features geo json objects with Java.
91 * I copied the gson jar file from GS3.
92 *
93 * Simple Features GeoJSON Java
94 * https://ngageoint.github.io/simple-features-geojson-java/ - links to API and more
95 *
96 * https://mvnrepository.com/artifact/mil.nga.sf/sf-geojson (https://github.com/ngageoint/simple-features-geojson-java/)
97 *
98 * Also need the basic data types used by the Geometry objects above:
99 * https://mvnrepository.com/artifact/mil.nga/sf (https://github.com/ngageoint/simple-features-java)
100 *
101 * Further helper jars needed (because of encountering the exception documented at
102 * stackoverflow.com/questions/36278293/java-lang-classnotfoundexception-com-fasterxml-jackson-core-jsonprocessingexcep/36279872)
103 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core/2.10.0
104 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind
105 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0
106 */
107public class CountryCodeCountsMapData {
108 static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName());
109
110 //Map<String, JsonObject> countryToJsonMap;
111 JsonArray countryCodesJsonArray;
112 JsonArray countryCountsJsonArray;
113
114 // North-central Antarctica coords
115 private final double ANTARCTICA_LNG = 57.0d;
116 private final double ANTARCTICA_LAT = -70.0d;
117 // For EU coords, spot in Atlantic Ocean close to western European coast.
118 private final double EU_LNG = -20.0d;
119 private final double EU_LAT = 50.0d;
120
121 private final String geoJsonFilenameWithSuffix;
122 private final File outputFolder;
123
124 public CountryCodeCountsMapData(String countryCountsJSONFilename) throws Exception {
125
126 // work out the unique filename we're going to save the geojson files under
127 // and the folder we're going to save them into
128 File countryCountsJSONFile = new File(countryCountsJSONFilename);
129 String tailname = countryCountsJSONFile.getName();
130 this.geoJsonFilenameWithSuffix = (tailname.startsWith("counts_")) ? tailname.substring("counts_".length()) : tailname;
131 this.outputFolder = countryCountsJSONFile.getParentFile().getCanonicalFile(); // canonical resolves any .. and . in path
132
133 // locate the countrycodes.json file
134 File countryCoordsJSONFile = new File(this.getClass().getClassLoader().getResource("countrycodes.json").getFile());
135
136 // Create a map of ALL country code names to ALL the country code json objects
137 // that contain the location (lat, lng) info for each country code
138 Map<String, JsonObject> countryToJsonMap = new HashMap<String, JsonObject>();
139
140 // Parse json file of country codes and put into a JsonArray.
141 // then put into map of each country code to its JsonObject.
142 countryCodesJsonArray = parseJSONFile(countryCoordsJSONFile);
143 for(JsonElement obj : countryCodesJsonArray) {
144 JsonObject countryCodeJson = obj.getAsJsonObject();
145 countryToJsonMap.put(countryCodeJson.get("country").getAsString(), countryCodeJson);
146 }
147
148 // Parse json file of country code counts
149 // Then for each JsonObject in this file,
150 // find a match on its country code in the map created above to get a country code JsonObject
151 // Get the longitude and latitude of the JsonObject that matched that country code.
152 // Add this lng,lat location information to the current JsonObject from the counts file.
153 countryCountsJsonArray = parseJSONFile(countryCountsJSONFile);
154
155 for(JsonElement obj : countryCountsJsonArray) {
156 JsonObject json = obj.getAsJsonObject();
157 String countryCode = json.get("_id").getAsString().toUpperCase();
158 // set the property back as uppercase and with property name "countrycode" instead of "_id"
159 json.remove("_id");
160 json.addProperty("countrycode", countryCode);
161
162 int count = (int)json.get("count").getAsDouble();
163
164 //logger.info("Got country code: " + countryCode);
165 //logger.info(" count: " + count);
166
167 // locate in countryCode map
168 JsonObject countryCodeJson = countryToJsonMap.get(countryCode);
169
170 if(countryCodeJson != null) {
171 //logger.info("Found in map: " + countryCodeJson.toString());
172
173 // for geojson, want longitude then latitude
174 Double lng = countryCodeJson.get("longitude").getAsDouble();
175 Double lat = countryCodeJson.get("latitude").getAsDouble();
176 //logger.info("long: " + Double.toString(lng) + ", lat: " + Double.toString(lat));
177 String countryName = countryCodeJson.get("name").getAsString();
178
179 // let's add lat and lng fields to countryCounts object
180 json.addProperty("lng", lng); // adds Number: https://javadoc.io/static/com.google.code.gson/gson/2.8.5/com/google/gson/JsonObject.html
181 json.addProperty("lat", lat);
182 json.addProperty("region", countryName);
183
184 } else {
185 logger.info("No geolocation info found for country code " + countryCode);
186 if(countryCode.equals("EU")) {
187 //logger.info("Unlisted country code: EU");
188 // add lat and lng for Europe
189 json.addProperty("lng", EU_LNG);
190 json.addProperty("lat", EU_LAT);
191 json.addProperty("region", "Europe");
192 }
193 else if(countryCode.equals("UNKNOWN")) {
194 //logger.info("Unlisted country code: UNKNOWN");
195 // add lat and lng for Antarctica
196 json.addProperty("lng", ANTARCTICA_LNG);
197 json.addProperty("lat", ANTARCTICA_LAT);
198 json.addProperty("region", "UNKNOWN");
199 } else {
200 logger.error("ERROR: entirely unknown country code: " + countryCode);
201 }
202 }
203 }
204
205 }
206
207 /** Convert mongodb tabular output of json records stored in the given file
208 * into a JsonArray.
209 */
210 public JsonArray parseJSONFile(File file) throws Exception {
211 JsonArray jsonArray = null;
212 // read into string
213 try (
214 BufferedReader reader = new BufferedReader(new FileReader(file));
215 ) {
216 StringBuilder str = //new StringBuilder();
217 new StringBuilder("[");
218 String line;
219
220 boolean multi_line_comment = false;
221
222 while((line = reader.readLine()) != null) {
223 line = line.trim();
224
225 // ignore any single line comments nested in multi-line symbols
226 if(line.startsWith("/*") && line.endsWith("*/")) {
227 continue; // skip line
228 }
229
230 // skip multi-line comments spread over multiple lines
231 // assumes this ends on a line containing */ without further content on the line.
232 if(line.startsWith("/*") && !line.endsWith("*/")) {
233 multi_line_comment = true;
234 continue; // skip line
235 }
236 if(multi_line_comment) {
237 if(line.contains("*/")) {
238 multi_line_comment = false;
239 }
240
241 continue; // we're in a comment or at end of comment, skip line
242 }
243
244 str.append(line);
245 if(line.endsWith("}")) {
246 str.append(",\n");
247 }
248 }
249 // replace last comma with closing bracket
250 String fileContents = str.substring(0, str.length()-2) + "]";
251
252 //System.err.println("Got file:\n" + fileContents);
253
254 // https://stackoverflow.com/questions/2591098/how-to-parse-json-in-java
255 jsonArray = new JsonParser().parse(fileContents).getAsJsonArray();
256
257 } catch(Exception e) {
258 throw e;
259 }
260
261
262 return jsonArray;
263 }
264
265 /**
266 * Reading
267 * https://www.here.xyz/api/concepts/geojsonbasics/
268 * https://ngageoint.github.io/simple-features-geojson-java/docs/api/
269 *
270 * https://stackoverflow.com/questions/55621480/cant-access-coordinates-member-of-geojson-feature-collection
271 *
272 * Downloaded geojson simple features' jar file from maven, but it didn't work:
273 * a more private version of MultiPoint.java is not included in the jar file (there's only
274 * mil.nga.sf.geojson.MultiPoint , whereas
275 * mil.nga.sf.MultiPoint is missing
276 *
277 * This seems to have gone wrong at
278 * https://github.com/ngageoint/simple-features-geojson-java/tree/master/src/main/java/mil/nga/sf
279 * but the one at
280 * https://github.com/ngageoint/simple-features-java/tree/master/src/main/java/mil/nga/sf
281 * has it. So I've been trying to build that, but don't have the correct version of maven.
282 */
283 public Geometry toMultiPointGeoJson() {
284 //System.err.println("toGeoJSON() is not yet implemented.");
285
286 List<Position> points = new LinkedList<Position>();
287
288 for(JsonElement obj : this.countryCountsJsonArray) {
289 JsonObject json = obj.getAsJsonObject();
290 Double lng = json.get("lng").getAsDouble();
291 Double lat = json.get("lat").getAsDouble();
292
293 Position point = new Position(lng, lat);
294 points.add(point);
295 }
296
297 Geometry multiPoint = new MultiPoint(points);
298
299 return multiPoint;
300 }
301
302 // https://javadoc.io/static/com.google.code.gson/gson/2.8.5/index.html
303 public FeatureCollection toFeatureCollection() {
304 final int HISTOGRAM_WIDTH = 4;
305
306 FeatureCollection featureCollection = new FeatureCollection();
307
308 for(JsonElement obj : this.countryCountsJsonArray) {
309 JsonObject json = obj.getAsJsonObject();
310
311 String countryCode = json.get("countrycode").getAsString();
312 String region = json.get("region").getAsString();
313 int count = json.get("count").getAsInt();
314
315 // make a histogram for each country
316 Geometry rectangle = this.toPolygon(json, count, HISTOGRAM_WIDTH);
317
318 Feature countryFeature = new Feature(rectangle);
319 Map<String, Object> featureProperties = new HashMap<String, Object>();
320 featureProperties.put("count", new Integer(count));
321 featureProperties.put("code", countryCode);
322 featureProperties.put("region", region);
323 countryFeature.setProperties(featureProperties);
324
325 featureCollection.addFeature(countryFeature);
326 }
327
328 return featureCollection;
329 }
330
331 // create rectangular "histogram" for each country code
332 private Geometry toPolygon(JsonObject json, final int count, final int HISTOGRAM_WIDTH) {
333 int half_width = HISTOGRAM_WIDTH/2;
334 double vertical_factor = 1.0;
335
336 final Double lng = json.get("lng").getAsDouble();
337 final Double lat = json.get("lat").getAsDouble();
338
339 String countryCode = json.get("countrycode").getAsString();
340
341
342 //create the 4 corners of the rectangle
343 // West is negative, east is positive, south is negative, north is positive
344 // See http://www.learnz.org.nz/sites/learnz.org.nz/files/lat-long-geo-data-01_0.jpg
345 // But since the histograms grow vertically/northwards and we can't go past a latitude of 90,
346 // to compensate, we increase the width of the histograms by the same factor as our inability
347 // to grow northwards.
348 Double north = lat + (vertical_factor * count);
349
350 while (north > 90) {
351 // recalculate north after decreasing histogram's vertical growth
352 // by the same factor as we increase its width
353 vertical_factor = vertical_factor/2.0;
354 half_width = 2 * half_width;
355 north = lat + (vertical_factor * count);
356 }
357 Double east = lng + half_width;
358 Double west = lng - half_width;
359 Double south = lat;
360
361 List<Position> pts = recalculateAreaIfLarge(count, HISTOGRAM_WIDTH, countryCode, lat, lng, north, south, east, west);
362
363 /*
364 System.err.println("For country " + countryCode + ":");
365 System.err.println("north = " + north);
366 System.err.println("south = " + south);
367 System.err.println("east = " + east);
368 System.err.println("west = " + west + "\n");
369 System.err.println("-------------");
370 */
371
372 List<List<Position>> outerList = new LinkedList<List<Position>>();
373 if(pts != null) {
374 outerList.add(pts);
375 } else {
376
377
378 List<Position> points = new LinkedList<Position>();
379 outerList.add(points);
380
381 points.add(new Position(west, south)); // Position(lng, lat) not Position(lat, lng)
382 points.add(new Position(west, north));
383 points.add(new Position(east, north));
384 points.add(new Position(east, south));
385 }
386
387 Geometry rectangle = new Polygon(outerList);
388
389 // Coords: a List of List of Positions, see https://ngageoint.github.io/simple-features-geojson-java/docs/api/
390 // https://www.here.xyz/api/concepts/geojsonbasics/#polygon
391
392 return rectangle;
393 }
394
395 private List<Position> recalculateAreaIfLarge(final int count, final int HISTOGRAM_WIDTH, String countryCode,
396 final Double lat, final Double lng,
397 Double north, Double south, Double east, Double west) {
398 boolean recalculated = false;
399
400 // Check if we're dealing with very large numbers, in which case, we can have follow off the longitude edges
401 // Max longitude values are -180 to 180. So a max of 360 units between them. (Max latitude is -90 to 90)
402 // "Longitude is in the range -180 and +180 specifying coordinates west and east of the Prime Meridian, respectively.
403 // For reference, the Equator has a latitude of 0°, the North pole has a latitude of 90° north (written 90° N or +90°),
404 // and the South pole has a latitude of -90°."
405 if((east + Math.abs(west)) > 360 || east > 180 || west < -180) {
406 System.err.println("For country " + countryCode + ":");
407 System.err.println("north = " + north);
408 System.err.println("south = " + south);
409 System.err.println("east = " + east);
410 System.err.println("west = " + west + "\n");
411
412 int half_width = HISTOGRAM_WIDTH/2; // reset half_width
413
414 double v_tmp_count = Math.sqrt(count);
415 //double h_tmp_count = Math.floor(v_tmp_count);
416 //v_tmp_count = Math.ceil(v_tmp_count);
417 double h_tmp_count = v_tmp_count;
418System.err.println("count = " + count);
419 System.err.println("v = " + v_tmp_count);
420 System.err.println("h = " + h_tmp_count);
421 System.err.println("lat = " + lat);
422 System.err.println("lng = " + lng + "\n");
423
424 if(h_tmp_count > 90) { // 360 max width, of which each longitude
425 // is 4 units (horizontal factor = 4, and half-width is half
426 // of that). So max width/h_tmp_count allowed 90 => 360
427 // longitude on map (-180 to 180).
428 // Put the excess h_tmp_count into v_tmp_count and ensure
429 // that does not go over 90+90 = 180 max. Vertical_factor is 1.
430
431 System.err.println("Out of longitude range. Attempting to compensate...");
432
433 double diff = h_tmp_count - 80.0; // actually 90 wraps on geojson tools, 80 doesn't
434 h_tmp_count -= diff;
435 v_tmp_count = (count/h_tmp_count);
436
437 if(v_tmp_count > 180 || h_tmp_count > 90) {
438 System.err.println("Warning: still exceeded max latitude and/or longitude range");
439 }
440
441 }
442
443 System.err.println("Recalculating polygon for country with high count: " + countryCode + ".");
444 System.err.println("count = " + count);
445 System.err.println("v = " + v_tmp_count);
446 System.err.println("h = " + h_tmp_count);
447 System.err.println("lat = " + lat);
448 System.err.println("lng = " + lng + "\n");
449
450
451 north = lat + v_tmp_count;
452 south = lat;
453 east = lng + (h_tmp_count * half_width); // a certain width, half_width, represents one unit in the x axis
454 west = lng - (h_tmp_count * half_width);
455
456 /*
457 System.err.println("north = " + north);
458 System.err.println("south = " + south);
459 System.err.println("east = " + east);
460 System.err.println("west = " + west + "\n");
461 */
462
463 if(north > 90) {
464 // centre vertically on lat
465 north = lat + (v_tmp_count/2);
466 south = lat - (v_tmp_count/2);
467 }
468
469 if(west < -180.0) {
470 double h_diff = -180.0 - west; // west is a larger negative value than -180, so subtracting west from -180 produces a positive h_diff value
471 west = -180.0; // set to extreme western edge
472 east = east + h_diff;
473 }
474 else if(east > 180.0) {
475 double h_diff = east - 180.0; // the country's longitude (lng) is h_diff from the eastern edge
476 east = 180.0; // maximise eastern edge
477 west = west - h_diff; // then grow the remainder of h_tmp_count in the opposite (western/negative) direction
478 }
479
480 // NOTE: Can't centre on country, (lat,lng), as we don't know whether either of lat or lng has gone past the edge
481
482 // Hopefully we don't exceed +90/-90 lat and +/-180 longitude
483
484 recalculated = true;
485
486
487 } else if(west < -140.0) {
488 // past -140 west, the edges don't wrap well in geotools, so shift any points more west/negative than -140:
489
490 double diff = -140.0 - west;
491 west = -140.0;
492 east += diff;
493
494 recalculated = true;
495 }
496
497 if(recalculated) {
498 System.err.println("\nnorth = " + north);
499 System.err.println("south = " + south);
500 System.err.println("east = " + east);
501 System.err.println("west = " + west);
502
503
504 List<Position> points = new LinkedList<Position>();
505
506 points.add(new Position(west, south)); // Position(lng, lat) not Position(lat, lng)
507 points.add(new Position(west, north));
508 points.add(new Position(east, north));
509 points.add(new Position(east, south));
510
511 return points;
512 }
513
514 return null;
515 }
516
517 public String writeMultiPointGeoJsonToFile() {
518 final String filename = "multipoint_" + this.geoJsonFilenameWithSuffix;
519 File outFile = new File(this.outputFolder, filename);
520
521 Geometry geometry = this.toMultiPointGeoJson();
522 String multiPointGeojsonString = FeatureConverter.toStringValue(geometry);
523 System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n");
524 try (
525 Writer writer = new BufferedWriter(new FileWriter(outFile));
526 ) {
527
528 // Some basic re-formatting for some immediate legibility
529 // But pasting the contents of the file (or the System.err output above)
530 // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway.
531 multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t[");
532 multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t[");
533 multiPointGeojsonString = multiPointGeojsonString.replace("]]", "]\n]");
534
535 writer.write(multiPointGeojsonString + "\n");
536 } catch(Exception e) {
537 logger.error("Unable to write multipoint geojson:\n**********************");
538 logger.error(multiPointGeojsonString);
539 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
540 logger.error(e.getMessage(), e);
541 }
542
543 return outFile.getAbsolutePath();
544
545 }
546
547 public String writeFeaturesGeoJsonToFile() {
548 final String filename = "geojson-features_" + this.geoJsonFilenameWithSuffix;
549 File outFile = new File(this.outputFolder, filename);
550
551 FeatureCollection featureColl = this.toFeatureCollection();
552 String featuresGeojsonString = FeatureConverter.toStringValue(featureColl);
553 System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n");
554 try (
555 Writer writer = new BufferedWriter(new FileWriter(outFile));
556 ) {
557
558 writer.write(featuresGeojsonString + "\n");
559 } catch(Exception e) {
560 logger.error("Unable to write multipoint geojson:\n**********************");
561 logger.error(featuresGeojsonString);
562 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
563 logger.error(e.getMessage(), e);
564 }
565
566 return outFile.getAbsolutePath();
567
568 }
569
570
571 public int getTotalCount() {
572 int total = 0;
573 for(JsonElement obj : this.countryCountsJsonArray) {
574 JsonObject json = obj.getAsJsonObject();
575 int count = json.get("count").getAsInt();
576 total += count;
577 }
578 return total;
579 }
580
581
582 // Unfinished and unused
583 public void parseCSVFile(String filename) throws Exception {
584 File csvData = new File(filename);
585 CSVParser parser = CSVParser.parse(csvData, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
586 for (CSVRecord csvRecord : parser) {
587 logger.info("Got record: " + csvRecord.toString());
588 }
589 }
590
591 public static void printUsage() {
592 System.err.println("CountryCodeCountsMapData <counts-by-countrycode-file>.json");
593 }
594
595 public static void main(String args[]) {
596 if(args.length != 1) {
597 printUsage();
598 System.exit(-1);
599 }
600
601 try {
602 File countsFile = new File(args[0]);
603 if(!countsFile.exists()) {
604 System.err.println("File " + countsFile + " does not exist");
605 System.exit(-1);
606 }
607
608 CountryCodeCountsMapData mapData = new CountryCodeCountsMapData(args[0]);
609
610 String multipointOutFileName = mapData.writeMultiPointGeoJsonToFile();
611 String featuresOutFileName = mapData.writeFeaturesGeoJsonToFile();
612
613 System.err.println("***********\nWrote mapdata to files " + multipointOutFileName
614 + " and " + featuresOutFileName);
615 System.err.println("You can paste the geojson contents of either of these files into the "
616 + "editor at http://geojson.tools/ to see the data arranged on a world map");
617
618 System.err.println("Total count for query: " + mapData.getTotalCount());
619
620 } catch(Exception e) {
621 logger.error(e.getMessage(), e);
622 }
623 }
624}
Note: See TracBrowser for help on using the repository browser.