source: other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java@ 33800

Last change on this file since 33800 was 33800, checked in by ak19, 4 years ago

Removed an adult site from crawled contents and added its url to blacklist conf file (for if ever anyone crawls our MRI set of common crawl sites again)

File size: 14.9 KB
Line 
1package org.greenstone.atea;
2
3import java.io.BufferedReader;
4import java.io.BufferedWriter;
5import java.io.File;
6import java.io.FileReader;
7import java.io.FileWriter;
8import java.io.Writer;
9
10import java.util.HashMap;
11import java.util.LinkedList;
12import java.util.List;
13import java.util.Map;
14
15import org.apache.commons.csv.*;
16import org.apache.log4j.Logger;
17
18// Google's gson imports for parsing any kind of json
19import com.google.gson.JsonArray;
20import com.google.gson.JsonElement;
21import com.google.gson.JsonObject;
22import com.google.gson.JsonParser;
23
24// For working with GeoJSON's Simple Features in Java
25import mil.nga.sf.geojson.Feature;
26import mil.nga.sf.geojson.FeatureCollection;
27import mil.nga.sf.geojson.FeatureConverter;
28import mil.nga.sf.geojson.Geometry;
29import mil.nga.sf.geojson.MultiPoint;
30import mil.nga.sf.geojson.Polygon;
31import mil.nga.sf.geojson.Position;
32
33
34/** Simple Features GeoJSON Java
35 * https://ngageoint.github.io/simple-features-geojson-java/ - links to API and more
36 *
37 * https://mvnrepository.com/artifact/mil.nga.sf/sf-geojson (https://github.com/ngageoint/simple-features-geojson-java/)
38 *
39 * Also need the basic data types used by the Geometry objects above:
40 * https://mvnrepository.com/artifact/mil.nga/sf (https://github.com/ngageoint/simple-features-java)
41 *
42 * Further helper jars needed (because of encountering the exception documented at
43 * stackoverflow.com/questions/36278293/java-lang-classnotfoundexception-com-fasterxml-jackson-core-jsonprocessingexcep/36279872)
44 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core/2.10.0
45 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind
46 * https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations/2.10.0
47 */
48
49/**
50 * This class needs the gson library, and now the sf(-2.02).jar and sf-geojson(-2.02).jar files too
51 * to create and store Simple Features geo json objects with Java.
52 * I copied the gson jar file from GS3.
53 *
54 * TO COMPILE:
55 * maori-lang-detection/src$
56 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData.java
57 *
58 * TO RUN:
59 * maori-lang-detection/src$
60 * java -cp ".:../conf:../lib/*" org/greenstone/atea/CountryCodeCountsMapData ../mongodb-data/countrycodes.json ../mongodb-data/counts.json
61 *
62 */
63public class CountryCodeCountsMapData {
64 static Logger logger = Logger.getLogger(org.greenstone.atea.CountryCodeCountsMapData.class.getName());
65
66 //Map<String, JsonObject> countryToJsonMap;
67 JsonArray countryCodesJsonArray;
68 JsonArray countryCountsJsonArray;
69
70 // North-central Antarctica coords
71 private final double ANTARCTICA_LNG = 57.0d;
72 private final double ANTARCTICA_LAT = -70.0d;
73 // For EU coords, spot in Atlantic Ocean close to western European coast.
74 private final double EU_LNG = -20.0d;
75 private final double EU_LAT = 50.0d;
76
77 public CountryCodeCountsMapData(String countryCoordsJSONFile, String countryCountsJSONFile) throws Exception {
78
79 // map of country codes to lat, lng json for that country code
80 Map<String, JsonObject> countryToJsonMap = new HashMap<String, JsonObject>();
81
82 // Parse json file of country codes and put into a JsonArray.
83 // then put into map of each country code to its JsonObject.
84 countryCodesJsonArray = parseJSONFile(countryCoordsJSONFile);
85 for(JsonElement obj : countryCodesJsonArray) {
86 JsonObject countryCodeJson = obj.getAsJsonObject();
87 countryToJsonMap.put(countryCodeJson.get("country").getAsString(), countryCodeJson);
88 }
89
90 // Parse json file of country code counts
91 // Then for each JsonObject in this file,
92 // find a match on its country code in the map created above to get a country code JsonObject
93 // Get the longitude and latitude of the JsonObject that matched that country code.
94 // Add this lng,lat location information to the current JsonObject from the counts file.
95 countryCountsJsonArray = parseJSONFile(countryCountsJSONFile);
96
97 for(JsonElement obj : countryCountsJsonArray) {
98 JsonObject json = obj.getAsJsonObject();
99 String countryCode = json.get("_id").getAsString().toUpperCase();
100 // set the property back as uppercase and with property name "countrycode" instead of "_id"
101 json.remove("_id");
102 json.addProperty("countrycode", countryCode);
103
104 int count = (int)json.get("count").getAsDouble();
105
106 //logger.info("Got country code: " + countryCode);
107 //logger.info(" count: " + count);
108
109 // locate in countryCode map
110 JsonObject countryCodeJson = countryToJsonMap.get(countryCode);
111
112 if(countryCodeJson != null) {
113 //logger.info("Found in map: " + countryCodeJson.toString());
114
115 // for geojson, want longitude then latitude
116 Double lng = countryCodeJson.get("longitude").getAsDouble();
117 Double lat = countryCodeJson.get("latitude").getAsDouble();
118 //logger.info("long: " + Double.toString(lng) + ", lat: " + Double.toString(lat));
119 String countryName = countryCodeJson.get("name").getAsString();
120
121 // let's add lat and lng fields to countryCounts object
122 json.addProperty("lng", lng); // adds Number: https://javadoc.io/static/com.google.code.gson/gson/2.8.5/com/google/gson/JsonObject.html
123 json.addProperty("lat", lat);
124 json.addProperty("region", countryName);
125
126 } else {
127 logger.info("No geolocation info found for country code " + countryCode);
128 if(countryCode.equals("EU")) {
129 //logger.info("Unlisted country code: EU");
130 // add lat and lng for Europe
131 json.addProperty("lng", EU_LNG);
132 json.addProperty("lat", EU_LAT);
133 json.addProperty("region", "Europe");
134 }
135 else if(countryCode.equals("UNKNOWN")) {
136 //logger.info("Unlisted country code: UNKNOWN");
137 // add lat and lng for Antarctica
138 json.addProperty("lng", ANTARCTICA_LNG);
139 json.addProperty("lat", ANTARCTICA_LAT);
140 json.addProperty("region", "UNKNOWN");
141 } else {
142 logger.error("ERROR: entirely unknown country code: " + countryCode);
143 }
144 }
145 }
146
147 }
148
149 /** Convert mongodb tabular output of json records stored in the given file
150 * into a JsonArray.
151 */
152 public JsonArray parseJSONFile(String filename) throws Exception {
153 JsonArray jsonArray = null;
154 // read into string
155 try (
156 BufferedReader reader = new BufferedReader(new FileReader(filename));
157 ) {
158 StringBuilder str = //new StringBuilder();
159 new StringBuilder("[");
160 String line;
161 while((line = reader.readLine()) != null) {
162 line = line.replaceAll("/\\* [^\\/]* \\*/", "");
163 str.append(line);
164 if(line.endsWith("}")) {
165 str.append(",\n");
166 }
167 }
168 // replace last comma with closing bracket
169 String fileContents = str.substring(0, str.length()-2) + "]";
170
171 //System.err.println("Got file:\n" + fileContents);
172
173 // https://stackoverflow.com/questions/2591098/how-to-parse-json-in-java
174 jsonArray = new JsonParser().parse(fileContents).getAsJsonArray();
175
176 } catch(Exception e) {
177 throw e;
178 }
179
180
181 return jsonArray;
182 }
183
184 /**
185 * Reading
186 * https://www.here.xyz/api/concepts/geojsonbasics/
187 * https://ngageoint.github.io/simple-features-geojson-java/docs/api/
188 *
189 * https://stackoverflow.com/questions/55621480/cant-access-coordinates-member-of-geojson-feature-collection
190 *
191 * Downloaded geojson simple features' jar file from maven, but it didn't work:
192 * a more private version of MultiPoint.java is not included in the jar file (there's only
193 * mil.nga.sf.geojson.MultiPoint , whereas
194 * mil.nga.sf.MultiPoint is missing
195 *
196 * This seems to have gone wrong at
197 * https://github.com/ngageoint/simple-features-geojson-java/tree/master/src/main/java/mil/nga/sf
198 * but the one at
199 * https://github.com/ngageoint/simple-features-java/tree/master/src/main/java/mil/nga/sf
200 * has it. So I've been trying to build that, but don't have the correct version of maven.
201 */
202 public Geometry toMultiPointGeoJson() {
203 //System.err.println("toGeoJSON() is not yet implemented.");
204
205 List<Position> points = new LinkedList<Position>();
206
207 for(JsonElement obj : this.countryCountsJsonArray) {
208 JsonObject json = obj.getAsJsonObject();
209 Double lng = json.get("lng").getAsDouble();
210 Double lat = json.get("lat").getAsDouble();
211
212 Position point = new Position(lng, lat);
213 points.add(point);
214 }
215
216 Geometry multiPoint = new MultiPoint(points);
217
218 return multiPoint;
219 }
220
221 // https://javadoc.io/static/com.google.code.gson/gson/2.8.5/index.html
222 public FeatureCollection toFeatureCollection() {
223 final int HISTOGRAM_WIDTH = 4;
224
225 FeatureCollection featureCollection = new FeatureCollection();
226
227 for(JsonElement obj : this.countryCountsJsonArray) {
228 JsonObject json = obj.getAsJsonObject();
229
230 String countryCode = json.get("countrycode").getAsString();
231 String region = json.get("region").getAsString();
232 int count = json.get("count").getAsInt();
233
234 // make a histogram for each country
235 Geometry rectangle = this.toPolygon(json, count, HISTOGRAM_WIDTH);
236
237 Feature countryFeature = new Feature(rectangle);
238 Map<String, Object> featureProperties = new HashMap<String, Object>();
239 featureProperties.put("count", new Integer(count));
240 featureProperties.put("code", countryCode);
241 featureProperties.put("region", region);
242 countryFeature.setProperties(featureProperties);
243
244 featureCollection.addFeature(countryFeature);
245 }
246
247 return featureCollection;
248 }
249
250 // create rectangular "histogram" for each country code
251 private Geometry toPolygon(JsonObject json, int count, int HISTOGRAM_WIDTH) {
252 int half_width = HISTOGRAM_WIDTH/2;
253 double vertical_factor = 1.0;
254
255 Double lng = json.get("lng").getAsDouble();
256 Double lat = json.get("lat").getAsDouble();
257
258 String countryCode = json.get("countrycode").getAsString();
259
260
261 //create the 4 corners of the rectangle
262 // West is negative, east is positive, south is negative, north is positive
263 // See http://www.learnz.org.nz/sites/learnz.org.nz/files/lat-long-geo-data-01_0.jpg
264 // But since the histograms grow vertically/northwards and we can't go past a latitude of 90,
265 // to compensate, we increase the width of the histograms by the same factor as our inability
266 // to grow northwards.
267 Double north = lat + (vertical_factor * count);
268 while (north > 90) {
269 // recalculate north after decreasing histogram's vertical growth
270 // by the same factor as we increase its width
271 vertical_factor = vertical_factor/2.0;
272 half_width = 2 * half_width;
273 north = lat + (vertical_factor * count);
274 }
275 Double east = lng + half_width;
276 Double west = lng - half_width;
277 Double south = lat;
278
279 List<List<Position>> outerList = new LinkedList<List<Position>>();
280 List<Position> points = new LinkedList<Position>();
281 outerList.add(points);
282
283
284 points.add(new Position(west, south)); // Position(lng, lat) not Position(lat, lng)
285 points.add(new Position(west, north));
286 points.add(new Position(east, north));
287 points.add(new Position(east, south));
288
289
290 Geometry rectangle = new Polygon(outerList);
291
292 // Coords: a List of List of Positions, see https://ngageoint.github.io/simple-features-geojson-java/docs/api/
293 // https://www.here.xyz/api/concepts/geojsonbasics/#polygon
294
295 return rectangle;
296 }
297
298 public String writeMultiPointGeoJsonToFile(File folder) {
299 final String filename = "multipoint.json";
300 File outFile = new File(folder, filename);
301
302 Geometry geometry = this.toMultiPointGeoJson();
303 String multiPointGeojsonString = FeatureConverter.toStringValue(geometry);
304 System.err.println("\nMap data as MultiPoint geometry:\n" + multiPointGeojsonString + "\n");
305 try (
306 Writer writer = new BufferedWriter(new FileWriter(outFile));
307 ) {
308
309 // Some basic re-formatting for some immediate legibility
310 // But pasting the contents of the file (or the System.err output above)
311 // directly into http://geojson.tools/ will instantly reformat the json perfectly anyway.
312 multiPointGeojsonString = multiPointGeojsonString.replace("[[", "\n[\n\t[");
313 multiPointGeojsonString = multiPointGeojsonString.replace("],[", "],\n\t[");
314 multiPointGeojsonString = multiPointGeojsonString.replace("]]", "]\n]");
315
316 writer.write(multiPointGeojsonString + "\n");
317 } catch(Exception e) {
318 logger.error("Unable to write multipoint geojson:\n**********************");
319 logger.error(multiPointGeojsonString);
320 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
321 logger.error(e.getMessage(), e);
322 }
323
324 return outFile.getAbsolutePath();
325
326 }
327
328 public String writeFeaturesGeoJsonToFile(File folder) {
329 final String filename = "geojson-features.json";
330 File outFile = new File(folder, filename);
331
332 FeatureCollection featureColl = this.toFeatureCollection();
333 String featuresGeojsonString = FeatureConverter.toStringValue(featureColl);
334 System.err.println("\nMap data as featurecollection:\n" + featuresGeojsonString + "\n");
335 try (
336 Writer writer = new BufferedWriter(new FileWriter(outFile));
337 ) {
338
339 writer.write(featuresGeojsonString + "\n");
340 } catch(Exception e) {
341 logger.error("Unable to write multipoint geojson:\n**********************");
342 logger.error(featuresGeojsonString);
343 logger.error("**********************\ninto file " + outFile.getAbsolutePath());
344 logger.error(e.getMessage(), e);
345 }
346
347 return outFile.getAbsolutePath();
348
349 }
350
351
352 public int getTotalCount() {
353 int total = 0;
354 for(JsonElement obj : this.countryCountsJsonArray) {
355 JsonObject json = obj.getAsJsonObject();
356 int count = json.get("count").getAsInt();
357 total += count;
358 }
359 return total;
360 }
361
362
363 // Unfinished and unused
364 public void parseCSVFile(String filename) throws Exception {
365 File csvData = new File(filename);
366 CSVParser parser = CSVParser.parse(csvData, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
367 for (CSVRecord csvRecord : parser) {
368 logger.info("Got record: " + csvRecord.toString());
369 }
370 }
371
372 public static void printUsage() {
373 System.err.println("CountryCodeCountsMapData countrycodes.json counts.json");
374 }
375
376 public static void main(String args[]) {
377 if(args.length != 2) {
378 printUsage();
379 System.exit(-1);
380 }
381
382 try {
383 File countsFile = new File(args[1]);
384 File parentFolder = countsFile.getParentFile().getCanonicalFile(); // canonical resolves any .. and . in path
385
386 CountryCodeCountsMapData mapData = new CountryCodeCountsMapData(args[0], args[1]);
387
388 String multipointOutFileName = mapData.writeMultiPointGeoJsonToFile(parentFolder);
389 String featuresOutFileName = mapData.writeFeaturesGeoJsonToFile(parentFolder);
390
391 System.err.println("***********\nWrote mapdata to files " + multipointOutFileName
392 + " and " + featuresOutFileName);
393 System.err.println("You can paste the geojson contents of either of these files into the "
394 + "editor at http://geojson.tools/ to see the data arranged on a world map");
395
396 System.err.println("Total count for query: " + mapData.getTotalCount());
397
398 } catch(Exception e) {
399 logger.error(e.getMessage(), e);
400 }
401 }
402}
Note: See TracBrowser for help on using the repository browser.