source: other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java@ 33925

Last change on this file since 33925 was 33925, checked in by ak19, 4 years ago
  1. Bugfix: oversight, should return uri encoded URL for mapData, forgot to return modified string. Even with this correction, getting firefox screenshot of the full geojson still doesn't work with either the complex geojson or simple polygon shape, from command line or program.
File size: 18.2 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8import org.greenstone.util.SafeProcess;
9
10/**
11 * Runs some of the important mongoDB queries I ran.
12 *
13 * This program expects a folder ../mongodb-data-auto to exist.
14 *
15 * TO COMPILE OR RUN, FIRST DO:
16 * cd maori-lang-detection/apache-opennlp-1.9.1
17 * export OPENNLP_HOME=`pwd`
18 * cd maori-lang-detection/src
19 *
20 * TO COMPILE:
21 * maori-lang-detection/src$
22 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool.java
23 *
24 * TO RUN:
25 * maori-lang-detection/src$
26 * java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool
27 * OR:
28 * java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool ../mongodb-data/domainsNZ_IsMRI.txt
29 *
30*/
31public class SummaryTool {
32
33 static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName());
34
35
36 static private final String GEOJSON_FEATURES_FILE_PREFIX = "geojson-features_";
37
38 static private final long FIXED_SEED = 1000;
39
40 private final MongoDBQueryer mongodbQueryer;
41 private File outFolder;
42
43
44 public static class Tuple {
45 public final String url;
46 public final String countryCode;
47
48 public Tuple(String url, String countryCode) {
49 this.url = url;
50 this.countryCode = countryCode;
51 }
52
53 public String toString() {
54 return this.url + "," + countryCode;
55 }
56 }
57
58
/**
 * Creates a summary tool bound to a queryer and an output folder.
 * @param mongodbQueryer the queryer through which this tool runs its MongoDB queries
 * @param outFolder the folder in which this tool creates its output files
 */
public SummaryTool(MongoDBQueryer mongodbQueryer, File outFolder) {
    this.outFolder = outFolder;
    this.mongodbQueryer = mongodbQueryer;
}
64
65
66 public void produceURLsForPagesInMRI(File domainsFile) {
67 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
68 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
69 writeURLsToFile(urlsList, outFile, urlsList.size());
70
71 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
72 + Utility.getFilePath(outFile));
73 }
74
75 public void produceURLsForPagesContainingMRI(File domainsFile) {
76 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);
77 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
78 writeURLsToFile(urlsList, outFile, urlsList.size());
79
80 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
81 + Utility.getFilePath(outFile));
82 }
83
84 private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) {
85 ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
86
87 // 1. read each url from the domainsFile
88 // 1a. do the query
89 // 1b. add the arraylist result to urls
90
91 try (
92 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
93 ) {
94
95 String domain;
96
97 while((domain = reader.readLine()) != null) {
98 domain = domain.trim();
99 if(!domain.equals("")) {
100
101 String countryCode = "";
102 int index = domain.lastIndexOf(",");
103 if(index != -1) {
104 countryCode = domain.substring(index+1).trim();
105 domain = domain.substring(0, index);
106 }
107 ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType);
108
109 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
110 if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
111 System.out.println(" " + countryCode + " domain " + domain + " had no webpages where isMRI=true - only containsMRI.");
112 }
113
114 //urlsList.addAll(moreURLs);
115 for(int i = 0; i < moreURLs.size(); i++) {
116 urlsList.add(new Tuple(moreURLs.get(i), countryCode));
117 }
118
119 }
120 }
121 System.err.println("");
122 } catch(Exception e) {
123 logger.error("Unable to read URLs from file " + Utility.getFilePath(domainsFile));
124 logger.error(e.getMessage(), e);
125 }
126
127 return urlsList;
128 }
129
130 /** Given a hand curated list of all sites with positive numPagesContainingMRI
131 * determined by manual inspection, get a listing of all their web pages that
132 * are IN_MRI (or CONTAINS_MRI?).
133 * Total all these pages that are inMRI (N), then work out the correct sample size (n)
134 * at 90% confidence with 5% margin of error. Then generate a random listing
135 * of n of these pages in MRI of these trusted sites and output to a file
136 * for manual inspection of the sample webpage URLs at page-level. */
137 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
138 * of all their web pages IN_MRI (or CONTAINS_MRI).
139 * Plus a listing of all the NZ pages IN_MRI. */
140 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
141 public void mriWebPageListingForDomainListing(File domainsFile) {
142
143 int filterType = MongoDBQueryer.IS_MRI;
144
145 // for overseas websites,
146 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
147
148 // 0. get a list of all the web pages in the given domain listing where isMRI = true
149 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
150 // produceURLsForPagesInMRI(domainsFile);
151
152 // 1. calculate the population size, N, the number of all webpages in the given domain
153 // site listing where isMRI = true.
154 int N_totalNumPages = urlsList.size();
155
156 // 2. write all the URLs in urlsList to a file
157 //File outFolder = domainsFile.getParentFile();
158 String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI";
159 File outFile = new File(outFolder, filterName+"_full_"+domainsFile.getName());
160
161 writeURLsToFile(urlsList, outFile, N_totalNumPages);
162 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
163 + "\ninto file: " + Utility.getFilePath(outFile));
164
165 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
166 int n_numSampleURLs = calcSampleSize(N_totalNumPages);
167
168 System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist file: " + N_totalNumPages);
169 System.err.println(" (out of " + mongodbQueryer.countOfWebpagesMatching(filterType)
170 + " web pages across ALL sites for which " + filterName + " = true)");
171 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
172
173 // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
174 // Using a constant seed for reproducibility
175 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
176 Collections.shuffle(urlsList, new Random(FIXED_SEED));
177
178 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
179 writeURLsToFile(urlsList, outFile, n_numSampleURLs);
180 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
181 + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
182
183 // For N = 6557, z-alpha-over-2 = 1.6449 and m = 0.05 (5%),
184 // n = (z-alpha-over-2^2 x N) / (z-alpha-over-2^2 + 4 x (N-1) x m^2)
185 // = (1.6449^2×6557) ÷ (1.6449^2 + 4 × 6556×0.05^2) = 259.88526851 => 260 rounded up. Check.
186 }
187
188 /**
189 * Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
190 * for given population size N.
191 * @return n, the sample size.
192 */
193 public int calcSampleSize(int N) {
194
195 // calculate sample size n for population size N if using 90% confidence and 5% margin of error
196 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
197 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
198 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
199
200 double m = 0.05; // margin of error = 5%
201 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
202 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
203 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
204 double z_alpha_over_2 = 1.6449;
205
206 // Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
207 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
208 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
209
210 // Round up to get a whole number:
211 return (int)Math.ceil(n);
212 }
213
214 /**
215 * Writes out the first n URLs in urlsList into outFile.
216 */
217 private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
218 try (
219 Writer writer = new BufferedWriter(new FileWriter(outFile));
220 ) {
221
222 for (int i=0; i < n; i++) {
223 Tuple urlInfo = urlsList.get(i);
224
225 //System.out.println(list.get(i));
226 writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
227 }
228 } catch(Exception e) {
229 logger.error("Unable to write to file " + Utility.getFilePath(outFile));
230 logger.error(e.getMessage(), e);
231 }
232 }
233
234 /* ---------------------------------------- */
235 /**
236 * Create the file 5counts_containsMRISites_allNZGrouped.json
237 * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages
238 * that CONTAIN_MRI, followed by counts and domains listing for overseas sites
239 * that CONTAIN_MRI.
240 * @return full path of file generated
241 */
242 public String writeContainsMRISites_nzSitesAndTLDsGrouped() {
243
244 File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json");
245
246 String filename = Utility.getFilePath(outFile);
247
248 try (
249 Writer writer = new BufferedWriter(new FileWriter(outFile));
250 ) {
251 // first write out NZ sites and .nz TLD count and domains
252 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
253 // next write out all overseas sites (not NZ origin or .nz TLD)
254 // that have no "mi" in the URL path as mi.* or */mi
255 boolean isMiInURLPath = false;
256 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI);
257
258 } catch(Exception e) {
259 logger.error("Unable to write to file " + filename);
260 logger.error(e.getMessage(), e);
261 }
262
263 System.err.println("*** Wrote file: " + filename);
264
265 return filename;
266 }
267
268 /**
269 * Create the file 5a_counts_tentativeNonAutotranslatedSites.json
270 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
271 * followed by counts and domain listing for overseas sites that are either from Australia
272 * or don't contain mi in their URL path.
273 * @return full path of file generated
274 */
275 public String writeTentativeNonAutotranslatedSites() {
276
277 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
278
279 String filename = Utility.getFilePath(outFile);
280
281 try (
282 Writer writer = new BufferedWriter(new FileWriter(outFile));
283 ) {
284 // first write out NZ sites and .nz TLD count and domains
285 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
286 // next write out all overseas sites (not NZ origin or .nz TLD)
287 // that have no "mi" in the URL path as mi.* or */mi
288 boolean isMiInURLPath = false;
289 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
290
291 } catch(Exception e) {
292 logger.error("Unable to write to file " + filename);
293 logger.error(e.getMessage(), e);
294 }
295
296 System.err.println("*** Wrote file: " + filename);
297
298 return filename;
299 }
300
301 /**
302 * Create the file 5b_counts_overseasSitesWithMiInPath.json
303 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
304 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
305 * This listing is separate to allow easier weeding out of product sites/autotranslated
306 * sites when eyeballing the listing output.
307 */
308 public String writeOverseasSitesWithMiInURLPath() {
309 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
310
311 String filename = Utility.getFilePath(outFile);
312 try (
313 Writer writer = new BufferedWriter(new FileWriter(outFile));
314 ) {
315 boolean isMiInURLPath = true;
316 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
317
318 } catch(Exception e) {
319 logger.error("Unable to write to file " + filename);
320 logger.error(e.getMessage(), e);
321 }
322
323 System.err.println("*** Wrote file: " + filename);
324 return filename;
325 }
326
327 public static void printUsage() {
328 System.err.println("Usage: SummaryTool [domains.txt]");
329 }
330
331 /**
332 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
333 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
334 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
335 * translated and really contain at least one webpage containing at least one sentence in MRI.
336 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
337 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
338 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
339 * 90% confidence with 5% margin of error for testing binary outcomes, see
340 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
341 */
342 public static void main(String args[]) {
343 if(args.length >= 2) {
344 printUsage();
345 System.exit(-1);
346 }
347
348 try (
349 MongoDBQueryer mongodb = new MongoDBQueryer();
350 ) {
351
352 mongodb.connectToDB();
353
354 // output files will be stored in mongodb-data-auto
355 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
356 SummaryTool listing = new SummaryTool(mongodb, outFolder);
357
358 System.out.println("*************************************");
359
360
361 if(args.length >= 1) { // web page urls listing mode
362 File domainsFile = new File(args[0]);
363 if(!domainsFile.exists()) {
364 System.err.println("File " + domainsFile + " does not exist");
365 System.exit(-1);
366 }
367
368 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
369 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
370
371
372 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
373 // then also do the shuffle to gen X num of random web page URLs.
374 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
375 listing.mriWebPageListingForDomainListing(domainsFile);
376
377 // TODO: generate the special table (6)
378
379 } else {
380
381 // calculating sample size works:
382 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
383 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
384
385 // get all sites where >0 pages have containsMRI=true
386 // grouping NZ sites and .nz TLDs together and remainder under overseas
387 // geolocations.
388 String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped();
389
390 // separately:
391 // - all NZ containsMRI + overseas tentative non-product sites with containMRI
392 // - overseas tentative product sites with containMRI
393 filename = listing.writeTentativeNonAutotranslatedSites();
394 filename = listing.writeOverseasSitesWithMiInURLPath();
395
396 // TODO: generate the tables
397
398 String[] tableFileNames = mongodb.writeTables(outFolder);
399 // for each table file name, generate the geojson-features .json file
400 // that GEOJSON_MAP_TOOL_URL takes as input to produce a map.
401
402 for(int i = 1; i < tableFileNames.length; i++) { // empty element at 0
403 String tablefilename = tableFileNames[i] + ".json"; // filenames have no suffix
404
405 File countsTableFile = new File(outFolder, tablefilename);
406 if(!countsTableFile.exists()) {
407 logger.error("@@@ File " + countsTableFile + " does not exist!");
408 logger.error("@@@ Can't generate map date for this.");
409 continue;
410 }
411 String countsTableFilename = outFolder + File.separator + tablefilename;
412 CountryCodeCountsMapData mapData
413 = new CountryCodeCountsMapData(countsTableFilename);
414 String geoJsonFilename = mapData.writeFeaturesGeoJsonToFile(CountryCodeCountsMapData.SUPPRESS_MAPDATA_DISPLAY);
415
416
417 /*
418 // Ensure the geo-json file generated exists
419 //String geoJsonFilename = outFolder + File.separator
420 //+ GEOJSON_FEATURES_FILE_PREFIX + tablefilename;
421
422 File geoJsonFile = new File(geoJsonFilename);
423 if(!geoJsonFile.exists()) {
424 System.err.println("@@@ geoJson file " + geoJsonFilename + " not generated!");
425 continue;
426 }
427 */
428 System.err.println("**** Wrote mapdata to file " + geoJsonFilename);
429 //System.err.println(" Paste the file's geojson contents into "
430 //+ "the editor at " + CountryCodeCountsMapData.GEOJSON_MAP_TOOL_URL
431 //+ " to see the data arranged on a world map");
432 System.err.println("Total count for query: " + mapData.getTotalCount());
433
434 // Running the command:
435 // python -mjson.tool file.json
436 // to confirm if the JSON in the provided json file parses/is valid JSON.
437
438 String[] cmdArgs = {"python", "-mjson.tool", geoJsonFilename};
439 SafeProcess p = new SafeProcess(cmdArgs);
440 int retVal = p.runProcess();
441 logger.info("\nJson validation tool returned with value: " + retVal);
442 if(retVal == 0) {
443 logger.info("Valid geo json.");
444 } else {
445 logger.error(" std out: " + p.getStdOutput());
446 logger.error(" std err: " + p.getStdError());
447 logger.error("INVALID geo json generated. Not attempting screenshot of map.");
448 continue;
449 }
450
451
452 /*boolean uriEncoded = true;
453 String mapDataEncodedStr = mapData.getFeaturesGeoJsonString(uriEncoded);
454 System.err.println("Encoded string: " + mapDataEncodedStr);
455 */
456
457 System.err.println("Data URL string: " + mapData.getAsMapURL());
458 System.err.println();
459 mapData.geoJsonMapScreenshot(outFolder, tablefilename);
460 System.err.println("---");
461
462 // TODO: breaks after first table -> map conversion
463 break;
464 }
465
466 }
467
468 System.out.println("*************************************");
469 } catch(Exception e) {
470 logger.error(e.getMessage(), e);
471 }
472 }
473}
Note: See TracBrowser for help on using the repository browser.