source: other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java@ 33917

Last change on this file since 33917 was 33917, checked in by ak19, 4 years ago

Added some better reporting when confirming sample size was correct

File size: 15.0 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9 * Runs some of the important mongoDB queries I ran.
10 *
11 * TO COMPILE OR RUN, FIRST DO:
12 * cd maori-lang-detection/apache-opennlp-1.9.1
13 * export OPENNLP_HOME=`pwd`
14 * cd maori-lang-detection/src
15 *
16 * TO COMPILE:
17 * maori-lang-detection/src$
18 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool.java
19 *
20 * TO RUN:
21 * maori-lang-detection/src$
22 * java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool
23 * OR:
24 * java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool ../mongodb-data/domainsNZ_IsMRI.txt
25 *
26*/
27public class SummaryTool {
28 static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName());
29 static private final long FIXED_SEED = 1000;
30
31 private final MongoDBQueryer mongodbQueryer;
32 private File outFolder;
33
34
35
36 public static class Tuple {
37 public final String url;
38 public final String countryCode;
39
40 public Tuple(String url, String countryCode) {
41 this.url = url;
42 this.countryCode = countryCode;
43 }
44
45 public String toString() {
46 return this.url + "," + countryCode;
47 }
48 }
49
50
51 public SummaryTool(MongoDBQueryer mongodbQueryer, File outFolder)
52 {
53 this.mongodbQueryer = mongodbQueryer;
54 this.outFolder = outFolder;
55 }
56
57
58 public void produceURLsForPagesInMRI(File domainsFile) {
59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
60 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
61 writeURLsToFile(urlsList, outFile, urlsList.size());
62
63 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
64 + Utility.getFilePath(outFile));
65 }
66
67 public void produceURLsForPagesContainingMRI(File domainsFile) {
68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);
69 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
70 writeURLsToFile(urlsList, outFile, urlsList.size());
71
72 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
73 + Utility.getFilePath(outFile));
74 }
75
76 private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) {
77 ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
78
79 // 1. read each url from the domainsFile
80 // 1a. do the query
81 // 1b. add the arraylist result to urls
82
83 try (
84 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
85 ) {
86
87 String domain;
88
89 while((domain = reader.readLine()) != null) {
90 domain = domain.trim();
91 if(!domain.equals("")) {
92
93 String countryCode = "";
94 int index = domain.lastIndexOf(",");
95 if(index != -1) {
96 countryCode = domain.substring(index+1).trim();
97 domain = domain.substring(0, index);
98 }
99 ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType);
100
101 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
102 if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
103 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
104 }
105
106 //urlsList.addAll(moreURLs);
107 for(int i = 0; i < moreURLs.size(); i++) {
108 urlsList.add(new Tuple(moreURLs.get(i), countryCode));
109 }
110
111 }
112 }
113 System.err.println("");
114 } catch(Exception e) {
115 logger.error("Unable to read URLs from file " + Utility.getFilePath(domainsFile));
116 logger.error(e.getMessage(), e);
117 }
118
119 return urlsList;
120 }
121
122 /** Given a hand curated list of all sites with positive numPagesContainingMRI
123 * determined by manual inspection, get a listing of all their web pages that
124 * are IN_MRI (or CONTAINS_MRI?).
125 * Total all these pages that are inMRI (N), then work out the correct sample size (n)
126 * at 90% confidence with 5% margin of error. Then generate a random listing
127 * of n of these pages in MRI of these trusted sites and output to a file
128 * for manual inspection of the sample webpage URLs at page-level. */
129 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
130 * of all their web pages IN_MRI (or CONTAINS_MRI).
131 * Plus a listing of all the NZ pages IN_MRI. */
132 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
133 public void mriWebPageListingForDomainListing(File domainsFile) {
134
135 int filterType = MongoDBQueryer.IS_MRI;
136
137 // for overseas websites,
138 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
139
140 // 0. get a list of all the web pages in the given domain listing where isMRI = true
141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
142 // produceURLsForPagesInMRI(domainsFile);
143
144 // 1. calculate the population size, N, the number of all webpages in the given domain
145 // site listing where isMRI = true.
146 int N_totalNumPages = urlsList.size();
147
148 // 2. write all the URLs in urlsList to a file
149 //File outFolder = domainsFile.getParentFile();
150 String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI";
151 File outFile = new File(outFolder, filterName+"_"+domainsFile.getName());
152
153 writeURLsToFile(urlsList, outFile, N_totalNumPages);
154 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
155 + "\ninto file: " + Utility.getFilePath(outFile));
156
157 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
158 int n_numSampleURLs = calcSampleSize(N_totalNumPages);
159
160 System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist: " + N_totalNumPages);
161 System.err.println(" (out of " + mongodbQueryer.countOfWebpagesMatching(filterType)
162 + " web pages across ALL sites for which " + filterName + " = true)");
163 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
164
165 // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
166 // Using a constant seed for reproducibility
167 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
168 Collections.shuffle(urlsList, new Random(FIXED_SEED));
169
170 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
171 writeURLsToFile(urlsList, outFile, n_numSampleURLs);
172 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
173 + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
174 }
175
176 /**
177 * Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
178 * for given population size N.
179 * @return n, the sample size.
180 */
181 public int calcSampleSize(int N) {
182
183 // calculate sample size n for population size N if using 90% confidence and 5% margin of error
184 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
185 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
186 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
187
188 double m = 0.05; // margin of error = 5%
189 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
190 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
191 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
192 double z_alpha_over_2 = 1.6449;
193
194 // Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
195 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
196 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
197
198 // Round up to get a whole number:
199 return (int)Math.ceil(n);
200 }
201
202 /**
203 * Writes out the first n URLs in urlsList into outFile.
204 */
205 private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
206 try (
207 Writer writer = new BufferedWriter(new FileWriter(outFile));
208 ) {
209
210 for (int i=0; i < n; i++) {
211 Tuple urlInfo = urlsList.get(i);
212
213 //System.out.println(list.get(i));
214 writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
215 }
216 } catch(Exception e) {
217 logger.error("Unable to write to file " + Utility.getFilePath(outFile));
218 logger.error(e.getMessage(), e);
219 }
220 }
221
222 /* ---------------------------------------- */
223 /**
224 * Create the file 5counts_containsMRISites_allNZGrouped.json
225 * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages
226 * that CONTAIN_MRI, followed by counts and domains listing for overseas sites
227 * that CONTAIN_MRI.
228 * @return full path of file generated
229 */
230 public String writeContainsMRISites_nzSitesAndTLDsGrouped() {
231
232 File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json");
233
234 String filename = Utility.getFilePath(outFile);
235
236 try (
237 Writer writer = new BufferedWriter(new FileWriter(outFile));
238 ) {
239 // first write out NZ sites and .nz TLD count and domains
240 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
241 // next write out all overseas sites (not NZ origin or .nz TLD)
242 // that have no "mi" in the URL path as mi.* or */mi
243 boolean isMiInURLPath = false;
244 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI);
245
246 } catch(Exception e) {
247 logger.error("Unable to write to file " + filename);
248 logger.error(e.getMessage(), e);
249 }
250
251 System.err.println("*** Wrote file: " + filename);
252
253 return filename;
254 }
255
256 /**
257 * Create the file 5a_counts_tentativeNonAutotranslatedSites.json
258 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
259 * followed by counts and domain listing for overseas sites that are either from Australia
260 * or don't contain mi in their URL path.
261 * @return full path of file generated
262 */
263 public String writeTentativeNonAutotranslatedSites() {
264
265 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
266
267 String filename = Utility.getFilePath(outFile);
268
269 try (
270 Writer writer = new BufferedWriter(new FileWriter(outFile));
271 ) {
272 // first write out NZ sites and .nz TLD count and domains
273 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
274 // next write out all overseas sites (not NZ origin or .nz TLD)
275 // that have no "mi" in the URL path as mi.* or */mi
276 boolean isMiInURLPath = false;
277 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
278
279 } catch(Exception e) {
280 logger.error("Unable to write to file " + filename);
281 logger.error(e.getMessage(), e);
282 }
283
284 System.err.println("*** Wrote file: " + filename);
285
286 return filename;
287 }
288
289 /**
290 * Create the file 5b_counts_overseasSitesWithMiInPath.json
291 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
292 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
293 * This listing is separate to allow easier weeding out of product sites/autotranslated
294 * sites when eyeballing the listing output.
295 */
296 public String writeOverseasSitesWithMiInURLPath() {
297 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
298
299 String filename = Utility.getFilePath(outFile);
300 try (
301 Writer writer = new BufferedWriter(new FileWriter(outFile));
302 ) {
303 boolean isMiInURLPath = true;
304 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
305
306 } catch(Exception e) {
307 logger.error("Unable to write to file " + filename);
308 logger.error(e.getMessage(), e);
309 }
310
311 System.err.println("*** Wrote file: " + filename);
312 return filename;
313 }
314
315 public static void printUsage() {
316 System.err.println("Usage: SummaryTool [domains.txt]");
317 }
318
319 /**
320 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
321 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
322 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
323 * translated and really contain at least one webpage containing at least one sentence in MRI.
324 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
325 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
326 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
327 * 90% confidence with 5% margin of error for testing binary outcomes, see
328 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
329 */
330 public static void main(String args[]) {
331 if(args.length >= 2) {
332 printUsage();
333 System.exit(-1);
334 }
335
336 try (
337 MongoDBQueryer mongodb = new MongoDBQueryer();
338 ) {
339
340 mongodb.connectToDB();
341
342 // output files will be stored in mongodb-data-auto
343 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
344 SummaryTool listing = new SummaryTool(mongodb, outFolder);
345
346 System.out.println("*************************************");
347
348
349 if(args.length >= 1) { // web page urls listing mode
350 File domainsFile = new File(args[0]);
351 if(!domainsFile.exists()) {
352 System.err.println("File " + domainsFile + " does not exist");
353 System.exit(-1);
354 }
355
356 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
357 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
358
359
360 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
361 // then also do the shuffle to gen X num of random web page URLs.
362 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
363 listing.mriWebPageListingForDomainListing(domainsFile);
364
365 // TODO: generate the special table (6)
366
367 } else {
368
369 // calculating sample size works:
370 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
371 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
372
373 // get all sites where >0 pages have containsMRI=true
374 // grouping NZ sites and .nz TLDs together and remainder under overseas
375 // geolocations.
376 String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped();
377
378 // separately:
379 // - all NZ containsMRI + overseas tentative non-product sites with containMRI
380 // - overseas tentative product sites with containMRI
381 filename = listing.writeTentativeNonAutotranslatedSites();
382 filename = listing.writeOverseasSitesWithMiInURLPath();
383
384 // TODO: generate the tables
385
386 mongodb.writeTables(outFolder);
387 }
388
389 System.out.println("*************************************");
390 } catch(Exception e) {
391 logger.error(e.getMessage(), e);
392 }
393 }
394}
Note: See TracBrowser for help on using the repository browser.