Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java@ 33911

Last change on this file since 33911 was 33911, checked in by ak19, 4 years ago
Correct commit message for previous and current commit: 1. After refactoring MongoDBAccess class into additional subclass MongoDBQueryer, I split the import statements accordingly too. 2. Renamed WebPageURLsListing.java class to SummaryTool.java
File size: 14.8 KB

Line
1	package org.greenstone.atea;
2
3	import java.util.*;
4	import java.io.*;
5
6	import org.apache.log4j.Logger;
7
8	/**
9	* Runs some of the important mongoDB queries I ran.
10	*
11	* TO COMPILE OR RUN, FIRST DO:
12	* cd maori-lang-detection/apache-opennlp-1.9.1
13	* export OPENNLP_HOME=`pwd`
14	* cd maori-lang-detection/src
15	*
16	* TO COMPILE:
17	* maori-lang-detection/src$
18	* javac -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool.java
19	*
20	* TO RUN:
21	* maori-lang-detection/src$
22	* java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool
23	* OR:
24	* java -cp ".:../conf:../lib/*" org/greenstone/atea/SummaryTool ../mongodb-data/domainsNZ_IsMRI.txt
25	*
26	*/
27	public class SummaryTool {
28	static Logger logger = Logger.getLogger(org.greenstone.atea.SummaryTool.class.getName());
29	static private final long FIXED_SEED = 1000;
30
31	private final MongoDBQueryer mongodbQueryer;
32	private File outFolder;
33
34
35
36	public static class Tuple {
37	public final String url;
38	public final String countryCode;
39
40	public Tuple(String url, String countryCode) {
41	this.url = url;
42	this.countryCode = countryCode;
43	}
44
45	public String toString() {
46	return this.url + "," + countryCode;
47	}
48	}
49
50
/**
 * Creates a summary tool.
 * @param mongodbQueryer the queryer that all the database queries are run through
 * @param outFolder the folder that generated output files are created in
 */
public SummaryTool(MongoDBQueryer mongodbQueryer, File outFolder) {
    this.outFolder = outFolder;
    this.mongodbQueryer = mongodbQueryer;
}
56
57
58	public void produceURLsForPagesInMRI(File domainsFile) {
59	ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
60	File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
61	writeURLsToFile(urlsList, outFile, urlsList.size());
62
63	System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
64	+ Utility.getFilePath(outFile));
65	}
66
67	public void produceURLsForPagesContainingMRI(File domainsFile) {
68	ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile);
69	File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
70	writeURLsToFile(urlsList, outFile, urlsList.size());
71
72	System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
73	+ Utility.getFilePath(outFile));
74	}
75
76	private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) {
77	ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
78
79	// 1. read each url from the domainsFile
80	// 1a. do the query
81	// 1b. add the arraylist result to urls
82
83	try (
84	BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
85	) {
86
87	String domain;
88
89	while((domain = reader.readLine()) != null) {
90	domain = domain.trim();
91	if(!domain.equals("")) {
92
93	String countryCode = "";
94	int index = domain.lastIndexOf(",");
95	if(index != -1) {
96	countryCode = domain.substring(index+1).trim();
97	domain = domain.substring(0, index);
98	}
99	ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType);
100
101	// Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
102	if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) {
103	System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
104	}
105
106	//urlsList.addAll(moreURLs);
107	for(int i = 0; i < moreURLs.size(); i++) {
108	urlsList.add(new Tuple(moreURLs.get(i), countryCode));
109	}
110
111	}
112	}
113	System.err.println("");
114	} catch(Exception e) {
115	logger.error("Unable to read URLs from file " + Utility.getFilePath(domainsFile));
116	logger.error(e.getMessage(), e);
117	}
118
119	return urlsList;
120	}
121
122	/** Given a hand curated list of all sites with positive numPagesContainingMRI
123	* determined by manual inspection, get a listing of all their web pages that
124	* are IN_MRI (or CONTAINS_MRI?).
125	* Total all these pages that are inMRI (N), then work out the correct sample size (n)
126	* at 90% confidence with 5% margin of error. Then generate a random listing
127	* of n of these pages in MRI of these trusted sites and output to a file
128	* for manual inspection of the sample webpage URLs at page-level. */
129	/* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
130	* of all their web pages IN_MRI (or CONTAINS_MRI).
131	* Plus a listing of all the NZ pages IN_MRI. */
132	//public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
133	public void mriWebPageListingForDomainListing(File domainsFile) {
134
135	int filterType = MongoDBQueryer.IS_MRI;
136
137	// for overseas websites,
138	//produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
139
140	// 0. get a list of all the web pages in the given domain listing where isMRI = true
141	ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile);
142	// produceURLsForPagesInMRI(domainsFile);
143
144	// 1. calculate the population size, N, the number of all webpages in the given domain
145	// site listing where isMRI = true.
146	int N_totalNumPages = urlsList.size();
147
148	// 2. write all the URLs in urlsList to a file
149	//File outFolder = domainsFile.getParentFile();
150	String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_";
151	File outFile = new File(outFolder, fileName+domainsFile.getName());
152
153	writeURLsToFile(urlsList, outFile, N_totalNumPages);
154	System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
155	+ "\ninto file: " + Utility.getFilePath(outFile));
156
157	// 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
158	int n_numSampleURLs = calcSampleSize(N_totalNumPages);
159
160	System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
161	System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
162
163	// 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
164	// Using a constant seed for reproducibility
165	// https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
166	Collections.shuffle(urlsList, new Random(FIXED_SEED));
167
168	outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
169	writeURLsToFile(urlsList, outFile, n_numSampleURLs);
170	System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
171	+ "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
172	}
173
174	/**
175	* Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
176	* for given population size N.
177	* @return n, the sample size.
178	*/
179	public int calcSampleSize(int N) {
180
181	// calculate sample size n for population size N if using 90% confidence and 5% margin of error
182	// https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
183	// https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
184	// https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
185
186	double m = 0.05; // margin of error = 5%
187	// for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
188	// For 90% confidence, use the table of known z_alpha/2 values from step 1 of
189	// https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
190	double z_alpha_over_2 = 1.6449;
191
192	// Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
193	// see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
194	double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
195
196	// Round up to get a whole number:
197	return (int)Math.ceil(n);
198	}
199
200	/**
201	* Writes out the first n URLs in urlsList into outFile.
202	*/
203	private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
204	try (
205	Writer writer = new BufferedWriter(new FileWriter(outFile));
206	) {
207
208	for (int i=0; i < n; i++) {
209	Tuple urlInfo = urlsList.get(i);
210
211	//System.out.println(list.get(i));
212	writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
213	}
214	} catch(Exception e) {
215	logger.error("Unable to write to file " + Utility.getFilePath(outFile));
216	logger.error(e.getMessage(), e);
217	}
218	}
219
220	/* ---------------------------------------- */
221	/**
222	* Create the file 5counts_containsMRISites_allNZGrouped.json
223	* that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages
224	* that CONTAIN_MRI, followed by counts and domains listing for overseas sites
225	* that CONTAIN_MRI.
226	* @return full path of file generated
227	*/
228	public String writeContainsMRISites_nzSitesAndTLDsGrouped() {
229
230	File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json");
231
232	String filename = Utility.getFilePath(outFile);
233
234	try (
235	Writer writer = new BufferedWriter(new FileWriter(outFile));
236	) {
237	// first write out NZ sites and .nz TLD count and domains
238	mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
239	// next write out all overseas sites (not NZ origin or .nz TLD)
240	// that have no "mi" in the URL path as mi.* or */mi
241	boolean isMiInURLPath = false;
242	mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI);
243
244	} catch(Exception e) {
245	logger.error("Unable to write to file " + filename);
246	logger.error(e.getMessage(), e);
247	}
248
249	System.err.println("*** Wrote file: " + filename);
250
251	return filename;
252	}
253
254	/**
255	* Create the file 5a_counts_tentativeNonAutotranslatedSites.json
256	* that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
257	* followed by counts and domain listing for overseas sites that are either from Australia
258	* or don't contain mi in their URL path.
259	* @return full path of file generated
260	*/
261	public String writeTentativeNonAutotranslatedSites() {
262
263	File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
264
265	String filename = Utility.getFilePath(outFile);
266
267	try (
268	Writer writer = new BufferedWriter(new FileWriter(outFile));
269	) {
270	// first write out NZ sites and .nz TLD count and domains
271	mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI);
272	// next write out all overseas sites (not NZ origin or .nz TLD)
273	// that have no "mi" in the URL path as mi.* or */mi
274	boolean isMiInURLPath = false;
275	mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
276
277	} catch(Exception e) {
278	logger.error("Unable to write to file " + filename);
279	logger.error(e.getMessage(), e);
280	}
281
282	System.err.println("*** Wrote file: " + filename);
283
284	return filename;
285	}
286
287	/**
288	* Create the file 5b_counts_overseasSitesWithMiInPath.json
289	* Listing of the remainder of overseas sites that CONTAIN_MRI not included by
290	* writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
291	* This listing is separate to allow easier weeding out of product sites/autotranslated
292	* sites when eyeballing the listing output.
293	*/
294	public String writeOverseasSitesWithMiInURLPath() {
295	File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
296
297	String filename = Utility.getFilePath(outFile);
298	try (
299	Writer writer = new BufferedWriter(new FileWriter(outFile));
300	) {
301	boolean isMiInURLPath = true;
302	mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath);
303
304	} catch(Exception e) {
305	logger.error("Unable to write to file " + filename);
306	logger.error(e.getMessage(), e);
307	}
308
309	System.err.println("*** Wrote file: " + filename);
310	return filename;
311	}
312
313	public static void printUsage() {
314	System.err.println("Usage: SummaryTool [domains.txt]");
315	}
316
317	/**
318	* If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
319	* with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
320	* You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
321	* translated and really contain at least one webpage containing at least one sentence in MRI.
322	* If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
323	* matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
324	* isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
325	* 90% confidence with 5% margin of error for testing binary outcomes, see
326	* https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
327	*/
328	public static void main(String args[]) {
329	if(args.length >= 2) {
330	printUsage();
331	System.exit(-1);
332	}
333
334	try (
335	MongoDBQueryer mongodb = new MongoDBQueryer();
336	) {
337
338	mongodb.connectToDB();
339
340	// output files will be stored in mongodb-data-auto
341	File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
342	SummaryTool listing = new SummaryTool(mongodb, outFolder);
343
344	System.out.println("*************************************");
345
346
347	if(args.length >= 1) { // web page urls listing mode
348	File domainsFile = new File(args[0]);
349	if(!domainsFile.exists()) {
350	System.err.println("File " + domainsFile + " does not exist");
351	System.exit(-1);
352	}
353
354	//String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
355	//String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
356
357
358	// TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
359	// then also do the shuffle to gen X num of random web page URLs.
360	//String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
361	listing.mriWebPageListingForDomainListing(domainsFile);
362
363	// TODO: generate the special table (6)
364
365	} else {
366
367	// calculating sample size works:
368	//System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
369	//System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
370
371	// get all sites where >0 pages have containsMRI=true
372	// grouping NZ sites and .nz TLDs together and remainder under overseas
373	// geolocations.
374	String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped();
375
376	// separately:
377	// - all NZ containsMRI + overseas tentative non-product sites with containMRI
378	// - overseas tentative product sites with containMRI
379	filename = listing.writeTentativeNonAutotranslatedSites();
380	filename = listing.writeOverseasSitesWithMiInURLPath();
381
382	// TODO: generate the tables
383
384	mongodb.writeTables(outFolder);
385	}
386
387	System.out.println("*************************************");
388	} catch(Exception e) {
389	logger.error(e.getMessage(), e);
390	}
391	}
392	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: