source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@33884

Last change on this file since 33884 was 33884, checked in by ak19, 4 years ago
  1. The previous commit had lots of modifications, and only 2 files matched its simple commit message of "clarifications". The code changes in the previous commit incorporated the processing of a domains file (of curated sites) and wrote out all web pages in each of those sites where isMRI=true, then calculated a representative sample size n out of the N total isMRI web pages, shuffled that list of isMRI web pages and wrote out the first n web page URLs in that list. 2. This commit: incorporating the country code alongside URLs, as Dr Bainbridge requested.
File size: 13.0 KB
package org.greenstone.atea;

import java.util.*;
import java.io.*;

import org.apache.log4j.Logger;

/**
 * Runs some of the important mongoDB queries I ran.
 *
 * TO COMPILE OR RUN, FIRST DO:
 * cd maori-lang-detection/apache-opennlp-1.9.1
 * export OPENNLP_HOME=`pwd`
 * cd maori-lang-detection/src
 *
 * TO COMPILE:
 * maori-lang-detection/src$
 *    javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
 *
 * TO RUN:
 * maori-lang-detection/src$
 *    java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
 *
 */
public class WebPageURLsListing {
    static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
    static private final long FIXED_SEED = 1000;

    private final MongoDBAccess mongodbAccess;
    private File outFolder;


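    /**
     * Simple holder for a (url, countryCode) pair; toString() renders it as the
     * "url,countryCode" line format that writeURLsToFile() below writes out.
     */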
    public static class Tuple {
        public final String url;
        public final String countryCode;

        public Tuple(String url, String countryCode) {
            this.url = url;
            this.countryCode = countryCode;
        }

        public String toString() {
            return this.url + "," + countryCode;
        }
    }


    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
    {
        this.mongodbAccess = mongodbAccess;
        this.outFolder = outFolder;
    }

    private String getFilePath(File file) {
        try {
            return file.getCanonicalPath();
        } catch(IOException e) {
            return file.getAbsolutePath();
        }
    }

    public void produceURLsForPagesInMRI(File domainsFile) {
        ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
        File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
        writeURLsToFile(urlsList, outFile, urlsList.size());

        System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
            + getFilePath(outFile));
    }

    public void produceURLsForPagesContainingMRI(File domainsFile) {
        ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);
        File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
        writeURLsToFile(urlsList, outFile, urlsList.size());

        System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
            + getFilePath(outFile));
    }

    private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) {
        ArrayList<Tuple> urlsList = new ArrayList<Tuple>();

        // 1. read each url from the domainsFile
        // 1a. do the query
        // 1b. add the arraylist result to urls
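        // Each non-empty line of domainsFile is expected to look like "<domain>[,<countryCode>]",
        // e.g. "someiwisite.example.nz,NZ" (a hypothetical example): everything after the last comma
        // is taken as the country code, and the part before it is the domain passed to the query.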

        try (
            BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
        ) {

            String domain;

            while((domain = reader.readLine()) != null) {
                domain = domain.trim();
                if(!domain.equals("")) {

                    String countryCode = "";
                    int index = domain.lastIndexOf(",");
                    if(index != -1) {
                        countryCode = domain.substring(index+1).trim();
                        domain = domain.substring(0, index);
                    }
                    ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);

                    // Report when a domain has no isMRI pages at all (only containsMRI) - a useful thing to know
                    if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
                        System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
                    }

                    //urlsList.addAll(moreURLs);
                    for(int i = 0; i < moreURLs.size(); i++) {
                        urlsList.add(new Tuple(moreURLs.get(i), countryCode));
                    }

                }
            }
            System.err.println("");
        } catch(Exception e) {
            logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
            logger.error(e.getMessage(), e);
        }

        return urlsList;
    }

    /** Given a hand-curated list of NZ sites with positive numPagesContainingMRI,
     * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).
     * Total all these pages in MRI (N), then work out the correct sample size (n)
     * at 90% confidence with a 5% margin of error. Then generate a random listing
     * of n of these MRI pages from these trusted sites and output it to a file
     * for manual inspection. */
    /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
     * of all their web pages IN_MRI (or CONTAINS_MRI).
     * Plus a listing of all the NZ pages IN_MRI. */
    //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
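    // As per the numbered steps below, this writes two files into outFolder: the full listing in
    // "isMRI_<domainsFileName>" and a shuffled sample of n URLs in "random<n>_<domainsFileName>".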
    public void mriWebPageListingForDomainListing(File domainsFile) {

        int filterType = MongoDBAccess.IS_MRI;

        // for overseas websites,
        //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);

        // 0. get a list of all the web pages in the given domain listing where isMRI = true
        ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
        // produceURLsForPagesInMRI(domainsFile);

        // 1. calculate the population size, N, the number of all webpages in the given domain
        // site listing where isMRI = true.
        int N_totalNumPages = urlsList.size();

        // 2. write all the URLs in urlsList to a file
        //File outFolder = domainsFile.getParentFile();
        String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
        File outFile = new File(outFolder, fileName+domainsFile.getName());

        writeURLsToFile(urlsList, outFile, N_totalNumPages);
        System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
            + "\ninto file: " + getFilePath(outFile));

        // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
        int n_numSampleURLs = calcSampleSize(N_totalNumPages);

        System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
        System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);

        // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
        // Using a constant seed for reproducibility
        // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
        Collections.shuffle(urlsList, new Random(FIXED_SEED));

        outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
        writeURLsToFile(urlsList, outFile, n_numSampleURLs);
        System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
            + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile));
    }

    /**
     * Calculates the sample size n for binary outcomes at 90% confidence and 5% margin of error
     * for the given population size N.
     * @return n, the sample size.
     */
    public int calcSampleSize(int N) {

        // calculate sample size n for population size N if using 90% confidence and 5% margin of error
        // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
        // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
        // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/

        double m = 0.05; // margin of error = 5%
        // For 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
        // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
        // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
        double z_alpha_over_2 = 1.6449;

        // Formula: n = (z_alpha_over_2^2 * N) / (z_alpha_over_2^2 + 4*(N-1)*m^2)
        // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
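        // Worked example (an arithmetic check, not part of the original code): for N = 1000,
        // n = (2.7057 * 1000) / (2.7057 + 4*999*0.0025) = 2705.7 / 12.696 ~= 213.1, rounded up to 214.
        // As N grows, n approaches z_alpha_over_2^2 / (4*m^2) = 2.7057 / 0.01 ~= 270.6, so at these
        // settings the sample size never needs to exceed 271 pages.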
        double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));

        // Round up to get a whole number:
        return (int)Math.ceil(n);
    }

    /**
     * Writes out the first n URLs in urlsList into outFile.
     */
    private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
        try (
            Writer writer = new BufferedWriter(new FileWriter(outFile));
        ) {

            for (int i=0; i < n; i++) {
                Tuple urlInfo = urlsList.get(i);

                //System.out.println(list.get(i));
                writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
            }
        } catch(Exception e) {
            logger.error("Unable to write to file " + getFilePath(outFile));
            logger.error(e.getMessage(), e);
        }
    }

    /* ---------------------------------------- */

    /**
     * Creates the file 5a_counts_tentativeNonAutotranslatedSites.json
     * that contains the count and domains for NZ sites (NZ origin or .nz TLD) that CONTAIN_MRI,
     * followed by counts and a domain listing for overseas sites that are either from Australia
     * or don't contain mi in their URL path.
     * @return full path of the file generated
     */
    public String writeTentativeNonAutotranslatedSites() {

        File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");

        String filename = getFilePath(outFile);

        try (
            Writer writer = new BufferedWriter(new FileWriter(outFile));
        ) {
            // first write out NZ sites and .nz TLD count and domains
            mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
            // next write out all overseas sites (not NZ origin or .nz TLD)
            // that have no "mi" in the URL path as mi.* or */mi
            boolean isMiInURLPath = false;
            mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);

        } catch(Exception e) {
            logger.error("Unable to write to file " + filename);
            logger.error(e.getMessage(), e);
        }

        System.err.println("*** Wrote file: " + filename);

        return filename;
    }

    /**
     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
     * This listing is separate to allow easier weeding out of product sites/autotranslated
     * sites when eyeballing the listing output.
     */
    public String writeOverseasSitesWithMiInURLPath() {
        File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");

        String filename = getFilePath(outFile);
        try (
            Writer writer = new BufferedWriter(new FileWriter(outFile));
        ) {
            boolean isMiInURLPath = true;
            mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);

        } catch(Exception e) {
            logger.error("Unable to write to file " + filename);
            logger.error(e.getMessage(), e);
        }

        System.err.println("*** Wrote file: " + filename);
        return filename;
    }

    public static void printUsage() {
        System.err.println("Usage: WebPageURLsListing [domains.txt]");
    }

    /**
     * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
     * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
     * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
     * translated and really contain at least one webpage containing at least one sentence in MRI.
     * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
     * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
     * isMRI for the pooled sites in the domain list, where the sample size of URLs produced is sufficient to give
     * 90% confidence with a 5% margin of error for testing binary outcomes; see
     * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
     */
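    // Example invocations (mirroring the commands in the class comment above, run from maori-lang-detection/src):
    //   java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing
    //     -> no args: writes the 5a/5b containsMRI site listings into ../mongodb-data-auto/
    //   java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
    //     -> one arg: writes the full isMRI web page listing plus a random sample for the listed domains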
    public static void main(String args[]) {
        if(args.length >= 2) {
            printUsage();
            System.exit(-1);
        }

        try (
            MongoDBAccess mongodb = new MongoDBAccess();
        ) {

            mongodb.connectToDB();

            // output files will be stored in mongodb-data-auto
            File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
            WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);

            System.out.println("*************************************");


            if(args.length >= 1) {
                File domainsFile = new File(args[0]);
                if(!domainsFile.exists()) {
                    System.err.println("File " + domainsFile + " does not exist");
                    System.exit(-1);
                }

                //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
                //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);


                // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
                // then also do the shuffle to gen X num of random web page URLs.
                //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
                listing.mriWebPageListingForDomainListing(domainsFile);

                // TODO: generate the special table (6)

            } else {

                // calculating sample size works:
                //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
                //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));

                String filename = listing.writeTentativeNonAutotranslatedSites();
                filename = listing.writeOverseasSitesWithMiInURLPath();

                // TODO: generate the tables
            }

            System.out.println("*************************************");
        } catch(Exception e) {
            logger.error(e.getMessage(), e);
        }
    }
}