source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33883

Last change on this file since 33883 was 33883, checked in by ak19, 4 years ago

Clarifications

File size: 12.1 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9 * Runs some of the important mongoDB queries I ran.
10 *
11 * TO COMPILE OR RUN, FIRST DO:
12 * cd maori-lang-detection/apache-opennlp-1.9.1
13 * export OPENNLP_HOME=`pwd`
14 * cd maori-lang-detection/src
15 *
16 * TO COMPILE:
17 * maori-lang-detection/src$
18 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
19 *
20 * TO RUN:
21 * maori-lang-detection/src$
22 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
23 *
24*/
25public class WebPageURLsListing {
/** Log4j logger for this class. Assigned once, hence final. */
static final Logger logger = Logger.getLogger(WebPageURLsListing.class.getName());

/** Constant seed used when shuffling URLs, so that the random sample is reproducible. */
static private final long FIXED_SEED = 1000;

/** Database access object that the queries and aggregations of this class are run on. */
private final MongoDBAccess mongodbAccess;

/** Folder in which the output files of this class are created. Never reassigned, hence final. */
private final File outFolder;


/**
 * Creates a listing generator.
 *
 * @param mongodbAccess database access object used to run the queries
 * @param outFolder folder in which the generated output files are created
 */
public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
{
    this.mongodbAccess = mongodbAccess;
    this.outFolder = outFolder;
}
38
39 private String getFilePath(File file) {
40 try {
41 return file.getCanonicalPath();
42 } catch(IOException e) {
43 return file.getAbsolutePath();
44 }
45 }
46
47 public void produceURLsForPagesInMRI(File domainsFile) {
48 ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
49 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
50 writeURLsToFile(urlsList, outFile, urlsList.size());
51
52 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
53 + getFilePath(outFile));
54 }
55
56 public void produceURLsForPagesContainingMRI(File domainsFile) {
57 ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);
58 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
59 writeURLsToFile(urlsList, outFile, urlsList.size());
60
61 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
62 + getFilePath(outFile));
63 }
64
65 private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) {
66 ArrayList<String> urlsList = new ArrayList<String>();
67
68 // 1. read each url from the domainsFile
69 // 1a. do the query
70 // 1b. add the arraylist result to urls
71
72 try (
73 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
74 ) {
75
76 String domain;
77
78 while((domain = reader.readLine()) != null) {
79 domain = domain.trim();
80 if(!domain.equals("")) {
81 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
82 urlsList.addAll(moreURLs);
83 if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
84 System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI).");
85 }
86 }
87 }
88 } catch(Exception e) {
89 logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
90 logger.error(e.getMessage(), e);
91 }
92
93 return urlsList;
94 }
95
96 /** Given a hand curated list of NZ sites with positive numPagesContainingMRI,
97 * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).
98 * Total all these pages in MRI (N), then work out the correct sample size (n)
99 * at 90% confidence with 5% margin of error. Then generate a random listing
100 * of n of these pages in MRI of these trusted sites and output to a file
101 * for manual inspection. */
102 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
103 * of all their web pages IN_MRI (or CONTAINS_MRI).
104 * Plus a listing of all the NZ pages IN_MRI. */
105 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
106 public void mriWebPageListingForDomainListing(File domainsFile) {
107
108 int filterType = MongoDBAccess.IS_MRI;
109
110 // for overseas websites,
111 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
112
113 // 0. get a list of all the web pages in the given domain listing where isMRI = true
114 ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
115 // produceURLsForPagesInMRI(domainsFile);
116
117 // 1. calculate the population size, N, the number of all webpages in the given domain
118 // site listing where isMRI = true.
119 int N_totalNumPages = urlsList.size();
120
121 // 2. write all the URLs in urlsList to a file
122 //File outFolder = domainsFile.getParentFile();
123 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
124 File outFile = new File(outFolder, fileName+domainsFile.getName());
125
126 writeURLsToFile(urlsList, outFile, N_totalNumPages);
127 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
128 + "\ninto file: " + getFilePath(outFile));
129
130 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
131 int n_numSampleURLs = calcSampleSize(N_totalNumPages);
132
133 System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
134 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
135
136 // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
137 // Using a constant seed for reproducibility
138 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
139 Collections.shuffle(urlsList, new Random(FIXED_SEED));
140
141 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
142 writeURLsToFile(urlsList, outFile, n_numSampleURLs);
143 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
144 + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile));
145 }
146
147 /**
148 * Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
149 * for given population size N.
150 * @return n, the sample size.
151 */
152 public int calcSampleSize(int N) {
153
154 // calculate sample size n for population size N if using 90% confidence and 5% margin of error
155 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
156 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
157 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
158
159 double m = 0.05; // margin of error = 5%
160 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
161 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
162 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
163 double z_alpha_over_2 = 1.6449;
164
165 // Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
166 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
167 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
168
169 // Round up to get a whole number:
170 return (int)Math.ceil(n);
171 }
172
173 /**
174 * Writes out the first n URLs in urlsList into outFile.
175 */
176 private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) {
177 try (
178 Writer writer = new BufferedWriter(new FileWriter(outFile));
179 ) {
180
181 for (int i=0; i < n; i++) {
182 String url = urlsList.get(i);
183 //System.out.println(list.get(i));
184 writer.write(url + "\n");
185 }
186 } catch(Exception e) {
187 logger.error("Unable to write to file " + getFilePath(outFile));
188 logger.error(e.getMessage(), e);
189 }
190 }
191
192 /* ---------------------------------------- */
193
194 /**
195 * Create the file 5counts_tentativeNonAutotranslatedSites.json
196 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
197 * followed by counts and domain listing for overseas sites that are either from Australia
198 * or don't contain mi in their URL path.
199 * @return full path of file generated
200 */
201 public String writeTentativeNonAutotranslatedSites() {
202
203 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
204
205 String filename = getFilePath(outFile);
206
207 try (
208 Writer writer = new BufferedWriter(new FileWriter(outFile));
209 ) {
210 // first write out NZ sites and .nz TLD count and domains
211 mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
212 // next write out all overseas sites (not NZ origin or .nz TLD)
213 // that have no "mi" in the URL path as mi.* or */mi
214 boolean isMiInURLPath = false;
215 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
216
217 } catch(Exception e) {
218 logger.error("Unable to write to file " + filename);
219 logger.error(e.getMessage(), e);
220 }
221
222 System.err.println("*** Wrote file: " + filename);
223
224 return filename;
225 }
226
227 /**
228 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
229 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
230 * This listing is separate to allow easier weeding out of product sites/autotranslated
231 * sites when eyeballing the listing output.
232 */
233 public String writeOverseasSitesWithMiInURLPath() {
234 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
235
236 String filename = getFilePath(outFile);
237 try (
238 Writer writer = new BufferedWriter(new FileWriter(outFile));
239 ) {
240 boolean isMiInURLPath = true;
241 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
242
243 } catch(Exception e) {
244 logger.error("Unable to write to file " + filename);
245 logger.error(e.getMessage(), e);
246 }
247
248 System.err.println("*** Wrote file: " + filename);
249 return filename;
250 }
251
252 public static void printUsage() {
253 System.err.println("Usage: WebPageURLsListing [domains.txt]");
254 }
255
256 /**
257 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
258 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
259 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
260 * translated and really contain at least one webpage containing at least one sentence in MRI.
261 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
262 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
263 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
264 * 90% confidence with 5% margin of error for testing binary outcomes, see
265 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
266 */
267 public static void main(String args[]) {
268 if(args.length >= 2) {
269 printUsage();
270 System.exit(-1);
271 }
272
273 try (
274 MongoDBAccess mongodb = new MongoDBAccess();
275 ) {
276
277 mongodb.connectToDB();
278
279 // output files will be stored in mongodb-data-auto
280 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
281 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
282
283
284 if(args.length >= 1) {
285 File domainsFile = new File(args[0]);
286 if(!domainsFile.exists()) {
287 System.err.println("File " + domainsFile + " does not exist");
288 System.exit(-1);
289 }
290
291 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
292 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
293
294
295 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
296 // then also do the shuffle to gen X num of random web page URLs.
297 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
298 listing.mriWebPageListingForDomainListing(domainsFile);
299
300 // TODO: generate the special table (6)
301
302 } else {
303
304 // calculating sample size works:
305 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
306 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
307
308 String filename = listing.writeTentativeNonAutotranslatedSites();
309 filename = listing.writeOverseasSitesWithMiInURLPath();
310
311 // TODO: generate the tables
312 }
313
314 } catch(Exception e) {
315 logger.error(e.getMessage(), e);
316 }
317 }
318}
Note: See TracBrowser for help on using the repository browser.