source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33885

Last change on this file since 33885 was 33885, checked in by ak19, 4 years ago

Attempting to write the tables. csv not yet supported. Table 1 done.

File size: 13.1 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9 * Runs some of the important mongoDB queries I ran.
10 *
11 * TO COMPILE OR RUN, FIRST DO:
12 * cd maori-lang-detection/apache-opennlp-1.9.1
13 * export OPENNLP_HOME=`pwd`
14 * cd maori-lang-detection/src
15 *
16 * TO COMPILE:
17 * maori-lang-detection/src$
18 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
19 *
20 * TO RUN:
21 * maori-lang-detection/src$
22 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing
23 * OR:
24 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
25 *
26*/
27public class WebPageURLsListing {
28 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
29 static private final long FIXED_SEED = 1000;
30
31 private final MongoDBAccess mongodbAccess;
32 private File outFolder;
33
34
35
36 public static class Tuple {
37 public final String url;
38 public final String countryCode;
39
40 public Tuple(String url, String countryCode) {
41 this.url = url;
42 this.countryCode = countryCode;
43 }
44
45 public String toString() {
46 return this.url + "," + countryCode;
47 }
48 }
49
50
/**
 * Creates a listing generator.
 * @param mongodbAccess database access object used for every query made by this instance
 * @param outFolder folder in which the generated files are created
 */
public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) {
    this.outFolder = outFolder;
    this.mongodbAccess = mongodbAccess;
}
56
57 private String getFilePath(File file) {
58 try {
59 return file.getCanonicalPath();
60 } catch(IOException e) {
61 return file.getAbsolutePath();
62 }
63 }
64
65 public void produceURLsForPagesInMRI(File domainsFile) {
66 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
67 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
68 writeURLsToFile(urlsList, outFile, urlsList.size());
69
70 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
71 + getFilePath(outFile));
72 }
73
74 public void produceURLsForPagesContainingMRI(File domainsFile) {
75 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);
76 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
77 writeURLsToFile(urlsList, outFile, urlsList.size());
78
79 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
80 + getFilePath(outFile));
81 }
82
83 private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) {
84 ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
85
86 // 1. read each url from the domainsFile
87 // 1a. do the query
88 // 1b. add the arraylist result to urls
89
90 try (
91 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
92 ) {
93
94 String domain;
95
96 while((domain = reader.readLine()) != null) {
97 domain = domain.trim();
98 if(!domain.equals("")) {
99
100 String countryCode = "";
101 int index = domain.lastIndexOf(",");
102 if(index != -1) {
103 countryCode = domain.substring(index+1).trim();
104 domain = domain.substring(0, index);
105 }
106 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
107
108 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
109 if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
110 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
111 }
112
113 //urlsList.addAll(moreURLs);
114 for(int i = 0; i < moreURLs.size(); i++) {
115 urlsList.add(new Tuple(moreURLs.get(i), countryCode));
116 }
117
118 }
119 }
120 System.err.println("");
121 } catch(Exception e) {
122 logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
123 logger.error(e.getMessage(), e);
124 }
125
126 return urlsList;
127 }
128
129 /** Given a hand curated list of NZ sites with positive numPagesContainingMRI,
130 * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).
131 * Total all these pages in MRI (N), then work out the correct sample size (n)
132 * at 90% confidence with 5% margin of error. Then generate a random listing
133 * of n of these pages in MRI of these trusted sites and output to a file
134 * for manual inspection. */
135 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
136 * of all their web pages IN_MRI (or CONTAINS_MRI).
137 * Plus a listing of all the NZ pages IN_MRI. */
138 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
139 public void mriWebPageListingForDomainListing(File domainsFile) {
140
141 int filterType = MongoDBAccess.IS_MRI;
142
143 // for overseas websites,
144 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
145
146 // 0. get a list of all the web pages in the given domain listing where isMRI = true
147 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
148 // produceURLsForPagesInMRI(domainsFile);
149
150 // 1. calculate the population size, N, the number of all webpages in the given domain
151 // site listing where isMRI = true.
152 int N_totalNumPages = urlsList.size();
153
154 // 2. write all the URLs in urlsList to a file
155 //File outFolder = domainsFile.getParentFile();
156 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
157 File outFile = new File(outFolder, fileName+domainsFile.getName());
158
159 writeURLsToFile(urlsList, outFile, N_totalNumPages);
160 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
161 + "\ninto file: " + getFilePath(outFile));
162
163 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
164 int n_numSampleURLs = calcSampleSize(N_totalNumPages);
165
166 System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
167 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
168
169 // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
170 // Using a constant seed for reproducibility
171 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
172 Collections.shuffle(urlsList, new Random(FIXED_SEED));
173
174 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
175 writeURLsToFile(urlsList, outFile, n_numSampleURLs);
176 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
177 + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile));
178 }
179
180 /**
181 * Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
182 * for given population size N.
183 * @return n, the sample size.
184 */
185 public int calcSampleSize(int N) {
186
187 // calculate sample size n for population size N if using 90% confidence and 5% margin of error
188 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
189 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
190 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
191
192 double m = 0.05; // margin of error = 5%
193 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
194 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
195 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
196 double z_alpha_over_2 = 1.6449;
197
198 // Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
199 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
200 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
201
202 // Round up to get a whole number:
203 return (int)Math.ceil(n);
204 }
205
206 /**
207 * Writes out the first n URLs in urlsList into outFile.
208 */
209 private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
210 try (
211 Writer writer = new BufferedWriter(new FileWriter(outFile));
212 ) {
213
214 for (int i=0; i < n; i++) {
215 Tuple urlInfo = urlsList.get(i);
216
217 //System.out.println(list.get(i));
218 writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
219 }
220 } catch(Exception e) {
221 logger.error("Unable to write to file " + getFilePath(outFile));
222 logger.error(e.getMessage(), e);
223 }
224 }
225
226 /* ---------------------------------------- */
227
228 /**
229 * Create the file 5counts_tentativeNonAutotranslatedSites.json
230 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
231 * followed by counts and domain listing for overseas sites that are either from Australia
232 * or don't contain mi in their URL path.
233 * @return full path of file generated
234 */
235 public String writeTentativeNonAutotranslatedSites() {
236
237 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
238
239 String filename = getFilePath(outFile);
240
241 try (
242 Writer writer = new BufferedWriter(new FileWriter(outFile));
243 ) {
244 // first write out NZ sites and .nz TLD count and domains
245 mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
246 // next write out all overseas sites (not NZ origin or .nz TLD)
247 // that have no "mi" in the URL path as mi.* or */mi
248 boolean isMiInURLPath = false;
249 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
250
251 } catch(Exception e) {
252 logger.error("Unable to write to file " + filename);
253 logger.error(e.getMessage(), e);
254 }
255
256 System.err.println("*** Wrote file: " + filename);
257
258 return filename;
259 }
260
261 /**
262 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
263 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
264 * This listing is separate to allow easier weeding out of product sites/autotranslated
265 * sites when eyeballing the listing output.
266 */
267 public String writeOverseasSitesWithMiInURLPath() {
268 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
269
270 String filename = getFilePath(outFile);
271 try (
272 Writer writer = new BufferedWriter(new FileWriter(outFile));
273 ) {
274 boolean isMiInURLPath = true;
275 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
276
277 } catch(Exception e) {
278 logger.error("Unable to write to file " + filename);
279 logger.error(e.getMessage(), e);
280 }
281
282 System.err.println("*** Wrote file: " + filename);
283 return filename;
284 }
285
286 public static void printUsage() {
287 System.err.println("Usage: WebPageURLsListing [domains.txt]");
288 }
289
290 /**
291 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
292 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
293 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
294 * translated and really contain at least one webpage containing at least one sentence in MRI.
295 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
296 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
297 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
298 * 90% confidence with 5% margin of error for testing binary outcomes, see
299 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
300 */
301 public static void main(String args[]) {
302 if(args.length >= 2) {
303 printUsage();
304 System.exit(-1);
305 }
306
307 try (
308 MongoDBAccess mongodb = new MongoDBAccess();
309 ) {
310
311 mongodb.connectToDB();
312
313 // output files will be stored in mongodb-data-auto
314 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
315 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
316
317 System.out.println("*************************************");
318
319
320 if(args.length >= 1) {
321 File domainsFile = new File(args[0]);
322 if(!domainsFile.exists()) {
323 System.err.println("File " + domainsFile + " does not exist");
324 System.exit(-1);
325 }
326
327 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
328 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
329
330
331 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
332 // then also do the shuffle to gen X num of random web page URLs.
333 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
334 listing.mriWebPageListingForDomainListing(domainsFile);
335
336 // TODO: generate the special table (6)
337
338 } else {
339
340 // calculating sample size works:
341 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
342 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
343
344 String filename = listing.writeTentativeNonAutotranslatedSites();
345 filename = listing.writeOverseasSitesWithMiInURLPath();
346
347 // TODO: generate the tables
348
349 mongodb.writeTables(outFolder);
350 }
351
352 System.out.println("*************************************");
353 } catch(Exception e) {
354 logger.error(e.getMessage(), e);
355 }
356 }
357}
Note: See TracBrowser for help on using the repository browser.