source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33906

Last change on this file since 33906 was 33906, checked in by ak19, 4 years ago

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections; this will help discount duplicated domains under http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on servers in multiple countries, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping the protocol and www, to allow comparing the old domainListing results of MongoDB's now renamed oldWebsites and oldWebpages collections against the new versions of these collections, and to then update the differences in the manual counts.

File size: 13.2 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9 * Runs some of the important mongoDB queries I ran.
10 *
11 * TO COMPILE OR RUN, FIRST DO:
12 * cd maori-lang-detection/apache-opennlp-1.9.1
13 * export OPENNLP_HOME=`pwd`
14 * cd maori-lang-detection/src
15 *
16 * TO COMPILE:
17 * maori-lang-detection/src$
18 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
19 *
20 * TO RUN:
21 * maori-lang-detection/src$
22 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing
23 * OR:
24 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
25 *
26*/
27public class WebPageURLsListing {
28 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
29 static private final long FIXED_SEED = 1000;
30
31 private final MongoDBAccess mongodbAccess;
32 private File outFolder;
33
34
35
36 public static class Tuple {
37 public final String url;
38 public final String countryCode;
39
40 public Tuple(String url, String countryCode) {
41 this.url = url;
42 this.countryCode = countryCode;
43 }
44
45 public String toString() {
46 return this.url + "," + countryCode;
47 }
48 }
49
50
51 public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
52 {
53 this.mongodbAccess = mongodbAccess;
54 this.outFolder = outFolder;
55 }
56
57
58 public void produceURLsForPagesInMRI(File domainsFile) {
59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
60 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
61 writeURLsToFile(urlsList, outFile, urlsList.size());
62
63 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
64 + Utility.getFilePath(outFile));
65 }
66
67 public void produceURLsForPagesContainingMRI(File domainsFile) {
68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile);
69 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
70 writeURLsToFile(urlsList, outFile, urlsList.size());
71
72 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
73 + Utility.getFilePath(outFile));
74 }
75
76 private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) {
77 ArrayList<Tuple> urlsList = new ArrayList<Tuple>();
78
79 // 1. read each url from the domainsFile
80 // 1a. do the query
81 // 1b. add the arraylist result to urls
82
83 try (
84 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
85 ) {
86
87 String domain;
88
89 while((domain = reader.readLine()) != null) {
90 domain = domain.trim();
91 if(!domain.equals("")) {
92
93 String countryCode = "";
94 int index = domain.lastIndexOf(",");
95 if(index != -1) {
96 countryCode = domain.substring(index+1).trim();
97 domain = domain.substring(0, index);
98 }
99 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
100
101 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know
102 if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
103 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI.");
104 }
105
106 //urlsList.addAll(moreURLs);
107 for(int i = 0; i < moreURLs.size(); i++) {
108 urlsList.add(new Tuple(moreURLs.get(i), countryCode));
109 }
110
111 }
112 }
113 System.err.println("");
114 } catch(Exception e) {
115 logger.error("Unable to read URLs from file " + Utility.getFilePath(domainsFile));
116 logger.error(e.getMessage(), e);
117 }
118
119 return urlsList;
120 }
121
122 /** Given a hand curated list of all sites with positive numPagesContainingMRI
123 * determined by manual inspection, get a listing of all their web pages that
124 * are IN_MRI (or CONTAINS_MRI?).
125 * Total all these pages that are inMRI (N), then work out the correct sample size (n)
126 * at 90% confidence with 5% margin of error. Then generate a random listing
127 * of n of these pages in MRI of these trusted sites and output to a file
128 * for manual inspection of the sample webpage URLs at page-level. */
129 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
130 * of all their web pages IN_MRI (or CONTAINS_MRI).
131 * Plus a listing of all the NZ pages IN_MRI. */
132 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
133 public void mriWebPageListingForDomainListing(File domainsFile) {
134
135 int filterType = MongoDBAccess.IS_MRI;
136
137 // for overseas websites,
138 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
139
140 // 0. get a list of all the web pages in the given domain listing where isMRI = true
141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile);
142 // produceURLsForPagesInMRI(domainsFile);
143
144 // 1. calculate the population size, N, the number of all webpages in the given domain
145 // site listing where isMRI = true.
146 int N_totalNumPages = urlsList.size();
147
148 // 2. write all the URLs in urlsList to a file
149 //File outFolder = domainsFile.getParentFile();
150 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
151 File outFile = new File(outFolder, fileName+domainsFile.getName());
152
153 writeURLsToFile(urlsList, outFile, N_totalNumPages);
154 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
155 + "\ninto file: " + Utility.getFilePath(outFile));
156
157 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
158 int n_numSampleURLs = calcSampleSize(N_totalNumPages);
159
160 System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
161 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
162
163 // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
164 // Using a constant seed for reproducibility
165 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
166 Collections.shuffle(urlsList, new Random(FIXED_SEED));
167
168 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
169 writeURLsToFile(urlsList, outFile, n_numSampleURLs);
170 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
171 + "for the sites in input domainsFile\ninto file: " + Utility.getFilePath(outFile));
172 }
173
174 /**
175 * Calculates sample size n for binary outcomes at 90% confidence and 5% margine of error
176 * for given population size N.
177 * @return n, the sample size.
178 */
179 public int calcSampleSize(int N) {
180
181 // calculate sample size n for population size N if using 90% confidence and 5% margin of error
182 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
183 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
184 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
185
186 double m = 0.05; // margin of error = 5%
187 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
188 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
189 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
190 double z_alpha_over_2 = 1.6449;
191
192 // Formula: n = (zalpha2 ^ 2 * N) / ((z-alpha-2 ^ 2) + 4(N-1)*m^2)
193 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
194 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
195
196 // Round up to get a whole number:
197 return (int)Math.ceil(n);
198 }
199
200 /**
201 * Writes out the first n URLs in urlsList into outFile.
202 */
203 private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) {
204 try (
205 Writer writer = new BufferedWriter(new FileWriter(outFile));
206 ) {
207
208 for (int i=0; i < n; i++) {
209 Tuple urlInfo = urlsList.get(i);
210
211 //System.out.println(list.get(i));
212 writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode
213 }
214 } catch(Exception e) {
215 logger.error("Unable to write to file " + Utility.getFilePath(outFile));
216 logger.error(e.getMessage(), e);
217 }
218 }
219
220 /* ---------------------------------------- */
221
222 /**
223 * Create the file 5counts_tentativeNonAutotranslatedSites.json
224 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
225 * followed by counts and domain listing for overseas sites that are either from Australia
226 * or don't contain mi in their URL path.
227 * @return full path of file generated
228 */
229 public String writeTentativeNonAutotranslatedSites() {
230
231 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
232
233 String filename = Utility.getFilePath(outFile);
234
235 try (
236 Writer writer = new BufferedWriter(new FileWriter(outFile));
237 ) {
238 // first write out NZ sites and .nz TLD count and domains
239 mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
240 // next write out all overseas sites (not NZ origin or .nz TLD)
241 // that have no "mi" in the URL path as mi.* or */mi
242 boolean isMiInURLPath = false;
243 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
244
245 } catch(Exception e) {
246 logger.error("Unable to write to file " + filename);
247 logger.error(e.getMessage(), e);
248 }
249
250 System.err.println("*** Wrote file: " + filename);
251
252 return filename;
253 }
254
255 /**
256 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
257 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
258 * This listing is separate to allow easier weeding out of product sites/autotranslated
259 * sites when eyeballing the listing output.
260 */
261 public String writeOverseasSitesWithMiInURLPath() {
262 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
263
264 String filename = Utility.getFilePath(outFile);
265 try (
266 Writer writer = new BufferedWriter(new FileWriter(outFile));
267 ) {
268 boolean isMiInURLPath = true;
269 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
270
271 } catch(Exception e) {
272 logger.error("Unable to write to file " + filename);
273 logger.error(e.getMessage(), e);
274 }
275
276 System.err.println("*** Wrote file: " + filename);
277 return filename;
278 }
279
280 public static void printUsage() {
281 System.err.println("Usage: WebPageURLsListing [domains.txt]");
282 }
283
284 /**
285 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
286 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
287 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
288 * translated and really contain at least one webpage containing at least one sentence in MRI.
289 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
290 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
291 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
292 * 90% confidence with 5% margin of error for testing binary outcomes, see
293 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
294 */
295 public static void main(String args[]) {
296 if(args.length >= 2) {
297 printUsage();
298 System.exit(-1);
299 }
300
301 try (
302 MongoDBAccess mongodb = new MongoDBAccess();
303 ) {
304
305 mongodb.connectToDB();
306
307 // output files will be stored in mongodb-data-auto
308 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
309 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
310
311 System.out.println("*************************************");
312
313
314 if(args.length >= 1) {
315 File domainsFile = new File(args[0]);
316 if(!domainsFile.exists()) {
317 System.err.println("File " + domainsFile + " does not exist");
318 System.exit(-1);
319 }
320
321 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
322 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
323
324
325 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
326 // then also do the shuffle to gen X num of random web page URLs.
327 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
328 listing.mriWebPageListingForDomainListing(domainsFile);
329
330 // TODO: generate the special table (6)
331
332 } else {
333
334 // calculating sample size works:
335 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
336 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
337
338 String filename = listing.writeTentativeNonAutotranslatedSites();
339 filename = listing.writeOverseasSitesWithMiInURLPath();
340
341 // TODO: generate the tables
342
343 mongodb.writeTables(outFolder);
344 }
345
346 System.out.println("*************************************");
347 } catch(Exception e) {
348 logger.error(e.getMessage(), e);
349 }
350 }
351}
Note: See TracBrowser for help on using the repository browser.