Changeset 33883 for other-projects
- Timestamp:
- 2020-01-31T21:50:34+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/mongodb-data/5table_tentativeNonProductSites1.csv
r33848 r33883 1 "_id","siteCount ","numPagesInMRICount","numPagesContainingMRICount"1 "_id","siteCount (numPagesContainingMRICount > 0)","numPagesInMRICount","numPagesContainingMRICount" 2 2 "nz","176.0","4360","9641" 3 3 "us","117.0","757","2655" -
other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json
r33872 r33883 180 180 181 181 182 First column: n pages that are in MRI / n sampled isMRI pages 183 Second column: n pages that do contain MRI / n sampled pages that are not isMRI yet contain MRI 182 184 183 185 /* 1 */ -
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33882 r33883 537 537 // newlines after every array element in the json: 538 538 String jsonStr = prettyPrintJson(doc.toJson()); 539 System.err.println(jsonStr);539 //System.err.println(jsonStr); 540 540 try { 541 541 writer.write(jsonStr + NEWLINE); … … 553 553 } 554 554 555 556 public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {557 558 // should only have one doc559 for (Document doc : output) {560 //System.out.println(doc);561 System.out.println(doc.toJson());562 }563 }564 565 555 566 556 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ -
other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java
r33871 r33883 1 1 package org.greenstone.atea; 2 2 3 import java.util.*; 3 import java.util.*; /* includes Random */ 4 4 import java.io.*; 5 5 … … 71 71 Writer writer = new BufferedWriter(new FileWriter(outFile)); 72 72 ) { 73 Collections.shuffle(urlsList );73 Collections.shuffle(urlsList, new Random(1000)); 74 74 for (int i=0; i<numURLs; i++) { 75 75 String url = urlsList.get(i); -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33882 r33883 7 7 8 8 /** 9 * TO COMPILE OR RUN, FIRST DO: 9 * Runs some of the important mongoDB queries I ran. 10 * 11 * TO COMPILE OR RUN, FIRST DO: 10 12 * cd maori-lang-detection/apache-opennlp-1.9.1 11 13 * export OPENNLP_HOME=`pwd` … … 18 20 * TO RUN: 19 21 * maori-lang-detection/src$ 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 25522 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 21 23 * 22 24 */ 23 25 public class WebPageURLsListing { 24 26 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName()); 27 static private final long FIXED_SEED = 1000; 25 28 26 29 private final MongoDBAccess mongodbAccess; 27 30 private File outFolder; 28 31 32 29 33 public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 30 34 { … … 32 36 this.outFolder = outFolder; 33 37 } 34 35 public String produceURLsForPagesInMRI(File domainsFile) { 36 return writeFile(MongoDBAccess.IS_MRI, domainsFile); 37 } 38 39 public String produceURLsForPagesContainingMRI(File domainsFile) { 40 return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile); 41 } 42 43 44 public String writeFile(int filterType, File domainsFile/*, int numURLs*/) { 45 38 39 private String getFilePath(File file) { 40 try { 41 return file.getCanonicalPath(); 42 } catch(IOException e) { 43 return file.getAbsolutePath(); 44 } 45 } 46 47 public void produceURLsForPagesInMRI(File domainsFile) { 48 ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 49 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 50 writeURLsToFile(urlsList, outFile, urlsList.size()); 51 52 System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: " 53 + getFilePath(outFile)); 54 } 55 56 public void produceURLsForPagesContainingMRI(File domainsFile) { 57 ArrayList<String> urlsList = 
getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile); 58 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 59 writeURLsToFile(urlsList, outFile, urlsList.size()); 60 61 System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: " 62 + getFilePath(outFile)); 63 } 64 65 private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) { 46 66 ArrayList<String> urlsList = new ArrayList<String>(); 47 67 … … 52 72 try ( 53 73 BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); 54 ) { 55 74 ) { 56 75 57 76 String domain; … … 62 81 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 63 82 urlsList.addAll(moreURLs); 83 if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) { 84 System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI)."); 85 } 64 86 } 65 87 } 66 88 } catch(Exception e) { 67 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath()); 68 logger.error(e.getMessage(), e); 69 } 70 71 // Shuffle the urlsList, then write out the first numURLs into a file. 72 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 73 File parentFolder = domainsFile.getParentFile(); 74 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); 89 logger.error("Unable to read URLs from file " + getFilePath(domainsFile)); 90 logger.error(e.getMessage(), e); 91 } 92 93 return urlsList; 94 } 95 96 /** Given a hand curated list of NZ sites with positive numPagesContainingMRI, 97 * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?). 98 * Total all these pages in MRI (N), then work out the correct sample size (n) 99 * at 90% confidence with 5% margin of error. Then generate a random listing 100 * of n of these pages in MRI of these trusted sites and output to a file 101 * for manual inspection. 
*/ 102 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing 103 * of all their web pages IN_MRI (or CONTAINS_MRI). 104 * Plus a listing of all the NZ pages IN_MRI. */ 105 //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) { 106 public void mriWebPageListingForDomainListing(File domainsFile) { 107 108 int filterType = MongoDBAccess.IS_MRI; 109 110 // for overseas websites, 111 //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile); 112 113 // 0. get a list of all the web pages in the given domain listing where isMRI = true 114 ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 115 // produceURLsForPagesInMRI(domainsFile); 116 117 // 1. calculate the population size, N, the number of all webpages in the given domain 118 // site listing where isMRI = true. 119 int N_totalNumPages = urlsList.size(); 120 121 // 2. write all the URLs in urlsList to a file 122 //File outFolder = domainsFile.getParentFile(); 75 123 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 76 File outFile = new File(parentFolder, fileName+domainsFile.getName()); 77 78 // write out ALL the URLs 124 File outFile = new File(outFolder, fileName+domainsFile.getName()); 125 126 writeURLsToFile(urlsList, outFile, N_totalNumPages); 127 System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile" 128 + "\ninto file: " + getFilePath(outFile)); 129 130 // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error 131 int n_numSampleURLs = calcSampleSize(N_totalNumPages); 132 133 System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages); 134 System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs); 135 136 // 4. 
Shuffle all the URLs and write the first n (sample size) URLs to a file 137 // Using a constant seed for reproducibility 138 // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically 139 Collections.shuffle(urlsList, new Random(FIXED_SEED)); 140 141 outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName()); 142 writeURLsToFile(urlsList, outFile, n_numSampleURLs); 143 System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs " 144 + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile)); 145 } 146 147 /** 148 * Calculates sample size n for binary outcomes at 90% confidence and 5% margin of error 149 * for given population size N. 150 * @return n, the sample size. 151 */ 152 public int calcSampleSize(int N) { 153 154 // calculate sample size n for population size N if using 90% confidence and 5% margin of error 155 // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 156 // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1 157 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/ 158 159 double m = 0.05; // margin of error = 5% 160 // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%. 161 // For 90% confidence, use the table of known z_alpha/2 values from step 1 of 162 // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/ 163 double z_alpha_over_2 = 1.6449; 164 165 // Formula: n = (z_alpha/2 ^ 2 * N) / ((z_alpha/2 ^ 2) + 4(N-1)*m^2) 166 // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 167 double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0))); 168 169 // Round up to get a whole number: 170 return (int)Math.ceil(n); 171 } 172 173 /** 174 * Writes out the first n URLs in urlsList into outFile. 
175 */ 176 private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) { 79 177 try ( 80 178 Writer writer = new BufferedWriter(new FileWriter(outFile)); 81 179 ) { 82 180 83 for (int i=0; i < urlsList.size(); i++) {181 for (int i=0; i < n; i++) { 84 182 String url = urlsList.get(i); 85 183 //System.out.println(list.get(i)); … … 87 185 } 88 186 } catch(Exception e) { 89 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 90 logger.error(e.getMessage(), e); 91 } 92 93 /* 94 // shuffle list and take the first n - write to file 95 try ( 96 Writer writer = new BufferedWriter(new FileWriter(outFile)); 97 ) { 98 Collections.shuffle(urlsList); 99 for (int i=0; i<numURLs; i++) { 100 String url = urlsList.get(i); 101 //System.out.println(list.get(i)); 102 writer.write(url + "\n"); 103 } 104 } catch(Exception e) { 105 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 106 logger.error(e.getMessage(), e); 107 } 108 */ 109 110 return outFile.getAbsolutePath(); 187 logger.error("Unable to write to file " + getFilePath(outFile)); 188 logger.error(e.getMessage(), e); 189 } 111 190 } 112 191 … … 124 203 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json"); 125 204 126 String filename = outFile.getAbsolutePath();205 String filename = getFilePath(outFile); 127 206 128 207 try ( … … 135 214 boolean isMiInURLPath = false; 136 215 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 137 138 filename = outFile.getCanonicalPath(); 139 } catch(Exception e) { 140 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 216 217 } catch(Exception e) { 218 logger.error("Unable to write to file " + filename); 141 219 logger.error(e.getMessage(), e); 142 220 } … … 149 227 /** 150 228 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by 151 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path 229 
* writeTentativeNonAutotranslatedSites(): those that have mi in their URL path. 230 * This listing is separate to allow easier weeding out of product sites/autotranslated 231 * sites when eyeballing the listing output. 152 232 */ 153 233 public String writeOverseasSitesWithMiInURLPath() { 154 234 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json"); 155 235 156 String filename = outFile.getAbsolutePath();236 String filename = getFilePath(outFile); 157 237 try ( 158 238 Writer writer = new BufferedWriter(new FileWriter(outFile)); … … 160 240 boolean isMiInURLPath = true; 161 241 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 162 filename = outFile.getCanonicalPath(); 163 } catch(Exception e) { 164 logger.error("Unable to write to file " + outFile.getAbsolutePath());242 243 } catch(Exception e) { 244 logger.error("Unable to write to file " + filename); 165 245 logger.error(e.getMessage(), e); 166 246 } … … 171 251 172 252 public static void printUsage() { 173 System.err.println("WebPageURLsListing [domains.txt]"); 174 } 175 176 // Depending on args, generates isMRI and containsMRI file listings for: 177 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ 178 179 253 System.err.println("Usage: WebPageURLsListing [domains.txt]"); 254 } 255 256 /** 257 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains), 258 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately. 259 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically 260 * translated and really contain at least one webpage containing at least one sentence in MRI. 
261 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages 262 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching 263 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving 264 * 90% confidence with 5% margin of error for testing binary outcomes, see 265 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 266 */ 180 267 public static void main(String args[]) { 181 268 if(args.length >= 2) { … … 202 289 } 203 290 204 205 //int genNumURLs = Integer.parseInt(args[1]);206 291 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile); 207 292 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile); 208 293 209 //listing.writeWebPagesOfAllNZSitesAndDomainListing(); 294 295 // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI 296 // then also do the shuffle to gen X num of random web page URLs. 297 //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile); 298 listing.mriWebPageListingForDomainListing(domainsFile); 299 300 // TODO: generate the special table (6) 210 301 211 302 } else { 212 303 304 // calculating sample size works: 305 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360)); 306 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681)); 307 213 308 String filename = listing.writeTentativeNonAutotranslatedSites(); 214 309 filename = listing.writeOverseasSitesWithMiInURLPath(); 310 311 // TODO: generate the tables 215 312 } 216 217 313 218 314 } catch(Exception e) {
Note:
See TracChangeset
for help on using the changeset viewer.