Changeset 33883

Show
Ignore:
Timestamp:
31.01.2020 21:50:34 (3 weeks ago)
Author:
ak19
Message:

Clarifications

Location:
other-projects/maori-lang-detection
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/mongodb-data/5table_tentativeNonProductSites1.csv

    r33848 r33883  
    1 "_id","siteCount","numPagesInMRICount","numPagesContainingMRICount" 
     1"_id","siteCount (numPagesContainingMRICount > 0)","numPagesInMRICount","numPagesContainingMRICount" 
    22"nz","176.0","4360","9641" 
    33"us","117.0","757","2655" 
  • other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json

    r33872 r33883  
    180180 
    181181 
     182First column: n pages that are in MRI / n sampled isMRI pages 
     183Second column: n pages that do contain MRI / n sampled pages that are not isMRI yet contain MRI 
    182184 
    183185/* 1 */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33882 r33883  
    537537    // newlines after every array element in the json: 
    538538    String jsonStr = prettyPrintJson(doc.toJson()); 
    539     System.err.println(jsonStr); 
     539    //System.err.println(jsonStr); 
    540540    try { 
    541541        writer.write(jsonStr + NEWLINE); 
     
    553553    } 
    554554 
    555  
    556     public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) { 
    557      
    558     // should only have one doc 
    559     for (Document doc : output) { 
    560         //System.out.println(doc); 
    561         System.out.println(doc.toJson()); 
    562     } 
    563     } 
    564      
    565555     
    566556    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33871 r33883  
    11package org.greenstone.atea; 
    22 
    3 import java.util.*; 
     3import java.util.*; /* includes Random */ 
    44import java.io.*; 
    55 
     
    7171         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
    7272         ) { 
    73         Collections.shuffle(urlsList); 
     73        Collections.shuffle(urlsList, new Random(1000)); 
    7474        for (int i=0; i<numURLs; i++) { 
    7575        String url = urlsList.get(i); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33882 r33883  
    77 
    88/** 
    9 * TO COMPILE OR RUN, FIRST DO: 
     9 * Runs some of the important mongoDB queries I ran. 
     10 * 
     11 * TO COMPILE OR RUN, FIRST DO: 
    1012 *    cd maori-lang-detection/apache-opennlp-1.9.1 
    1113 *    export OPENNLP_HOME=`pwd` 
     
    1820 * TO RUN: 
    1921 *    maori-lang-detection/src$ 
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255 
     22 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 
    2123 * 
    2224*/ 
    2325public class WebPageURLsListing { 
    2426    static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName()); 
     27    static private final long FIXED_SEED = 1000; 
    2528     
    2629    private final MongoDBAccess mongodbAccess; 
    2730    private File outFolder; 
    2831 
     32     
    2933    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 
    3034    { 
     
    3236    this.outFolder = outFolder; 
    3337    } 
    34      
    35     public String produceURLsForPagesInMRI(File domainsFile) { 
    36     return writeFile(MongoDBAccess.IS_MRI, domainsFile); 
    37     } 
    38      
    39     public String produceURLsForPagesContainingMRI(File domainsFile) { 
    40     return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile); 
    41     } 
    42  
    43  
    44     public String writeFile(int filterType, File domainsFile/*, int numURLs*/) { 
    45  
     38 
     39    private String getFilePath(File file) { 
     40    try { 
     41        return file.getCanonicalPath(); 
     42    } catch(IOException e) { 
     43        return file.getAbsolutePath(); 
     44    } 
     45    } 
     46     
     47    public void produceURLsForPagesInMRI(File domainsFile) { 
     48    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     49    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 
     50    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     51     
     52    System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: " 
     53               + getFilePath(outFile)); 
     54    } 
     55     
     56    public void produceURLsForPagesContainingMRI(File domainsFile) { 
     57    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);    
     58    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 
     59    writeURLsToFile(urlsList, outFile, urlsList.size()); 
     60 
     61    System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: " 
     62               + getFilePath(outFile)); 
     63    } 
     64     
     65    private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) { 
    4666    ArrayList<String> urlsList = new ArrayList<String>(); 
    4767     
     
    5272    try ( 
    5373         BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); 
    54          ) { 
    55          
     74         ) {         
    5675         
    5776        String domain; 
     
    6281            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 
    6382            urlsList.addAll(moreURLs); 
     83            if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) { 
     84            System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI)."); 
     85            } 
    6486        } 
    6587        } 
    6688    } catch(Exception e) { 
    67         logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath()); 
    68         logger.error(e.getMessage(), e); 
    69     } 
    70  
    71     // Shuffle the urlsList, then write out the first numURLs into a file.   
    72     // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 
    73     File parentFolder = domainsFile.getParentFile(); 
    74     //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); 
     89        logger.error("Unable to read URLs from file " + getFilePath(domainsFile)); 
     90        logger.error(e.getMessage(), e); 
     91    } 
     92 
     93    return urlsList; 
     94    } 
     95     
     96    /** Given a hand curated list of NZ sites with positive numPagesContainingMRI, 
     97     * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).  
     98     * Total all these pages in MRI (N), then work out the correct sample size (n) 
     99     * at 90% confidence with 5% margin of error. Then generate a random listing 
     100     * of n of these pages in MRI of these trusted sites and output to a file 
     101     * for manual inspection. */ 
     102    /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing 
     103     * of all their web pages IN_MRI (or CONTAINS_MRI). 
     104     * Plus a listing of all the NZ pages IN_MRI. */ 
     105    //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) { 
     106    public void mriWebPageListingForDomainListing(File domainsFile) { 
     107 
     108    int filterType = MongoDBAccess.IS_MRI; 
     109     
     110    // for overseas websites,  
     111    //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile); 
     112 
     113    // 0. get a list of all the web pages in the given domain listing where isMRI = true 
     114    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 
     115        // produceURLsForPagesInMRI(domainsFile); 
     116     
     117    // 1. calculate the population size, N, the number of all webpages in the given domain 
     118    // site listing where isMRI = true. 
     119    int N_totalNumPages = urlsList.size(); 
     120 
     121    // 2. write all the URLs in urlsList to a file 
     122    //File outFolder = domainsFile.getParentFile(); 
    75123    String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 
    76     File outFile = new File(parentFolder, fileName+domainsFile.getName()); 
    77  
    78     // write out ALL the URLs 
     124    File outFile = new File(outFolder, fileName+domainsFile.getName()); 
     125 
     126    writeURLsToFile(urlsList, outFile, N_totalNumPages); 
     127    System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile" 
     128               + "\ninto file: " + getFilePath(outFile)); 
     129         
     130    // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error 
     131    int n_numSampleURLs = calcSampleSize(N_totalNumPages);  
     132 
     133    System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages); 
     134    System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs); 
     135     
     136    // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file 
     137    // Using a constant seed for reproducibility 
     138    // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically 
     139    Collections.shuffle(urlsList, new Random(FIXED_SEED));  
     140     
     141    outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName()); 
     142    writeURLsToFile(urlsList, outFile, n_numSampleURLs); 
     143    System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs " 
     144               + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile)); 
     145    } 
     146 
     147    /** 
     148     * Calculates sample size n for binary outcomes at 90% confidence and 5% margin of error 
     149     * for given population size N. 
     150     * @return n, the sample size. 
     151     */ 
     152    public int calcSampleSize(int N) { 
     153     
     154    // calculate sample size n for population size N if using 90% confidence and 5% margin of error 
     155    // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 
     156    // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1 
     157    // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/ 
     158     
     159    double m = 0.05; // margin of error = 5% 
     160    // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%. 
     161    // For 90% confidence, use the table of known z_alpha/2 values from step 1 of 
     162    // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/ 
     163    double z_alpha_over_2 = 1.6449; 
     164 
     165    // Formula: n = (z_alpha/2 ^ 2 * N) / ((z_alpha/2 ^ 2) + 4*(N-1)*m^2) 
     166    // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 
     167    double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0))); 
     168 
     169    // Round up to get a whole number: 
     170    return (int)Math.ceil(n); 
     171    } 
     172     
     173    /** 
     174     * Writes out the first n URLs in urlsList into outFile. 
     175     */ 
     176    private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) { 
    79177    try ( 
    80178         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
    81179         ) { 
    82180 
    83         for (int i=0; i < urlsList.size(); i++) { 
     181        for (int i=0; i < n; i++) { 
    84182        String url = urlsList.get(i); 
    85183        //System.out.println(list.get(i)); 
     
    87185        } 
    88186    } catch(Exception e) { 
    89         logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
    90         logger.error(e.getMessage(), e); 
    91     } 
    92  
    93     /* 
    94     // shuffle list and take the first n - write to file 
    95     try ( 
    96          Writer writer = new BufferedWriter(new FileWriter(outFile)); 
    97          ) { 
    98         Collections.shuffle(urlsList); 
    99         for (int i=0; i<numURLs; i++) { 
    100         String url = urlsList.get(i); 
    101         //System.out.println(list.get(i)); 
    102         writer.write(url + "\n"); 
    103         } 
    104     } catch(Exception e) { 
    105         logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
    106         logger.error(e.getMessage(), e); 
    107     } 
    108     */ 
    109  
    110     return outFile.getAbsolutePath(); 
     187        logger.error("Unable to write to file " + getFilePath(outFile)); 
     188        logger.error(e.getMessage(), e); 
     189    } 
    111190    } 
    112191 
     
    124203    File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json"); 
    125204 
    126     String filename = outFile.getAbsolutePath(); 
     205    String filename = getFilePath(outFile); 
    127206     
    128207    try ( 
     
    135214        boolean isMiInURLPath = false; 
    136215        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
    137  
    138         filename = outFile.getCanonicalPath(); 
    139     } catch(Exception e) { 
    140         logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
     216         
     217    } catch(Exception e) { 
     218        logger.error("Unable to write to file " + filename); 
    141219        logger.error(e.getMessage(), e); 
    142220    } 
     
    149227    /** 
    150228     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by 
    151      * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path 
     229     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path. 
     230     * This listing is separate to allow easier weeding out of product sites/autotranslated 
     231     * sites when eyeballing the listing output. 
    152232     */ 
    153233    public String writeOverseasSitesWithMiInURLPath() { 
    154234    File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json"); 
    155235 
    156     String filename = outFile.getAbsolutePath(); 
     236    String filename = getFilePath(outFile); 
    157237    try ( 
    158238         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
     
    160240        boolean isMiInURLPath = true; 
    161241        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 
    162         filename = outFile.getCanonicalPath(); 
    163     } catch(Exception e) { 
    164         logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
     242 
     243    } catch(Exception e) { 
     244        logger.error("Unable to write to file " + filename); 
    165245        logger.error(e.getMessage(), e); 
    166246    } 
     
    171251     
    172252    public static void printUsage() { 
    173     System.err.println("WebPageURLsListing [domains.txt]"); 
    174     } 
    175  
    176     // Depending on args, generates isMRI and containsMRI file listings for: 
    177     // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ 
    178  
    179      
     253    System.err.println("Usage: WebPageURLsListing [domains.txt]"); 
     254    } 
     255 
     256    /** 
     257     * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains), 
     258     * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately. 
     259     * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically 
     260     * translated and really contain at least one webpage containing at least one sentence in MRI. 
     261     * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages  
     262     * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching 
     263     * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving 
     264     * 90% confidence with 5% margin of error for testing binary outcomes, see 
     265     * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome 
     266     */ 
    180267    public static void main(String args[]) { 
    181268    if(args.length >= 2) { 
     
    202289        } 
    203290 
    204          
    205         //int genNumURLs = Integer.parseInt(args[1]); 
    206291        //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile); 
    207292        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile); 
    208293 
    209         //listing.writeWebPagesOfAllNZSitesAndDomainListing(); 
     294 
     295        // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI 
     296        // then also do the shuffle to gen X num of random web page URLs. 
     297        //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile); 
     298        listing.mriWebPageListingForDomainListing(domainsFile); 
     299 
     300        // TODO: generate the special table (6) 
    210301         
    211302        } else {         
    212303 
     304        // calculating sample size works: 
     305        //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360)); 
     306        //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681)); 
     307         
    213308        String filename = listing.writeTentativeNonAutotranslatedSites(); 
    214309        filename = listing.writeOverseasSitesWithMiInURLPath(); 
     310 
     311        // TODO: generate the tables 
    215312        } 
    216  
    217313         
    218314    } catch(Exception e) {