Changeset 33883


Ignore:
Timestamp:
2020-01-31T21:50:34+13:00 (4 years ago)
Author:
ak19
Message:

Clarifications

Location:
other-projects/maori-lang-detection
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/mongodb-data/5table_tentativeNonProductSites1.csv

    r33848 r33883  
    1 "_id","siteCount","numPagesInMRICount","numPagesContainingMRICount"
     1"_id","siteCount (numPagesContainingMRICount > 0)","numPagesInMRICount","numPagesContainingMRICount"
    22"nz","176.0","4360","9641"
    33"us","117.0","757","2655"
  • other-projects/maori-lang-detection/mongodb-data/6table_nonProductSites1_manualShortlist.json

    r33872 r33883  
    180180
    181181
     182First column: n pages that are in MRI / n sampled isMRI pages
     183Second column: n pages that do contain MRI / n sampled pages that are not isMRI yet contain MRI
    182184
    183185/* 1 */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33882 r33883  
    537537    // newlines after every array element in the json:
    538538    String jsonStr = prettyPrintJson(doc.toJson());
    539     System.err.println(jsonStr);
     539    //System.err.println(jsonStr);
    540540    try {
    541541        writer.write(jsonStr + NEWLINE);
     
    553553    }
    554554
    555 
    556     public void writeToFile(boolean append, String filename, AggregateIterable<Document> output) {
    557    
    558     // should only have one doc
    559     for (Document doc : output) {
    560         //System.out.println(doc);
    561         System.out.println(doc.toJson());
    562     }
    563     }
    564    
    565555   
    566556    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33871 r33883  
    11package org.greenstone.atea;
    22
    3 import java.util.*;
     3import java.util.*; /* includes Random */
    44import java.io.*;
    55
     
    7171         Writer writer = new BufferedWriter(new FileWriter(outFile));
    7272         ) {
    73         Collections.shuffle(urlsList);
     73        Collections.shuffle(urlsList, new Random(1000));
    7474        for (int i=0; i<numURLs; i++) {
    7575        String url = urlsList.get(i);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33882 r33883  
    77
    88/**
    9 * TO COMPILE OR RUN, FIRST DO:
     9 * Runs some of the important mongoDB queries I ran.
     10 *
     11 * TO COMPILE OR RUN, FIRST DO:
    1012 *    cd maori-lang-detection/apache-opennlp-1.9.1
    1113 *    export OPENNLP_HOME=`pwd`
     
    1820 * TO RUN:
    1921 *    maori-lang-detection/src$
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
     22 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
    2123 *
    2224*/
    2325public class WebPageURLsListing {
    2426    static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
     27    static private final long FIXED_SEED = 1000;
    2528   
    2629    private final MongoDBAccess mongodbAccess;
    2730    private File outFolder;
    2831
     32   
    2933    public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
    3034    {
     
    3236    this.outFolder = outFolder;
    3337    }
    34    
    35     public String produceURLsForPagesInMRI(File domainsFile) {
    36     return writeFile(MongoDBAccess.IS_MRI, domainsFile);
    37     }
    38    
    39     public String produceURLsForPagesContainingMRI(File domainsFile) {
    40     return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile);
    41     }
    42 
    43 
    44     public String writeFile(int filterType, File domainsFile/*, int numURLs*/) {
    45 
     38
     39    private String getFilePath(File file) {
     40    try {
     41        return file.getCanonicalPath();
     42    } catch(IOException e) {
     43        return file.getAbsolutePath();
     44    }
     45    }
     46   
     47    public void produceURLsForPagesInMRI(File domainsFile) {
     48    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     49    File outFile = new File(outFolder, "isMRI_"+domainsFile.getName());
     50    writeURLsToFile(urlsList, outFile, urlsList.size());
     51   
     52    System.out.println("Wrote all isMRI web page URLs for the sites in input domainsFile\ninto file: "
     53               + getFilePath(outFile));
     54    }
     55   
     56    public void produceURLsForPagesContainingMRI(File domainsFile) {
     57    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);   
     58    File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName());
     59    writeURLsToFile(urlsList, outFile, urlsList.size());
     60
     61    System.out.println("Wrote all containsMRI web page URLs for the sites in input domainsFile\ninto file: "
     62               + getFilePath(outFile));
     63    }
     64   
     65    private ArrayList<String> getURLsForWebPages(int filterType, File domainsFile) {
    4666    ArrayList<String> urlsList = new ArrayList<String>();
    4767   
     
    5272    try (
    5373         BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
    54          ) {
    55        
     74         ) {       
    5675       
    5776        String domain;
     
    6281            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
    6382            urlsList.addAll(moreURLs);
     83            if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) {
     84            System.out.println("Domain " + domain + " had no isMRI webpages (only containsMRI).");
     85            }
    6486        }
    6587        }
    6688    } catch(Exception e) {
    67         logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
    68         logger.error(e.getMessage(), e);
    69     }
    70 
    71     // Shuffle the urlsList, then write out the first numURLs into a file. 
    72     // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
    73     File parentFolder = domainsFile.getParentFile();
    74     //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
     89        logger.error("Unable to read URLs from file " + getFilePath(domainsFile));
     90        logger.error(e.getMessage(), e);
     91    }
     92
     93    return urlsList;
     94    }
     95   
     96    /** Given a hand curated list of NZ sites with positive numPagesContainingMRI,
     97     * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?).
     98     * Total all these pages in MRI (N), then work out the correct sample size (n)
     99     * at 90% confidence with 5% margin of error. Then generate a random listing
     100     * of n of these pages in MRI of these trusted sites and output to a file
     101     * for manual inspection. */
     102    /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing
     103     * of all their web pages IN_MRI (or CONTAINS_MRI).
     104     * Plus a listing of all the NZ pages IN_MRI. */
     105    //public void webPagesOfAllNZSitesAndDomainListing(File domainsFile) {
     106    public void mriWebPageListingForDomainListing(File domainsFile) {
     107
     108    int filterType = MongoDBAccess.IS_MRI;
     109   
     110    // for overseas websites,
     111    //produceURLsForPagesContainingMRI(handCuratedOverseasDomainsFile);
     112
     113    // 0. get a list of all the web pages in the given domain listing where isMRI = true
     114    ArrayList<String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);
     115        // produceURLsForPagesInMRI(domainsFile);
     116   
     117    // 1. calculate the population size, N, the number of all webpages in the given domain
     118    // site listing where isMRI = true.
     119    int N_totalNumPages = urlsList.size();
     120
     121    // 2. write all the URLs in urlsList to a file
     122    //File outFolder = domainsFile.getParentFile();
    75123    String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
    76     File outFile = new File(parentFolder, fileName+domainsFile.getName());
    77 
    78     // write out ALL the URLs
     124    File outFile = new File(outFolder, fileName+domainsFile.getName());
     125
     126    writeURLsToFile(urlsList, outFile, N_totalNumPages);
     127    System.out.println("Wrote out full listing of web page URLs for sites in input domainsFile"
     128               + "\ninto file: " + getFilePath(outFile));
     129       
     130    // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
     131    int n_numSampleURLs = calcSampleSize(N_totalNumPages);
     132
     133    System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
     134    System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
     135   
     136    // 4. Shuffle all the URLs and write the first n (sample size) URLs to a file
     137    // Using a constant seed for reproducibility
     138    // https://stackoverflow.com/questions/6284589/setting-a-seed-to-shuffle-arraylist-in-java-deterministically
     139    Collections.shuffle(urlsList, new Random(FIXED_SEED));
     140   
     141    outFile = new File(outFolder, "random"+n_numSampleURLs+"_"+domainsFile.getName());
     142    writeURLsToFile(urlsList, outFile, n_numSampleURLs);
     143    System.out.println("Wrote a sample of n=" + n_numSampleURLs + " of web page URLs "
     144               + "for the sites in input domainsFile\ninto file: " + getFilePath(outFile));
     145    }
     146
     147    /**
     148     * Calculates sample size n for binary outcomes at 90% confidence and 5% margin of error
     149     * for given population size N.
     150     * @return n, the sample size.
     151     */
     152    public int calcSampleSize(int N) {
     153   
     154    // calculate sample size n for population size N if using 90% confidence and 5% margin of error
     155    // https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
     156    // https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/#CI1
     157    // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
     158   
     159    double m = 0.05; // margin of error = 5%
     160    // for 90% confidence, alpha is the remainder = 10% and alpha/2 = 5%.
     161    // For 90% confidence, use the table of known z_alpha/2 values from step 1 of
     162    // https://www.statisticshowto.datasciencecentral.com/z-alpha2-za2/
     163    double z_alpha_over_2 = 1.6449;
     164
     165    // Formula: n = (z_alpha_over_2^2 * N) / (z_alpha_over_2^2 + 4*(N-1)*m^2)
     166    // see https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
     167    double n = (Math.pow(z_alpha_over_2, 2.0) * N) / (Math.pow(z_alpha_over_2, 2.0) + (4 * (N-1) * Math.pow(m,2.0)));
     168
     169    // Round up to get a whole number:
     170    return (int)Math.ceil(n);
     171    }
     172   
     173    /**
     174     * Writes out the first n URLs in urlsList into outFile.
     175     */
     176    private void writeURLsToFile(ArrayList<String> urlsList, File outFile, final int n) {
    79177    try (
    80178         Writer writer = new BufferedWriter(new FileWriter(outFile));
    81179         ) {
    82180
    83         for (int i=0; i < urlsList.size(); i++) {
     181        for (int i=0; i < n; i++) {
    84182        String url = urlsList.get(i);
    85183        //System.out.println(list.get(i));
     
    87185        }
    88186    } catch(Exception e) {
    89         logger.error("Unable to write to file " + outFile.getAbsolutePath());
    90         logger.error(e.getMessage(), e);
    91     }
    92 
    93     /*
    94     // shuffle list and take the first n - write to file
    95     try (
    96          Writer writer = new BufferedWriter(new FileWriter(outFile));
    97          ) {
    98         Collections.shuffle(urlsList);
    99         for (int i=0; i<numURLs; i++) {
    100         String url = urlsList.get(i);
    101         //System.out.println(list.get(i));
    102         writer.write(url + "\n");
    103         }
    104     } catch(Exception e) {
    105         logger.error("Unable to write to file " + outFile.getAbsolutePath());
    106         logger.error(e.getMessage(), e);
    107     }
    108     */
    109 
    110     return outFile.getAbsolutePath();
     187        logger.error("Unable to write to file " + getFilePath(outFile));
     188        logger.error(e.getMessage(), e);
     189    }
    111190    }
    112191
     
    124203    File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
    125204
    126     String filename = outFile.getAbsolutePath();
     205    String filename = getFilePath(outFile);
    127206   
    128207    try (
     
    135214        boolean isMiInURLPath = false;
    136215        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
    137 
    138         filename = outFile.getCanonicalPath();
    139     } catch(Exception e) {
    140         logger.error("Unable to write to file " + outFile.getAbsolutePath());
     216       
     217    } catch(Exception e) {
     218        logger.error("Unable to write to file " + filename);
    141219        logger.error(e.getMessage(), e);
    142220    }
     
    149227    /**
    150228     * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
    151      * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path
     229     * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path.
     230     * This listing is separate to allow easier weeding out of product sites/autotranslated
     231     * sites when eyeballing the listing output.
    152232     */
    153233    public String writeOverseasSitesWithMiInURLPath() {
    154234    File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
    155235
    156     String filename = outFile.getAbsolutePath();
     236    String filename = getFilePath(outFile);
    157237    try (
    158238         Writer writer = new BufferedWriter(new FileWriter(outFile));
     
    160240        boolean isMiInURLPath = true;
    161241        mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
    162         filename = outFile.getCanonicalPath();
    163     } catch(Exception e) {
    164         logger.error("Unable to write to file " + outFile.getAbsolutePath());
     242
     243    } catch(Exception e) {
     244        logger.error("Unable to write to file " + filename);
    165245        logger.error(e.getMessage(), e);
    166246    }
     
    171251   
    172252    public static void printUsage() {
    173     System.err.println("WebPageURLsListing [domains.txt]");
    174     }
    175 
    176     // Depending on args, generates isMRI and containsMRI file listings for:
    177     // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
    178 
    179    
     253    System.err.println("Usage: WebPageURLsListing [domains.txt]");
     254    }
     255
     256    /**
     257     * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
     258     * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
     259     * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
     260     * translated and really contain at least one webpage containing at least one sentence in MRI.
     261     * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
     262     * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
     263     * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
     264     * 90% confidence with 5% margin of error for testing binary outcomes, see
     265     * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
     266     */
    180267    public static void main(String args[]) {
    181268    if(args.length >= 2) {
     
    202289        }
    203290
    204        
    205         //int genNumURLs = Integer.parseInt(args[1]);
    206291        //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
    207292        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
    208293
    209         //listing.writeWebPagesOfAllNZSitesAndDomainListing();
     294
     295        // TODO: for NZ, do IS_MRI. For overseas still CONTAINS_MRI
     296        // then also do the shuffle to gen X num of random web page URLs.
     297        //String filename = listing.webPagesOfAllNZSitesAndDomainListing(domainsFile);
     298        listing.mriWebPageListingForDomainListing(domainsFile);
     299
     300        // TODO: generate the special table (6)
    210301       
    211302        } else {       
    212303
     304        // calculating sample size works:
     305        //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360));
     306        //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681));
     307       
    213308        String filename = listing.writeTentativeNonAutotranslatedSites();
    214309        filename = listing.writeOverseasSitesWithMiInURLPath();
     310
     311        // TODO: generate the tables
    215312        }
    216 
    217313       
    218314    } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.