Changeset 33917

Show
Ignore:
Timestamp:
13.02.2020 18:18:13 (5 days ago)
Author:
ak19
Message:

Added some better reporting when confirming sample size was correct

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33913 r33917  
    177177    return urlsList; 
    178178    } 
    179  
     179     
     180    /** 
     181     * Does a mongoDB query like the following, depending on filter type: 
     182     *    db.getCollection('Webpages').find({isMRI: true}).count() 
     183     * @param filterType can be either IS_MRI or CONTAINS_MRI. 
     184     * @return the number of webpages that matched the filterType setting. 
     185    */ 
     186    public long countOfWebpagesMatching(int filterType) { 
     187    String query = (filterType == IS_MRI) ? "{isMRI: true}" : "{containsMRI: true}"; 
     188    long result = -1;    
     189    MongoCollection<Document> collection = getWebpagesCollection(); 
     190 
     191     
     192    try { 
     193        BasicDBObject queryObj = BasicDBObject.parse(query); 
     194        //result = collection.find(queryObj).count(); 
     195        // https://stackoverflow.com/questions/32683458/how-to-call-count-operation-after-find-with-mongodb-java-driver 
     196        result = collection.countDocuments(queryObj); 
     197         
     198    } catch(Exception e) { 
     199        logger.error("MongoDB couldn't parse provided query " + query); 
     200    } 
     201     
     202    return result; 
     203    } 
     204     
    180205    /**      
    181206     * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java

    r33911 r33917  
    148148    // 2. write all the URLs in urlsList to a file 
    149149    //File outFolder = domainsFile.getParentFile(); 
    150     String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_"; 
    151     File outFile = new File(outFolder, fileName+domainsFile.getName()); 
     150    String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI"; 
     151    File outFile = new File(outFolder, filterName+"_"+domainsFile.getName()); 
    152152 
    153153    writeURLsToFile(urlsList, outFile, N_totalNumPages); 
     
    156156         
    157157    // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error 
    158     int n_numSampleURLs = calcSampleSize(N_totalNumPages);  
    159  
    160     System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages); 
     158    int n_numSampleURLs = calcSampleSize(N_totalNumPages); 
     159 
     160    System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist: " + N_totalNumPages); 
     161    System.err.println("    (out of " + mongodbQueryer.countOfWebpagesMatching(filterType) 
     162               + " web pages across ALL sites for which " + filterName + " = true)"); 
    161163    System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs); 
    162164