Changeset 33917 for other-projects


Ignore:
Timestamp:
2020-02-13T18:18:13+13:00 (4 years ago)
Author:
ak19
Message:

Added some better reporting when confirming sample size was correct

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBQueryer.java

    r33913 r33917  
    177177    return urlsList;
    178178    }
    179 
     179   
     180    /**
     181     * Counts the documents in the collection returned by getWebpagesCollection() that match the filter, i.e. the equivalent of:
     182     *    db.getCollection('Webpages').find({isMRI: true}).count()    (or {containsMRI: true} for the other filter)
     183     * @param filterType IS_MRI selects the query {isMRI: true}; any other value (e.g. CONTAINS_MRI) selects {containsMRI: true}.
     184     * @return the number of webpages that matched the filterType setting, or -1 if an exception was thrown (it is logged, not rethrown).
     185    */
     186    public long countOfWebpagesMatching(int filterType) {
     187    String query = (filterType == IS_MRI) ? "{isMRI: true}" : "{containsMRI: true}";
     188    long result = -1;   
     189    MongoCollection<Document> collection = getWebpagesCollection();
     190    // result keeps its -1 sentinel if anything inside the try block below throws.
     191   
     192    try {
     193        BasicDBObject queryObj = BasicDBObject.parse(query);
     194        // Earlier form, kept for reference: result = collection.find(queryObj).count();
     195        // countDocuments() is used instead, see https://stackoverflow.com/questions/32683458/how-to-call-count-operation-after-find-with-mongodb-java-driver
     196        result = collection.countDocuments(queryObj);
     197        // NOTE(review): the catch below also covers countDocuments() failures, yet its message only mentions parsing and e itself is not logged - confirm this is intended.
     198    } catch(Exception e) {
     199        logger.error("MongoDB couldn't parse provided query " + query);
     200    }
     201   
     202    return result;
     203    }
     204   
    180205    /**     
    181206     * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
  • other-projects/maori-lang-detection/src/org/greenstone/atea/SummaryTool.java

    r33911 r33917  
    148148    // 2. write all the URLs in urlsList to a file
    149149    //File outFolder = domainsFile.getParentFile();
    150     String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_";
    151     File outFile = new File(outFolder, fileName+domainsFile.getName());
     150    String filterName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI" : "containsMRI";
     151    File outFile = new File(outFolder, filterName+"_"+domainsFile.getName());
    152152
    153153    writeURLsToFile(urlsList, outFile, N_totalNumPages);
     
    156156       
    157157    // 3. calculate sample size n for population size N if using 90% confidence and 5% margin of error
    158     int n_numSampleURLs = calcSampleSize(N_totalNumPages);
    159 
    160     System.err.println("*** N, total number of web pages that matched: " + N_totalNumPages);
     158    int n_numSampleURLs = calcSampleSize(N_totalNumPages);
     159
     160    System.err.println("*** N, total number of web pages for which " + filterName + "=true from domain shortlist: " + N_totalNumPages);
     161    System.err.println("    (out of " + mongodbQueryer.countOfWebpagesMatching(filterType)
     162               + " web pages across ALL sites for which " + filterName + " = true)");
    161163    System.err.println("*** n, sample size of web page URLs: " + n_numSampleURLs);
    162164   
Note: See TracChangeset for help on using the changeset viewer.