Changeset 33906 for other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
- Timestamp: 2020-02-05T23:36:37+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33887 r33906 57 57 58 58 public void produceURLsForPagesInMRI(File domainsFile) { 59 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.IS_MRI, domainsFile);59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 60 60 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 61 61 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 66 66 67 67 public void produceURLsForPagesContainingMRI(File domainsFile) { 68 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.CONTAINS_MRI, domainsFile); 69 69 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 70 70 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 74 74 } 75 75 76 private ArrayList<Tuple> getURLsFor WebPages(int filterType, File domainsFile) {76 private ArrayList<Tuple> getURLsForAllWebPagesInSiteListing(int filterType, File domainsFile) { 77 77 ArrayList<Tuple> urlsList = new ArrayList<Tuple>(); 78 78 … … 120 120 } 121 121 122 /** Given a hand curated list of NZ sites with positive numPagesContainingMRI, 123 * get a listing of all their web pages IN_MRI (or CONTAINS_MRI?). 124 * Total all these pages in MRI (N), then work out the correct sample size (n) 122 /** Given a hand curated list of all sites with positive numPagesContainingMRI 123 * determined by manual inspection, get a listing of all their web pages that 124 * are IN_MRI (or CONTAINS_MRI?). 125 * Total all these pages that are inMRI (N), then work out the correct sample size (n) 125 126 * at 90% confidence with 5% margin of error. Then generate a random listing 126 127 * of n of these pages in MRI of these trusted sites and output to a file 127 * for manual inspection . */128 * for manual inspection of the sample webpage URLs at page-level. 
*/ 128 129 /* OLD: Given a hand curated list of non-NZ sites that CONTAINS_MRI, get a listing 129 130 * of all their web pages IN_MRI (or CONTAINS_MRI). … … 138 139 139 140 // 0. get a list of all the web pages in the given domain listing where isMRI = true 140 ArrayList<Tuple> urlsList = getURLsFor WebPages(MongoDBAccess.IS_MRI, domainsFile);141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBAccess.IS_MRI, domainsFile); 141 142 // produceURLsForPagesInMRI(domainsFile); 142 143
Note: See TracChangeset for help on using the changeset viewer.