Changeset 33909 for other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
- Timestamp: 2020-02-12T19:02:44+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33906 r33909 29 29 static private final long FIXED_SEED = 1000; 30 30 31 private final MongoDB Access mongodbAccess;31 private final MongoDBQueryer mongodbQueryer; 32 32 private File outFolder; 33 33 … … 49 49 50 50 51 public WebPageURLsListing(MongoDB Access mongodbAccess, File outFolder)51 public WebPageURLsListing(MongoDBQueryer mongodbQueryer, File outFolder) 52 52 { 53 this.mongodb Access = mongodbAccess;53 this.mongodbQueryer = mongodbQueryer; 54 54 this.outFolder = outFolder; 55 55 } … … 57 57 58 58 public void produceURLsForPagesInMRI(File domainsFile) { 59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDB Access.IS_MRI, domainsFile);59 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile); 60 60 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 61 61 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 66 66 67 67 public void produceURLsForPagesContainingMRI(File domainsFile) { 68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDB Access.CONTAINS_MRI, domainsFile);68 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.CONTAINS_MRI, domainsFile); 69 69 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 70 70 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 97 97 domain = domain.substring(0, index); 98 98 } 99 ArrayList<String> moreURLs = mongodb Access.queryAllMatchingURLsFilteredBy(domain, filterType);99 ArrayList<String> moreURLs = mongodbQueryer.queryAllMatchingURLsFilteredBy(domain, filterType); 100 100 101 101 // Print out whether there were no isMRI pages for the domain (only containsMRI). 
A useful thing to know 102 if(moreURLs.size() == 0 && filterType == MongoDB Access.IS_MRI) {102 if(moreURLs.size() == 0 && filterType == MongoDBQueryer.IS_MRI) { 103 103 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI."); 104 104 } … … 133 133 public void mriWebPageListingForDomainListing(File domainsFile) { 134 134 135 int filterType = MongoDB Access.IS_MRI;135 int filterType = MongoDBQueryer.IS_MRI; 136 136 137 137 // for overseas websites, … … 139 139 140 140 // 0. get a list of all the web pages in the given domain listing where isMRI = true 141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDB Access.IS_MRI, domainsFile);141 ArrayList<Tuple> urlsList = getURLsForAllWebPagesInSiteListing(MongoDBQueryer.IS_MRI, domainsFile); 142 142 // produceURLsForPagesInMRI(domainsFile); 143 143 … … 148 148 // 2. write all the URLs in urlsList to a file 149 149 //File outFolder = domainsFile.getParentFile(); 150 String fileName = (filterType == MongoDB Access.IS_MRI) ? "isMRI_" : "containsMRI_";150 String fileName = (filterType == MongoDBQueryer.IS_MRI) ? "isMRI_" : "containsMRI_"; 151 151 File outFile = new File(outFolder, fileName+domainsFile.getName()); 152 152 … … 219 219 220 220 /* ---------------------------------------- */ 221 222 221 /** 223 * Create the file 5counts_tentativeNonAutotranslatedSites.json 222 * Create the file 5counts_containsMRISites_allNZGrouped.json 223 * that contains the count and domains for NZ sites (NZ origin or nz TLD) with pages 224 * that CONTAIN_MRI, followed by counts and domains listing for overseas sites 225 * that CONTAIN_MRI. 
226 * @return full path of file generated 227 */ 228 public String writeContainsMRISites_nzSitesAndTLDsGrouped() { 229 230 File outFile = new File(outFolder, "5counts_containsMRISites_allNZGrouped.json"); 231 232 String filename = Utility.getFilePath(outFile); 233 234 try ( 235 Writer writer = new BufferedWriter(new FileWriter(outFile)); 236 ) { 237 // first write out NZ sites and .nz TLD count and domains 238 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI); 239 // next write out all overseas sites (not NZ origin or .nz TLD) 240 // that have no "mi" in the URL path as mi.* or */mi 241 boolean isMiInURLPath = false; 242 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI); 243 244 } catch(Exception e) { 245 logger.error("Unable to write to file " + filename); 246 logger.error(e.getMessage(), e); 247 } 248 249 System.err.println("*** Wrote file: " + filename); 250 251 return filename; 252 } 253 254 /** 255 * Create the file 5a_counts_tentativeNonAutotranslatedSites.json 224 256 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI 225 257 * followed by counts and domain listing for overseas sites that are either from Australia … … 237 269 ) { 238 270 // first write out NZ sites and .nz TLD count and domains 239 mongodb Access.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);271 mongodbQueryer.aggregateContainsMRIForNZ(writer, MongoDBQueryer.CONTAINS_MRI); 240 272 // next write out all overseas sites (not NZ origin or .nz TLD) 241 273 // that have no "mi" in the URL path as mi.* or */mi 242 274 boolean isMiInURLPath = false; 243 mongodb Access.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);275 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath); 244 276 245 277 } catch(Exception e) { … … 254 286 255 287 /** 288 * Create the file 5b_counts_overseasSitesWithMiInPath.json 256 289 * 
Listing of the remainder of overseas sites that CONTAIN_MRI not included by 257 290 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path. … … 267 300 ) { 268 301 boolean isMiInURLPath = true; 269 mongodb Access.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);302 mongodbQueryer.aggregateContainsMRIForOverseas(writer, MongoDBQueryer.CONTAINS_MRI, isMiInURLPath); 270 303 271 304 } catch(Exception e) { … … 300 333 301 334 try ( 302 MongoDB Access mongodb = new MongoDBAccess();335 MongoDBQueryer mongodb = new MongoDBQueryer(); 303 336 ) { 304 337 … … 335 368 //System.err.println("For N = " + 4360 + ", n = " + listing.calcSampleSize(4360)); 336 369 //System.err.println("For N = " + 681 + ", n = " + listing.calcSampleSize(681)); 337 338 String filename = listing.writeTentativeNonAutotranslatedSites(); 370 371 // get all sites where >0 pages have containsMRI=true 372 // grouping NZ sites and .nz TLDs together and remainder under overseas 373 // geolocations. 374 String filename = listing.writeContainsMRISites_nzSitesAndTLDsGrouped(); 375 376 // separately: 377 // - all NZ containsMRI + overseas tentative non-product sites with containMRI 378 // - overseas tentative product sites with containMRI 379 filename = listing.writeTentativeNonAutotranslatedSites(); 339 380 filename = listing.writeOverseasSitesWithMiInURLPath(); 340 381
Note: See TracChangeset for help on using the changeset viewer.