Changeset 33871
- Timestamp:
- 2020-01-24T20:59:42+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33870 r33871 276 276 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 277 277 */ 278 public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) {278 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 279 279 280 280 final ArrayList<String> urlsList = new ArrayList<String>(); … … 285 285 286 286 // load the "webpages" db table 287 // in mongodb, the equivalent of db tables are called 'collections' 288 289 290 //Pattern pattern = Pattern.compile(".*"+domain+".*"); 291 292 // escape dots in domain for regex 293 String pattern = "/"+domain.replace(".", "\\.")+"/"; 294 287 // in mongodb, the equivalent of db tables are called 'collections' 295 288 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 296 289 297 290 // code we'll execute in Iterable.forEach() below 298 291 Block<Document> storeURL = new Block<Document>() { 299 292 @Override … … 302 295 String url = document.getString("URL"); 303 296 // add to our urlsList 304 System.out.println(url);297 //System.out.println(url); 305 298 urlsList.add(url); 306 299 } … … 309 302 310 303 311 // do mongodb query: 312 // test example: 313 //collection.find(eq("isMRI", true)).first(); 314 // 315 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 316 collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 317 318 319 return urlsList; 320 } 321 322 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 323 324 final ArrayList<String> urlsList = new ArrayList<String>(); 325 326 // remove any http(s)://(www.) 
from the start of URL first 327 // since it goes into a regex 328 domain = Utility.stripProtocolAndWWWFromURL(domain); 329 330 // load the "webpages" db table 331 // in mongodb, the equivalent of db tables are called 'collections' 332 333 334 //Pattern pattern = Pattern.compile(".*"+domain+".*"); 335 336 // escape dots in domain for regex 337 String pattern = "/"+domain.replace(".", "\\.")+"/"; 338 339 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 340 341 342 Block<Document> storeURL = new Block<Document>() { 343 @Override 344 public void apply(final Document document) { 345 //System.out.println(document.toJson()); 346 String url = document.getString("URL"); 347 // add to our urlsList 348 System.out.println(url); 349 urlsList.add(url); 350 } 351 }; 352 353 354 355 // do mongodb query 356 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 304 // Run the following mongodb query: 305 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 306 307 // 1. One way that works: 308 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 309 310 // 2. Another way: 357 311 String query = "{URL: /DOMAIN/, isMRI: true}"; 312 domain = domain.replace(".", "\\."); // escape dots in domain for regex 358 313 query = query.replace("DOMAIN", domain); 314 315 //System.err.println("Executing find query: " + query); 359 316 360 317 BasicDBObject findObj = BasicDBObject.parse(query); 361 318 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); 362 319 363 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 320 364 321 collection.find(findObj).projection(projectionObj).forEach(storeURL); 365 322 -
other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java
r33870 r33871 18 18 * TO RUN: 19 19 * maori-lang-detection/src$ 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../ domains.txt20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../mongodb-data/domainsNZ_IsMRI.txt 255 21 21 * 22 22 */ … … 65 65 String fileName = domainsFile.getName(); 66 66 File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName); 67 File fullSetOutFile = new File(parentFolder, "all_"+fileName);67 //File fullSetOutFile = new File(parentFolder, "allPages_"+fileName); 68 68 69 // shuffle list and take the first n 69 // shuffle list and take the first n - write to file 70 70 try ( 71 71 Writer writer = new BufferedWriter(new FileWriter(outFile));
Note: See TracChangeset for help on using the changeset viewer.