Changeset 33870 for other-projects
- Timestamp:
- 2020-01-24T20:48:17+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33869 r33870 270 270 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection 271 271 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find 272 * 273 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java 274 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 275 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 276 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 272 277 */ 273 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {278 public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) { 274 279 275 280 final ArrayList<String> urlsList = new ArrayList<String>(); 281 282 // remove any http(s)://(www.) from the start of URL first 283 // since it goes into a regex 284 domain = Utility.stripProtocolAndWWWFromURL(domain); 276 285 277 286 // load the "webpages" db table … … 280 289 281 290 //Pattern pattern = Pattern.compile(".*"+domain+".*"); 291 292 // escape dots in domain for regex 282 293 String pattern = "/"+domain.replace(".", "\\.")+"/"; 283 294 … … 291 302 String url = document.getString("URL"); 292 303 // add to our urlsList 304 System.out.println(url); 293 305 urlsList.add(url); 294 306 } … … 301 313 //collection.find(eq("isMRI", true)).first(); 302 314 // 303 // db.getCollection('Webpages').find({URL: /.*domain.*/, isMRI: true}, {URL: 1, _id: 0})315 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 304 316 collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 305 317 … … 307 319 return urlsList; 308 320 } 321 322 public ArrayList<String> 
queryAllMatchingIsMRIURLs(String domain) { 323 324 final ArrayList<String> urlsList = new ArrayList<String>(); 325 326 // remove any http(s)://(www.) from the start of URL first 327 // since it goes into a regex 328 domain = Utility.stripProtocolAndWWWFromURL(domain); 329 330 // load the "webpages" db table 331 // in mongodb, the equivalent of db tables are called 'collections' 332 333 334 //Pattern pattern = Pattern.compile(".*"+domain+".*"); 335 336 // escape dots in domain for regex 337 String pattern = "/"+domain.replace(".", "\\.")+"/"; 338 339 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 340 341 342 Block<Document> storeURL = new Block<Document>() { 343 @Override 344 public void apply(final Document document) { 345 //System.out.println(document.toJson()); 346 String url = document.getString("URL"); 347 // add to our urlsList 348 System.out.println(url); 349 urlsList.add(url); 350 } 351 }; 352 353 354 355 // do mongodb query 356 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 357 String query = "{URL: /DOMAIN/, isMRI: true}"; 358 query = query.replace("DOMAIN", domain); 359 360 BasicDBObject findObj = BasicDBObject.parse(query); 361 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); 362 363 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 364 collection.find(findObj).projection(projectionObj).forEach(storeURL); 365 366 return urlsList; 367 } 368 369 309 370 310 371 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ -
other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java
r33869 r33870 60 60 } 61 61 62 // copy into array 63 /* 64 urls = new String[urlsList.size()]; 65 String[] urls = urlsList.toArray(urls); 66 urlsList.clear(); 67 */ 68 69 /* 70 // 2. generate numURLs of UNIQUE numbers between 0 to urls.length 71 // https://stackoverflow.com/questions/8115722/generating-unique-random-numbers-in-java 72 // https://www.geeksforgeeks.org/iterator-vs-foreach-in-java/ 73 // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 74 75 76 // 3. then for each number, write the url at that index in array urls into file. 77 */ 78 79 // Shuffle the urlsList, then write out the first numURLs into a file. 80 81 // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 62 // Shuffle the urlsList, then write out the first numURLs into a file. 63 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 82 64 File parentFolder = domainsFile.getParentFile(); 83 65 String fileName = domainsFile.getName(); 84 66 File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName); 67 File fullSetOutFile = new File(parentFolder, "all_"+fileName); 85 68 86 69 // shuffle list and take the first n
Note:
See TracChangeset
for help on using the changeset viewer.