Changeset 33871


Ignore:
Timestamp:
2020-01-24T20:59:42+13:00 (4 years ago)
Author:
ak19
Message:

Removed the mostly duplicated older version of the method, but left the parts that differed commented out inside the new method. Improved the regex in the MongoDB find query.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33870 r33871  
    276276     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
    277277*/
    278     public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) {
     278    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
    279279   
    280280    final ArrayList<String> urlsList = new ArrayList<String>();
     
    285285
    286286    // load the "webpages" db table
    287     // in mongodb, the equivalent of db tables are called 'collections'
    288    
    289    
    290     //Pattern pattern = Pattern.compile(".*"+domain+".*");
    291 
    292     // escape dots in domain for regex
    293     String pattern = "/"+domain.replace(".", "\\.")+"/";
    294    
     287    // in mongodb, the equivalent of db tables are called 'collections'
    295288    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
    296289
    297 
     290    // code we'll execute in Iterable.forEach() below
    298291    Block<Document> storeURL = new Block<Document>() {
    299292        @Override
     
    302295            String url = document.getString("URL");
    303296            // add to our urlsList
    304             System.out.println(url);
     297            //System.out.println(url);
    305298            urlsList.add(url);
    306299        }
     
    309302
    310303   
    311     // do mongodb query:
    312     // test example:
    313     //collection.find(eq("isMRI", true)).first();
    314     //
    315     // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
    316     collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
    317 
    318    
    319     return urlsList;
    320     }
    321 
    322     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
    323    
    324     final ArrayList<String> urlsList = new ArrayList<String>();
    325    
    326     // remove any http(s)://(www.) from the start of URL first
    327     // since it goes into a regex
    328     domain = Utility.stripProtocolAndWWWFromURL(domain);
    329 
    330     // load the "webpages" db table
    331     // in mongodb, the equivalent of db tables are called 'collections'
    332    
    333    
    334     //Pattern pattern = Pattern.compile(".*"+domain+".*");
    335 
    336     // escape dots in domain for regex
    337     String pattern = "/"+domain.replace(".", "\\.")+"/";
    338    
    339     MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
    340 
    341 
    342     Block<Document> storeURL = new Block<Document>() {
    343         @Override
    344         public void apply(final Document document) {
    345             //System.out.println(document.toJson());
    346             String url = document.getString("URL");
    347             // add to our urlsList
    348             System.out.println(url);
    349             urlsList.add(url);
    350         }
    351         };
    352 
    353 
    354    
    355     // do mongodb query
    356     // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
     304    // Run the following mongodb query:
     305    //    db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
     306   
     307    // 1. One way that works:
     308    //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
     309
     310    // 2. Another way:
    357311    String query = "{URL: /DOMAIN/, isMRI: true}";
     312    domain = domain.replace(".", "\\."); // escape dots in domain for regex
    358313    query = query.replace("DOMAIN", domain);
     314
     315    //System.err.println("Executing find query: " + query);
    359316   
    360317    BasicDBObject findObj = BasicDBObject.parse(query);
    361318    BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
    362319   
    363     //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
     320
    364321    collection.find(findObj).projection(projectionObj).forEach(storeURL);
    365322   
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33870 r33871  
    1818 * TO RUN:
    1919 *    maori-lang-detection/src$
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../domains.txt
     20 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../mongodb-data/domainsNZ_IsMRI.txt  255
    2121 *
    2222*/
     
    6565    String fileName = domainsFile.getName();
    6666    File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
    67     File fullSetOutFile = new File(parentFolder, "all_"+fileName);
     67    //File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);
    6868   
    69     // shuffle list and take the first n
     69    // shuffle list and take the first n - write to file
    7070    try (
    7171         Writer writer = new BufferedWriter(new FileWriter(outFile));
Note: See TracChangeset for help on using the changeset viewer.