Changeset 33871

Show
Ignore:
Timestamp:
24.01.2020 20:59:42 (4 weeks ago)
Author:
ak19
Message:

Removed the mostly duplicated older version of the method, but left the parts that differ commented out inside the new method. Improved the regex used in the MongoDB find query.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33870 r33871  
    276276     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 
    277277*/ 
    278     public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) { 
     278    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
    279279     
    280280    final ArrayList<String> urlsList = new ArrayList<String>(); 
     
    285285 
    286286    // load the "webpages" db table 
    287     // in mongodb, the equivalent of db tables are called 'collections' 
    288      
    289      
    290     //Pattern pattern = Pattern.compile(".*"+domain+".*"); 
    291  
    292     // escape dots in domain for regex 
    293     String pattern = "/"+domain.replace(".", "\\.")+"/"; 
    294      
     287    // in mongodb, the equivalent of db tables are called 'collections'  
    295288    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
    296289 
    297  
     290    // code we'll execute in Iterable.forEach() below 
    298291    Block<Document> storeURL = new Block<Document>() { 
    299292        @Override 
     
    302295            String url = document.getString("URL"); 
    303296            // add to our urlsList 
    304             System.out.println(url); 
     297            //System.out.println(url); 
    305298            urlsList.add(url); 
    306299        } 
     
    309302 
    310303     
    311     // do mongodb query: 
    312     // test example: 
    313     //collection.find(eq("isMRI", true)).first(); 
    314     // 
    315     // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
    316     collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
    317  
    318      
    319     return urlsList; 
    320     } 
    321  
    322     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
    323      
    324     final ArrayList<String> urlsList = new ArrayList<String>(); 
    325      
    326     // remove any http(s)://(www.) from the start of URL first 
    327     // since it goes into a regex 
    328     domain = Utility.stripProtocolAndWWWFromURL(domain); 
    329  
    330     // load the "webpages" db table 
    331     // in mongodb, the equivalent of db tables are called 'collections' 
    332      
    333      
    334     //Pattern pattern = Pattern.compile(".*"+domain+".*"); 
    335  
    336     // escape dots in domain for regex 
    337     String pattern = "/"+domain.replace(".", "\\.")+"/"; 
    338      
    339     MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
    340  
    341  
    342     Block<Document> storeURL = new Block<Document>() { 
    343         @Override 
    344         public void apply(final Document document) { 
    345             //System.out.println(document.toJson()); 
    346             String url = document.getString("URL"); 
    347             // add to our urlsList 
    348             System.out.println(url); 
    349             urlsList.add(url); 
    350         } 
    351         }; 
    352  
    353  
    354      
    355     // do mongodb query 
    356     // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
     304    // Run the following mongodb query: 
     305    //    db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
     306     
     307    // 1. One way that works: 
     308    //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
     309 
     310    // 2. Another way: 
    357311    String query = "{URL: /DOMAIN/, isMRI: true}"; 
     312    domain = domain.replace(".", "\\."); // escape dots in domain for regex 
    358313    query = query.replace("DOMAIN", domain); 
     314 
     315    //System.err.println("Executing find query: " + query); 
    359316     
    360317    BasicDBObject findObj = BasicDBObject.parse(query); 
    361318    BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); 
    362319     
    363     //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
     320 
    364321    collection.find(findObj).projection(projectionObj).forEach(storeURL); 
    365322     
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33870 r33871  
    1818 * TO RUN: 
    1919 *    maori-lang-detection/src$ 
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../domains.txt 
     20 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/RandomURLsForDomainGenerator ../mongodb-data/domainsNZ_IsMRI.txt  255 
    2121 * 
    2222*/ 
     
    6565    String fileName = domainsFile.getName(); 
    6666    File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName); 
    67     File fullSetOutFile = new File(parentFolder, "all_"+fileName); 
     67    //File fullSetOutFile = new File(parentFolder, "allPages_"+fileName); 
    6868     
    69     // shuffle list and take the first n 
     69    // shuffle list and take the first n - write to file 
    7070    try ( 
    7171         Writer writer = new BufferedWriter(new FileWriter(outFile));