Changeset 33870


Ignore:
Timestamp:
2020-01-24T20:48:17+13:00 (4 years ago)
Author:
ak19
Message:

Got the mongodb query working in Java in 2 different ways: the fully Java way, and the way where I parse the query string and the projection string and then run the mongodb collection.find().projection() on the parsed objects. Both work now. And the RandomURLsForDomainGenerator now works fine.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33869 r33870  
    270270     * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
    271271     * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
     272     *
     273     * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
     274     * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
     275     * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
     276     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
    272277*/
    273     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     278    public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) {
    274279   
    275280    final ArrayList<String> urlsList = new ArrayList<String>();
     281   
     282    // remove any http(s)://(www.) from the start of URL first
     283    // since it goes into a regex
     284    domain = Utility.stripProtocolAndWWWFromURL(domain);
    276285
    277286    // load the "webpages" db table
     
    280289   
    281290    //Pattern pattern = Pattern.compile(".*"+domain+".*");
     291
     292    // escape dots in domain for regex
    282293    String pattern = "/"+domain.replace(".", "\\.")+"/";
    283294   
     
    291302            String url = document.getString("URL");
    292303            // add to our urlsList
     304            System.out.println(url);
    293305            urlsList.add(url);
    294306        }
     
    301313    //collection.find(eq("isMRI", true)).first();
    302314    //
    303     // db.getCollection('Webpages').find({URL:/.*domain.*/, isMRI: true}, {URL: 1, _id: 0})
     315    // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
    304316    collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
    305317
     
    307319    return urlsList;
    308320    }
     321
     322    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     323   
     324    final ArrayList<String> urlsList = new ArrayList<String>();
     325   
     326    // remove any http(s)://(www.) from the start of URL first
     327    // since it goes into a regex
     328    domain = Utility.stripProtocolAndWWWFromURL(domain);
     329
     330    // load the "webpages" db table
     331    // in mongodb, the equivalent of db tables are called 'collections'
     332   
     333   
     334    //Pattern pattern = Pattern.compile(".*"+domain+".*");
     335
     336    // escape dots in domain for regex
     337    String pattern = "/"+domain.replace(".", "\\.")+"/";
     338   
     339    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
     340
     341
     342    Block<Document> storeURL = new Block<Document>() {
     343        @Override
     344        public void apply(final Document document) {
     345            //System.out.println(document.toJson());
     346            String url = document.getString("URL");
     347            // add to our urlsList
     348            System.out.println(url);
     349            urlsList.add(url);
     350        }
     351        };
     352
     353
     354   
     355    // do mongodb query
     356    // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
     357    String query = "{URL: /DOMAIN/, isMRI: true}";
     358    query = query.replace("DOMAIN", domain);
     359   
     360    BasicDBObject findObj = BasicDBObject.parse(query);
     361    BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
     362   
     363    //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
     364    collection.find(findObj).projection(projectionObj).forEach(storeURL);
     365   
     366    return urlsList;
     367    }
     368
     369   
    309370   
    310371    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33869 r33870  
    6060    }
    6161
    62     // copy into array
    63     /*
    64     urls = new String[urlsList.size()];
    65     String[] urls = urlsList.toArray(urls);
    66     urlsList.clear();
    67     */
    68 
    69     /*
    70     // 2. generate numURLs of UNIQUE numbers between 0 to urls.length
    71     // https://stackoverflow.com/questions/8115722/generating-unique-random-numbers-in-java
    72     // https://www.geeksforgeeks.org/iterator-vs-foreach-in-java/
    73     // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
    74    
    75    
    76     // 3. then for each number, write the url at that index in array urls into file.
    77     */
    78 
    79     // Shuffle the urlsList, then write out the first numURLs into a file.
    80    
    81     // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
     62    // Shuffle the urlsList, then write out the first numURLs into a file. 
     63    // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
    8264    File parentFolder = domainsFile.getParentFile();
    8365    String fileName = domainsFile.getName();
    8466    File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
     67    File fullSetOutFile = new File(parentFolder, "all_"+fileName);
    8568   
    8669    // shuffle list and take the first n
Note: See TracChangeset for help on using the changeset viewer.