Changeset 33870

Show
Ignore:
Timestamp:
24.01.2020 20:48:17 (4 weeks ago)
Author:
ak19
Message:

Got the mongodb query working in Java in 2 different ways: the fully Java way, and the way where I parse the query string and the projection string and then run the mongodb collection.find().projection() on the parsed objects. Both work now. And the RandomURLsForDomainGenerator now works fine.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33869 r33870  
    270270     * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection 
    271271     * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find 
     272     * 
     273     * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java 
     274     * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
     275     * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 
     276     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 
    272277*/ 
    273     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
     278    public ArrayList<String> oldWorks_queryAllMatchingIsMRIURLs(String domain) { 
    274279     
    275280    final ArrayList<String> urlsList = new ArrayList<String>(); 
     281     
     282    // remove any http(s)://(www.) from the start of URL first 
     283    // since it goes into a regex 
     284    domain = Utility.stripProtocolAndWWWFromURL(domain); 
    276285 
    277286    // load the "webpages" db table 
     
    280289     
    281290    //Pattern pattern = Pattern.compile(".*"+domain+".*"); 
     291 
     292    // escape dots in domain for regex 
    282293    String pattern = "/"+domain.replace(".", "\\.")+"/"; 
    283294     
     
    291302            String url = document.getString("URL"); 
    292303            // add to our urlsList 
     304            System.out.println(url); 
    293305            urlsList.add(url); 
    294306        } 
     
    301313    //collection.find(eq("isMRI", true)).first(); 
    302314    // 
    303     // db.getCollection('Webpages').find({URL:/.*domain.*/, isMRI: true}, {URL: 1, _id: 0}) 
     315    // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
    304316    collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
    305317 
     
    307319    return urlsList; 
    308320    } 
     321 
     322    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
     323     
     324    final ArrayList<String> urlsList = new ArrayList<String>(); 
     325     
     326    // remove any http(s)://(www.) from the start of URL first 
     327    // since it goes into a regex 
     328    domain = Utility.stripProtocolAndWWWFromURL(domain); 
     329 
     330    // load the "webpages" db table 
     331    // in mongodb, the equivalent of db tables are called 'collections' 
     332     
     333     
     334    //Pattern pattern = Pattern.compile(".*"+domain+".*"); 
     335 
     336    // escape dots in domain for regex 
     337    String pattern = "/"+domain.replace(".", "\\.")+"/"; 
     338     
     339    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
     340 
     341 
     342    Block<Document> storeURL = new Block<Document>() { 
     343        @Override 
     344        public void apply(final Document document) { 
     345            //System.out.println(document.toJson()); 
     346            String url = document.getString("URL"); 
     347            // add to our urlsList 
     348            System.out.println(url); 
     349            urlsList.add(url); 
     350        } 
     351        }; 
     352 
     353 
     354     
     355    // do mongodb query 
     356    // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0}) 
     357    String query = "{URL: /DOMAIN/, isMRI: true}"; 
     358    query = query.replace("DOMAIN", domain); 
     359     
     360    BasicDBObject findObj = BasicDBObject.parse(query); 
     361    BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}"); 
     362     
     363    //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
     364    collection.find(findObj).projection(projectionObj).forEach(storeURL); 
     365     
     366    return urlsList; 
     367    } 
     368 
     369     
    309370     
    310371    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/RandomURLsForDomainGenerator.java

    r33869 r33870  
    6060    } 
    6161 
    62     // copy into array 
    63     /* 
    64     urls = new String[urlsList.size()]; 
    65     String[] urls = urlsList.toArray(urls); 
    66     urlsList.clear(); 
    67     */ 
    68  
    69     /* 
    70     // 2. generate numURLs of UNIQUE numbers between 0 to urls.length 
    71     // https://stackoverflow.com/questions/8115722/generating-unique-random-numbers-in-java 
    72     // https://www.geeksforgeeks.org/iterator-vs-foreach-in-java/ 
    73     // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 
    74      
    75      
    76     // 3. then for each number, write the url at that index in array urls into file. 
    77     */ 
    78  
    79     // Shuffle the urlsList, then write out the first numURLs into a file. 
    80      
    81     // BETTER: https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 
     62    // Shuffle the urlsList, then write out the first numURLs into a file.   
     63    // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 
    8264    File parentFolder = domainsFile.getParentFile(); 
    8365    String fileName = domainsFile.getName(); 
    8466    File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName); 
     67    File fullSetOutFile = new File(parentFolder, "all_"+fileName); 
    8568     
    8669    // shuffle list and take the first n