Changeset 33881

Show
Ignore:
Timestamp:
30.01.2020 22:08:00 (3 weeks ago)
Author:
ak19
Message:

Uses a lambda expression to process each doc in a mongodb aggregate result. Hopefully this means each result Document is processed as soon as it is obtained, which would be more efficient than still storing all the results first and then processing them, even though that earlier code is more legible.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33880 r33881  
    4242import java.io.FileReader; 
    4343import java.io.IOException; 
     44import java.io.UncheckedIOException; 
    4445import java.io.Writer; 
    4546 
     
    321322 
    322323    // code we'll execute in Iterable.forEach() below 
     324    // see also https://www.baeldung.com/foreach-java 
    323325    Block<Document> storeURL = new Block<Document>() { 
    324326        @Override 
     
    362364    } 
    363365 
    364     /**        
    365        The mongodb aggregate() we want to run this time: 
    366  
     366    /**      
     367     * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: 
     368     * 
     369     * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 
     370     * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
     371     * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 
     372     * 
     373     * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 
     374     * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 
     375     * On using group(TExpression) inside collection.aggregate(). 
     376     * 
     377     *  For forEach lambda expressions, see also https://www.baeldung.com/foreach-java 
     378     *  and https://www.javatpoint.com/java-8-foreach 
     379     *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 
     380     * 
     381     * 
     382     * The mongodb aggregate() we want to run this time: 
     383     * 
    367384       db.Websites.aggregate([ 
    368385       { 
     
    400417        orQuery); 
    401418     
    402     AggregateIterable<Document> output 
    403         = collection.aggregate(Arrays.asList( 
     419    // Hopefully the lambda expression (forEach()) at end means 
     420    // we write out each result Document as we get it 
     421    collection.aggregate(Arrays.asList( 
    404422         match(andQuery), 
    405423         unwind("$geoLocationCountryCode"), 
    406424         group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
    407425         sort(BasicDBObject.parse("{count : -1}")) 
    408      )); 
    409  
    410     // should only have one doc 
    411     for (Document doc : output) { 
    412         //System.out.println(doc); 
    413         System.out.println(doc.toJson()); 
    414         // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
    415         //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 
    416         /* 
    417         JsonWriterSettings writeSettings = new JsonWriterSettings(); 
    418         writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();         
    419         writer.write(doc.toJson(writeSettings) + NEWLINE); 
    420         */ 
    421         writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); 
    422     } 
     426     )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
     427 
     428    // should only have one doc for NZ since it's a count by geolocation. 
    423429 
    424430    return; 
     
    426432     
    427433    /** 
    428        RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA: 
    429  
    430        https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 
    431        https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
    432        Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 
    433  
    434        (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 
    435        https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 
    436        On using group(TExpression) inside collection.aggregate(). 
    437  
    438         
    439        The aggregate() we want to run: 
    440  
     434     * The aggregate() we want to run this time: 
     435     * 
    441436       db.Websites.aggregate([ 
    442437       { 
     
    460455     { $sort : { count : -1} } 
    461456    ]); 
    462  
    463  
    464     https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line 
    465457    */ 
    466     public void aggregateContainsMRIForOverseas(Writer writer) throws IOException { 
     458    public void aggregateContainsMRIForOverseas(Writer writer) throws UncheckedIOException { 
    467459    // working with the WebSites collection, not WebPages collection! 
    468460    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
    469461 
    470     /*String matchQuery = 
    471     "$and: [" 
    472         + "{geoLocationCountryCode: {$ne: \"NZ\"}}," 
    473         + "{domain: {$not: /\\.nz/}}," 
    474         + "{numPagesContainingMRI: {$gt: 0}}," 
    475         + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}" 
    476             + "]";*/ 
    477  
    478  
    479      
    480462     
    481463    Bson orQuery = or( 
     
    488470        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
    489471        orQuery); 
    490      
     472 
     473 
     474    collection.aggregate(Arrays.asList( 
     475         match(andQuery),  //match(BasicDBObject.parse(matchQuery)) 
     476         // match((List<DBObject>)JSON.parse(matchQuery)), 
     477         unwind("$geoLocationCountryCode"), 
     478         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
     479         sort(BasicDBObject.parse("{count : -1}")) 
     480       )).forEach((Block<Document>)doc -> writeDoc(doc, writer)); 
     481 
     482    // casting to Block<Document> necessary because otherwise we see the error at 
     483    // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java 
     484 
     485    // Less efficient way is to keep all the results in memory and then 
     486    // write them out one at a time 
     487    /* 
    491488    AggregateIterable<Document> output 
    492489        = collection.aggregate(Arrays.asList( 
     
    497494         sort(BasicDBObject.parse("{count : -1}")) 
    498495     )); 
     496 
    499497     
    500498    for (Document doc : output) { 
    501499        //System.out.println(doc); 
    502500        System.out.println(doc.toJson()); 
    503         // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
    504         //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE); 
    505         /* 
    506         JsonWriterSettings writeSettings = new JsonWriterSettings(); 
    507         writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 
    508         writer.write(doc.toJson(writeSettings) + NEWLINE); 
    509         */ 
    510         writer.write(prettyPrintJson(doc.toJson()) + NEWLINE); 
    511     } 
    512  
     501         
     502    } 
     503    */ 
    513504    return; 
    514505    } 
    515506 
    516  
     507    /** 
     508     * called by lambda forEach() call on Document objects to write them out to a file. 
     509     * Have to deal with unreported exceptions here that can't be dealt with when doing 
     510     * the actual forEach(). See 
     511     * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach 
     512     */ 
     513     
     514    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException { 
     515    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE); 
     516    // Can't control json output to add newlines after each array element, 
     517    // no matter which JsonMode is used. 
     518     
     519    // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html 
     520    // Still can't control array element output, 
     521    // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too: 
     522    //JsonWriterSettings writeSettings = new JsonWriterSettings(); 
     523    //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build(); 
     524    //writer.write(doc.toJson(writeSettings) + NEWLINE); 
     525 
     526    // Not the JsonWriter of mongodb java driver: 
     527    // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line 
     528     
     529    // Have to use gson's pretty print to produce a json string that contains 
     530    // newlines after every array element in the json: 
     531    String jsonStr = prettyPrintJson(doc.toJson()); 
     532    System.err.println(jsonStr); 
     533    try { 
     534        writer.write(jsonStr + NEWLINE); 
     535    } catch (IOException ex) { 
     536        //throw ex; 
     537        throw new UncheckedIOException(ex); 
     538    }        
     539    } 
    517540    public String prettyPrintJson(String jsonStr) { 
    518541    Gson gson = new GsonBuilder().setPrettyPrinting().create();