Changeset 33881 for other-projects


Ignore:
Timestamp:
2020-01-30T22:08:00+13:00 (4 years ago)
Author:
ak19
Message:

Uses a lambda expression to process each Document in a MongoDB aggregate result. Hopefully this means each result Document is processed as it is obtained, which would be more efficient, rather than all the results still being stored first and then processed; the earlier store-then-process code is the more legible of the two.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33880 r33881  
    4242import java.io.FileReader;
    4343import java.io.IOException;
     44import java.io.UncheckedIOException;
    4445import java.io.Writer;
    4546
     
    321322
    322323    // code we'll execute in Iterable.forEach() below
     324    // see also https://www.baeldung.com/foreach-java
    323325    Block<Document> storeURL = new Block<Document>() {
    324326        @Override
     
    362364    }
    363365
    364     /**       
    365        The mongodb aggregate() we want to run this time:
    366 
     366    /**     
     367     * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
     368     *
     369     * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
     370     * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
     371     * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
     372     *
     373     * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
     374     * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
     375     * On using group(TExpression) inside collection.aggregate().
     376     *
     377     *  For forEach lambda expressions, see also https://www.baeldung.com/foreach-java
     378     *  and https://www.javatpoint.com/java-8-foreach
     379     *  and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
     380     *
     381     *
     382     * The mongodb aggregate() we want to run this time:
     383     *
    367384       db.Websites.aggregate([
    368385       {
     
    400417        orQuery);
    401418   
    402     AggregateIterable<Document> output
    403         = collection.aggregate(Arrays.asList(
     419    // Hopefully the lambda expression (forEach()) at end means
     420    // we write out each result Document as we get it
     421    collection.aggregate(Arrays.asList(
    404422         match(andQuery),
    405423         unwind("$geoLocationCountryCode"),
    406424         group("NZ", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
    407425         sort(BasicDBObject.parse("{count : -1}"))
    408      ));
    409 
    410     // should only have one doc
    411     for (Document doc : output) {
    412         //System.out.println(doc);
    413         System.out.println(doc.toJson());
    414         // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
    415         //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE);
    416         /*
    417         JsonWriterSettings writeSettings = new JsonWriterSettings();
    418         writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();       
    419         writer.write(doc.toJson(writeSettings) + NEWLINE);
    420         */
    421         writer.write(prettyPrintJson(doc.toJson()) + NEWLINE);
    422     }
     426     )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     427
     428    // should only have one doc for NZ since it's a count by geolocation.
    423429
    424430    return;
     
    426432   
    427433    /**
    428        RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
    429 
    430        https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
    431        https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
    432        Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
    433 
    434        (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
    435        https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
    436        On using group(TExpression) inside collection.aggregate().
    437 
    438        
    439        The aggregate() we want to run:
    440 
     434     * The aggregate() we want to run this time:
     435     *
    441436       db.Websites.aggregate([
    442437       {
     
    460455     { $sort : { count : -1} }
    461456    ]);
    462 
    463 
    464     https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
    465457    */
    466     public void aggregateContainsMRIForOverseas(Writer writer) throws IOException {
     458    public void aggregateContainsMRIForOverseas(Writer writer) throws UncheckedIOException {
    467459    // working with the WebSites collection, not WebPages collection!
    468460    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
    469461
    470     /*String matchQuery =
    471     "$and: ["
    472         + "{geoLocationCountryCode: {$ne: \"NZ\"}},"
    473         + "{domain: {$not: /\\.nz/}},"
    474         + "{numPagesContainingMRI: {$gt: 0}},"
    475         + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}"
    476             + "]";*/
    477 
    478 
    479    
    480462   
    481463    Bson orQuery = or(
     
    488470        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
    489471        orQuery);
    490    
     472
     473
     474    collection.aggregate(Arrays.asList(
     475         match(andQuery),  //match(BasicDBObject.parse(matchQuery))
     476         // match((List<DBObject>)JSON.parse(matchQuery)),
     477         unwind("$geoLocationCountryCode"),
     478         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     479         sort(BasicDBObject.parse("{count : -1}"))
     480       )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
     481
     482    // casting to Block<Document> necessary because otherwise we see the error at
     483    // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
     484
     485    // Less efficient way is to keep all the results in memory and then
     486    // write them out one at a time
     487    /*
    491488    AggregateIterable<Document> output
    492489        = collection.aggregate(Arrays.asList(
     
    497494         sort(BasicDBObject.parse("{count : -1}"))
    498495     ));
     496
    499497   
    500498    for (Document doc : output) {
    501499        //System.out.println(doc);
    502500        System.out.println(doc.toJson());
    503         // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
    504         //writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true/*indent*/)) + NEWLINE);
    505         /*
    506         JsonWriterSettings writeSettings = new JsonWriterSettings();
    507         writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
    508         writer.write(doc.toJson(writeSettings) + NEWLINE);
    509         */
    510         writer.write(prettyPrintJson(doc.toJson()) + NEWLINE);
    511     }
    512 
     501       
     502    }
     503    */
    513504    return;
    514505    }
    515506
    516 
     507    /**
     508     * called by lambda forEach() call on Document objects to write them out to a file.
     509     * Have to deal with unreported exceptions here that can't be dealt with when doing
     510     * the actual forEach(). See
     511     * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
     512     */
     513   
     514    public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
     515    //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
     516    // Can't control json output to add newlines after each array element,
     517    // no matter which JsonMode is used.
     518   
     519    // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
     520    // Still can't control array element output,
     521    // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
     522    //JsonWriterSettings writeSettings = new JsonWriterSettings();
     523    //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
     524    //writer.write(doc.toJson(writeSettings) + NEWLINE);
     525
     526    // Not the JsonWriter of mongodb java driver:
     527    // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
     528   
     529    // Have to use gson's pretty print to produce a json string that contains
     530    // newlines after every array element in the json:
     531    String jsonStr = prettyPrintJson(doc.toJson());
     532    System.err.println(jsonStr);
     533    try {
     534        writer.write(jsonStr + NEWLINE);
     535    } catch (IOException ex) {
     536        //throw ex;
     537        throw new UncheckedIOException(ex);
     538    }       
     539    }
    517540    public String prettyPrintJson(String jsonStr) {
    518541    Gson gson = new GsonBuilder().setPrettyPrinting().create();
Note: See TracChangeset for help on using the changeset viewer.