Ignore:
Timestamp:
2020-01-29T21:48:52+13:00 (4 years ago)
Author:
ak19
Message:

Some missteps, but have got complex collection.aggregate() working at last.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33871 r33876  
    33//import org.bson.BSONObject;
    44
     5import com.mongodb.client.AggregateIterable;
    56import com.mongodb.client.MongoCollection;
    67import com.mongodb.client.MongoDatabase;
     
    1112// to use collection.find().projection() filters like include() etc
    1213import static com.mongodb.client.model.Projections.*;
     14// to use aggregation functions like unwind(), match(), sort() etc
     15import static com.mongodb.client.model.Aggregates.*;
     16// to use functions like sum() and addToSet() within aggregation functions
     17import static com.mongodb.client.model.Accumulators.*;
    1318
    1419//import org.bson.conversions.Bson;
     
    2227
    2328import org.bson.Document;
     29import org.bson.conversions.Bson;
     30   
     31import com.mongodb.util.JSON;
     32//import com.mongodb.DBObject;
    2433
    2534import java.io.BufferedReader;
    2635import java.io.File;
    2736import java.io.FileReader;
     37import java.util.Arrays;
    2838import java.util.ArrayList;
    2939import java.util.List;
     
    6878    static final String PROPS_FILENAME = "config.properties";
    6979    public static final String WEBPAGES_COLLECTION = "Webpages";
    70     public static final String WEBSITES_COLLECTION = "Websites";   
     80    public static final String WEBSITES_COLLECTION = "Websites";
     81   
     82    /** mongodb filter types to execute */
     83    public static final int IS_MRI = 0;
     84    public static final int CONTAINS_MRI = 1;
    7185
    7286    // configuration details, some with fallback values
     
    264278    }
    265279    */
    266 
     280   
     281    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     282    return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
     283    }
     284    public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
     285    return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
     286    }
     287   
    267288    /**
    268289     * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
     
    275296     * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
    276297     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
    277 */
    278     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     298     */
     299    public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
    279300   
    280301    final ArrayList<String> urlsList = new ArrayList<String>();
     
    300321        };
    301322
    302 
    303323   
    304324    // Run the following mongodb query:
     
    309329
    310330    // 2. Another way:
    311     String query = "{URL: /DOMAIN/, isMRI: true}";
     331    //String query = "{URL: /DOMAIN/, isMRI: true}";
     332    String query = "{URL: /DOMAIN/, ";
     333    if(filterType == IS_MRI) {
     334        query += "isMRI: true}";
     335    } else if(filterType == CONTAINS_MRI) {
     336        query += "containsMRI: true}";
     337    }
     338   
    312339    domain = domain.replace(".", "\\."); // escape dots in domain for regex
    313340    query = query.replace("DOMAIN", domain);
     
    324351    }
    325352
    326    
     353    /**
     354
     355       db.Websites.aggregate([
     356    {
     357        $match: {
     358            $and: [
     359                {geoLocationCountryCode: {$ne: "NZ"}},
     360                {domain: {$not: /\.nz/}},
     361                {numPagesContainingMRI: {$gt: 0}},
     362                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     363            ]
     364        }
     365    },
     366    { $unwind: "$geoLocationCountryCode" },
     367    {
     368        $group: {
     369            _id: {$toLower: '$geoLocationCountryCode'},
     370            count: { $sum: 1 },
     371            domain: { $addToSet: '$domain' }
     372        }
     373    },
     374    { $sort : { count : -1} }
     375]);
     376
     377       https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
     378       https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
     379       Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
     380
     381       (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
     382       https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
     383       On using group(TExpression) inside collection.aggregate().
     384    */
     385    public String aggregateContainsMRIForOverseas() {
     386    // working with the WebSites collection, not WebPages collection!
     387    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     388
     389    /*String matchQuery =
     390    "$and: ["
     391        + "{geoLocationCountryCode: {$ne: \"NZ\"}},"
     392        + "{domain: {$not: /\\.nz/}},"
     393        + "{numPagesContainingMRI: {$gt: 0}},"
     394        + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}"
     395            + "]";*/
     396
     397
     398   
     399   
     400    Bson orQuery = or(
     401              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
     402              BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
     403              );
     404    Bson andQuery = and(
     405        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
     406        BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
     407        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
     408        orQuery);
     409   
     410    AggregateIterable<Document> output
     411        = collection.aggregate(Arrays.asList(
     412         match(andQuery),  //match(BasicDBObject.parse(matchQuery))
     413         // match((List<DBObject>)JSON.parse(matchQuery)),
     414         unwind("$geoLocationCountryCode"),
     415         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     416         sort(BasicDBObject.parse("{count : -1}"))
     417                         ));
     418   
     419    for (Document doc : output) {
     420        //System.out.println(doc);
     421        System.out.println(doc.toJson());
     422    }
     423
     424    return "";
     425    }
    327426   
    328427    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
Note: See TracChangeset for help on using the changeset viewer.