Changeset 33876


Ignore:
Timestamp:
2020-01-29T21:48:52+13:00 (4 years ago)
Author:
ak19
Message:

Some missteps, but have got complex collection.aggregate() working at last.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33871 r33876  
    33//import org.bson.BSONObject;
    44
     5import com.mongodb.client.AggregateIterable;
    56import com.mongodb.client.MongoCollection;
    67import com.mongodb.client.MongoDatabase;
     
    1112// to use collection.find().projection() filters like include() etc
    1213import static com.mongodb.client.model.Projections.*;
     14// to use aggregation functions like unwind(), match(), sort() etc
     15import static com.mongodb.client.model.Aggregates.*;
     16// to use functions like sum() and addToSet() within aggregation functions
     17import static com.mongodb.client.model.Accumulators.*;
    1318
    1419//import org.bson.conversions.Bson;
     
    2227
    2328import org.bson.Document;
     29import org.bson.conversions.Bson;
     30   
     31import com.mongodb.util.JSON;
     32//import com.mongodb.DBObject;
    2433
    2534import java.io.BufferedReader;
    2635import java.io.File;
    2736import java.io.FileReader;
     37import java.util.Arrays;
    2838import java.util.ArrayList;
    2939import java.util.List;
     
    6878    static final String PROPS_FILENAME = "config.properties";
    6979    public static final String WEBPAGES_COLLECTION = "Webpages";
    70     public static final String WEBSITES_COLLECTION = "Websites";   
     80    public static final String WEBSITES_COLLECTION = "Websites";
     81   
     82    /** mongodb filter types to execute */
     83    public static final int IS_MRI = 0;
     84    public static final int CONTAINS_MRI = 1;
    7185
    7286    // configuration details, some with fallback values
     
    264278    }
    265279    */
    266 
     280   
     281    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     282    return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
     283    }
     284    public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
     285    return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
     286    }
     287   
    267288    /**
    268289     * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
     
    275296     * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
    276297     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
    277 */
    278     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     298     */
     299    public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
    279300   
    280301    final ArrayList<String> urlsList = new ArrayList<String>();
     
    300321        };
    301322
    302 
    303323   
    304324    // Run the following mongodb query:
     
    309329
    310330    // 2. Another way:
    311     String query = "{URL: /DOMAIN/, isMRI: true}";
     331    //String query = "{URL: /DOMAIN/, isMRI: true}";
     332    String query = "{URL: /DOMAIN/, ";
     333    if(filterType == IS_MRI) {
     334        query += "isMRI: true}";
     335    } else if(filterType == CONTAINS_MRI) {
     336        query += "containsMRI: true}";
     337    }
     338   
    312339    domain = domain.replace(".", "\\."); // escape dots in domain for regex
    313340    query = query.replace("DOMAIN", domain);
     
    324351    }
    325352
    326    
     353    /**
     354
     355       db.Websites.aggregate([
     356    {
     357        $match: {
     358            $and: [
     359                {geoLocationCountryCode: {$ne: "NZ"}},
     360                {domain: {$not: /\.nz/}},
     361                {numPagesContainingMRI: {$gt: 0}},
     362                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}           
     363            ]
     364        }
     365    },
     366    { $unwind: "$geoLocationCountryCode" },
     367    {
     368        $group: {
     369            _id: {$toLower: '$geoLocationCountryCode'},
     370            count: { $sum: 1 },
     371            domain: { $addToSet: '$domain' }
     372        }
     373    },
     374    { $sort : { count : -1} }
     375]);
     376
     377       https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
     378       https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
     379       Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
     380
     381       (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
     382       https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
     383       On using group(TExpression) inside collection.aggregate().
     384    */
     385    public String aggregateContainsMRIForOverseas() {
     386    // working with the Websites collection, not the Webpages collection!
     387    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
     388
     389    /*String matchQuery =
     390    "$and: ["
     391        + "{geoLocationCountryCode: {$ne: \"NZ\"}},"
     392        + "{domain: {$not: /\\.nz/}},"
     393        + "{numPagesContainingMRI: {$gt: 0}},"
     394        + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}"
     395            + "]";*/
     396
     397
     398   
     399   
     400    Bson orQuery = or(
     401              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
     402              BasicDBObject.parse("{urlContainsLangCodeInPath: false}")
     403              );
     404    Bson andQuery = and(
     405        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
     406        BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
     407        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"),
     408        orQuery);
     409   
     410    AggregateIterable<Document> output
     411        = collection.aggregate(Arrays.asList(
     412         match(andQuery),  //match(BasicDBObject.parse(matchQuery))
     413         // match((List<DBObject>)JSON.parse(matchQuery)),
     414         unwind("$geoLocationCountryCode"),
     415         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
     416         sort(BasicDBObject.parse("{count : -1}"))
     417                         ));
     418   
     419    for (Document doc : output) {
     420        //System.out.println(doc);
     421        System.out.println(doc.toJson());
     422    }
     423
     424    return "";
     425    }
    327426   
    328427    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33873 r33876  
    1818 * TO RUN:
    1919 *    maori-lang-detection/src$
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt  255
     20 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
    2121 *
    2222*/
     
    2525   
    2626    private final MongoDBAccess mongodbAccess;
    27     private final int numURLs;
    28 
    29     //private String[] urls;
    30     ArrayList<String> urlsList = new ArrayList<String>();
     27    private int numURLs;
     28    private File domainsFile;
     29   
     30    public WebPageURLsListing(MongoDBAccess mongodbAccess,
     31                    File domainsFile)
     32    {
     33    this.mongodbAccess = mongodbAccess;
     34    this.domainsFile = domainsFile;
     35    }
    3136   
    3237    public WebPageURLsListing(MongoDBAccess mongodbAccess,
     
    3439                    int numURLs)
    3540    {
    36     this.mongodbAccess = mongodbAccess;
     41    this(mongodbAccess, domainsFile);
    3742    this.numURLs = numURLs;
     43    }
    3844
     45    public String produceURLsForPagesInMRI() {
     46    return writeFile(MongoDBAccess.IS_MRI);
     47    }
     48   
     49    public String produceURLsForPagesContainingMRI() {
     50    return writeFile(MongoDBAccess.CONTAINS_MRI);
     51    }
     52
     53
     54    public String writeFile(int filterType) {
     55
     56    ArrayList<String> urlsList = new ArrayList<String>();
     57   
    3958    // 1. read each domain from the domainsFile
    40       // 1a. do the query
    41       // 1b. add the arraylist result to urls
    42 
     59    // 1a. do the query
     60    // 1b. add the arraylist result to urls
     61   
    4362    try (
    4463         BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
     
    5170        domain = domain.trim();
    5271        if(!domain.equals("")) {
    53             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingIsMRIURLs(domain);
     72            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
    5473            urlsList.addAll(moreURLs);
    5574        }
     
    6382    // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
    6483    File parentFolder = domainsFile.getParentFile();
    65     String fileName = domainsFile.getName();
    66     //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);
    67     File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);
     84    //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
     85    String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
     86    File outFile = new File(parentFolder, fileName+domainsFile.getName());
    6887
    6988    // write out ALL the URLs
    7089    try (
    71          Writer writer = new BufferedWriter(new FileWriter(fullSetOutFile));
     90         Writer writer = new BufferedWriter(new FileWriter(outFile));
    7291         ) {
    7392
     
    7897        }
    7998    } catch(Exception e) {
    80         logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath());
     99        logger.error("Unable to write to file " + outFile.getAbsolutePath());
    81100        logger.error(e.getMessage(), e);
    82101    }
     
    98117    }
    99118    */
     119
     120    return outFile.getAbsolutePath();
    100121    }
     122   
    101123   
    102124    public static void printUsage() {
     
    107129    // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
    108130
    109    
    110131   
    111132    public static void main(String args[]) {
     
    124145        }
    125146
    126         int genNumURLs = Integer.parseInt(args[1]);
     147        //int genNumURLs = Integer.parseInt(args[1]);
    127148
    128149        mongodb.connectToDB();
    129150       
    130151        WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
     152        //String isMRIFile = listing.produceURLsForPagesInMRI();
     153        //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
     154        mongodb.aggregateContainsMRIForOverseas();
    131155       
    132156    } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.