Changeset 33876

Show
Ignore:
Timestamp:
29.01.2020 21:48:52 (3 weeks ago)
Author:
ak19
Message:

Some missteps, but have got complex collection.aggregate() working at last.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33871 r33876  
    33//import org.bson.BSONObject; 
    44 
     5import com.mongodb.client.AggregateIterable; 
    56import com.mongodb.client.MongoCollection;  
    67import com.mongodb.client.MongoDatabase;  
     
    1112// to use collection.find().projection() filters like include() etc 
    1213import static com.mongodb.client.model.Projections.*; 
     14// to use aggregation functions like unwind(), match(), sort() etc 
     15import static com.mongodb.client.model.Aggregates.*; 
     16// to use functions like sum() and addToSet() within aggregation functions 
     17import static com.mongodb.client.model.Accumulators.*; 
    1318 
    1419//import org.bson.conversions.Bson; 
     
    2227 
    2328import org.bson.Document; 
     29import org.bson.conversions.Bson; 
     30     
     31import com.mongodb.util.JSON; 
     32//import com.mongodb.DBObject; 
    2433 
    2534import java.io.BufferedReader; 
    2635import java.io.File; 
    2736import java.io.FileReader; 
     37import java.util.Arrays; 
    2838import java.util.ArrayList; 
    2939import java.util.List; 
     
    6878    static final String PROPS_FILENAME = "config.properties"; 
    6979    public static final String WEBPAGES_COLLECTION = "Webpages"; 
    70     public static final String WEBSITES_COLLECTION = "Websites";     
     80    public static final String WEBSITES_COLLECTION = "Websites"; 
     81     
     82    /** mongodb filter types to execute */ 
     83    public static final int IS_MRI = 0; 
     84    public static final int CONTAINS_MRI = 1; 
    7185 
    7286    // configuration details, some with fallback values 
     
    264278    } 
    265279    */ 
    266  
     280     
     281    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
            // Convenience wrapper: runs queryAllMatchingURLsFilteredBy() for the
            // given domain with the IS_MRI filter type and returns its result as-is.
     282    return queryAllMatchingURLsFilteredBy(domain, IS_MRI); 
     283    } 
     284    public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) { 
            // Convenience wrapper: runs queryAllMatchingURLsFilteredBy() for the
            // given domain with the CONTAINS_MRI filter type and returns its result as-is.
     285    return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI); 
     286    } 
     287     
    267288    /**  
    268289     * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/  
     
    275296     * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java 
    276297     * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/ 
    277 */ 
    278     public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
     298     */ 
     299    public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) { 
    279300     
    280301    final ArrayList<String> urlsList = new ArrayList<String>(); 
     
    300321        }; 
    301322 
    302  
    303323     
    304324    // Run the following mongodb query: 
     
    309329 
    310330    // 2. Another way: 
    311     String query = "{URL: /DOMAIN/, isMRI: true}"; 
     331    //String query = "{URL: /DOMAIN/, isMRI: true}"; 
     332    String query = "{URL: /DOMAIN/, "; 
     333    if(filterType == IS_MRI) { 
     334        query += "isMRI: true}"; 
     335    } else if(filterType == CONTAINS_MRI) { 
     336        query += "containsMRI: true}"; 
     337    } 
     338     
    312339    domain = domain.replace(".", "\\."); // escape dots in domain for regex 
    313340    query = query.replace("DOMAIN", domain); 
     
    324351    } 
    325352 
    326      
     353    /** 
     354 
     355       db.Websites.aggregate([ 
     356    { 
     357        $match: { 
     358            $and: [ 
     359                {geoLocationCountryCode: {$ne: "NZ"}}, 
     360                {domain: {$not: /\.nz/}}, 
     361                {numPagesContainingMRI: {$gt: 0}}, 
     362                {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}             
     363            ] 
     364        } 
     365    }, 
     366    { $unwind: "$geoLocationCountryCode" }, 
     367    { 
     368        $group: { 
     369            _id: {$toLower: '$geoLocationCountryCode'}, 
     370            count: { $sum: 1 }, 
     371            domain: { $addToSet: '$domain' } 
     372        } 
     373    }, 
     374    { $sort : { count : -1} } 
     375]); 
     376 
     377       https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver 
     378       https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria 
     379       Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo 
     380 
     381       (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java) 
     382       https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates 
     383       On using group(TExpression) inside collection.aggregate(). 
     384    */ 
     385    public String aggregateContainsMRIForOverseas() { 
            // Runs an aggregation pipeline (match, unwind, group, sort) against the
            // WEBSITES_COLLECTION collection and prints each resulting document to
            // stdout as JSON. Currently always returns the empty string.
            //
            // NOTE(review): or()/and() are presumably static imports from
            // com.mongodb.client.model.Filters, and BasicDBObject presumably comes
            // from com.mongodb; neither import is visible in this excerpt - confirm.
     386    // working with the Websites collection, not the Webpages collection! 
     387    MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION); 
     388 
            // Earlier attempt, kept for reference: the whole $match expression as one string.
     389    /*String matchQuery = 
     390    "$and: [" 
     391        + "{geoLocationCountryCode: {$ne: \"NZ\"}}," 
     392        + "{domain: {$not: /\\.nz/}}," 
     393        + "{numPagesContainingMRI: {$gt: 0}}," 
     394        + "{$or: [{geoLocationCountryCode: \"AU\"}, {urlContainsLangCodeInPath: false}]}" 
     395            + "]";*/ 
     396 
     397 
     398     
     399     
            // Either condition suffices: country code is "AU", or urlContainsLangCodeInPath is false.
     400    Bson orQuery = or( 
     401              BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"), 
     402              BasicDBObject.parse("{urlContainsLangCodeInPath: false}") 
     403              ); 
            // All must hold: country code is not "NZ", domain does not match /\.nz/,
            // numPagesContainingMRI is greater than 0, and the OR condition above.
     404    Bson andQuery = and( 
     405        BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"), 
     406        BasicDBObject.parse("{domain: {$not: /\\.nz/}}"), 
     407        BasicDBObject.parse("{numPagesContainingMRI: {$gt: 0}}"), 
     408        orQuery); 
     409     
            // Pipeline stages, in order: filter with andQuery; unwind geoLocationCountryCode;
            // group by geoLocationCountryCode, counting 1 per document into "count" and
            // collecting the distinct domain values into "domain"; sort by count with
            // direction -1 (descending).
            // NOTE(review): the shell query documented before this method groups on
            // {$toLower: '$geoLocationCountryCode'}, whereas this groups on the raw
            // value - confirm whether the case-folding was dropped intentionally.
     410    AggregateIterable<Document> output 
     411        = collection.aggregate(Arrays.asList( 
     412         match(andQuery),  //match(BasicDBObject.parse(matchQuery)) 
     413         // match((List<DBObject>)JSON.parse(matchQuery)), 
     414         unwind("$geoLocationCountryCode"), 
     415         group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))), 
     416         sort(BasicDBObject.parse("{count : -1}")) 
     417                         )); 
     418     
            // Print every aggregated result document as a JSON string.
     419    for (Document doc : output) { 
     420        //System.out.println(doc); 
     421        System.out.println(doc.toJson()); 
     422    } 
     423 
            // NOTE(review): the results are only printed above; the return value is a
            // constant empty string - presumably a placeholder, confirm with callers.
     424    return ""; 
     425    } 
    327426     
    328427    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java

    r33873 r33876  
    1818 * TO RUN: 
    1919 *    maori-lang-detection/src$ 
    20  *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt  255 
     20 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255 
    2121 * 
    2222*/ 
     
    2525     
    2626    private final MongoDBAccess mongodbAccess; 
    27     private final int numURLs; 
    28  
    29     //private String[] urls; 
    30     ArrayList<String> urlsList = new ArrayList<String>(); 
     27    private int numURLs; 
     28    private File domainsFile; 
     29     
      30    public WebPageURLsListing(MongoDBAccess mongodbAccess, 
      31                    File domainsFile) 
      32    { 
            // Stores the MongoDB access object and the domains file on this instance.
            // The numURLs field is not assigned by this constructor.
      33    this.mongodbAccess = mongodbAccess; 
      34    this.domainsFile = domainsFile; 
      35    } 
    3136     
    3237    public WebPageURLsListing(MongoDBAccess mongodbAccess, 
     
    3439                    int numURLs) 
    3540    { 
    36     this.mongodbAccess = mongodbAccess; 
     41    this(mongodbAccess, domainsFile); 
    3742    this.numURLs = numURLs; 
     43    } 
    3844 
      45    public String produceURLsForPagesInMRI() { 
            // Delegates to writeFile() with the MongoDBAccess.IS_MRI filter type and
            // returns whatever writeFile() returns.
      46    return writeFile(MongoDBAccess.IS_MRI); 
      47    } 
     48     
      49    public String produceURLsForPagesContainingMRI() { 
            // Delegates to writeFile() with the MongoDBAccess.CONTAINS_MRI filter type
            // and returns whatever writeFile() returns.
      50    return writeFile(MongoDBAccess.CONTAINS_MRI); 
      51    } 
     52 
     53 
     54    public String writeFile(int filterType) { 
     55 
     56    ArrayList<String> urlsList = new ArrayList<String>(); 
     57     
    3958    // 1. read each domain from the domainsFile 
    40       // 1a. do the query 
    41       // 1b. add the arraylist result to urls 
    42  
     59    // 1a. do the query 
     60    // 1b. add the arraylist result to urls 
     61     
    4362    try ( 
    4463         BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); 
     
    5170        domain = domain.trim(); 
    5271        if(!domain.equals("")) { 
    53             ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingIsMRIURLs(domain); 
     72            ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 
    5473            urlsList.addAll(moreURLs); 
    5574        } 
     
    6382    // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 
    6483    File parentFolder = domainsFile.getParentFile(); 
    65     String fileName = domainsFile.getName(); 
    66     //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName); 
    67     File fullSetOutFile = new File(parentFolder, "allPages_"+fileName); 
     84    //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); 
     85    String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 
     86    File outFile = new File(parentFolder, fileName+domainsFile.getName()); 
    6887 
    6988    // write out ALL the URLs 
    7089    try ( 
    71          Writer writer = new BufferedWriter(new FileWriter(fullSetOutFile)); 
     90         Writer writer = new BufferedWriter(new FileWriter(outFile)); 
    7291         ) { 
    7392 
     
    7897        } 
    7998    } catch(Exception e) { 
    80         logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath()); 
     99        logger.error("Unable to write to file " + outFile.getAbsolutePath()); 
    81100        logger.error(e.getMessage(), e); 
    82101    } 
     
    98117    } 
    99118    */ 
     119 
     120    return outFile.getAbsolutePath(); 
    100121    } 
     122     
    101123     
    102124    public static void printUsage() { 
     
    107129    // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ 
    108130 
    109      
    110131     
    111132    public static void main(String args[]) { 
     
    124145        } 
    125146 
    126         int genNumURLs = Integer.parseInt(args[1]); 
     147        //int genNumURLs = Integer.parseInt(args[1]); 
    127148 
    128149        mongodb.connectToDB(); 
    129150         
    130151        WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile); 
     152        //String isMRIFile = listing.produceURLsForPagesInMRI(); 
     153        //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 
     154        mongodb.aggregateContainsMRIForOverseas(); 
    131155         
    132156    } catch(Exception e) {