Changeset 33869


Ignore:
Timestamp:
2020-01-23T22:59:46+13:00 (4 years ago)
Author:
ak19
Message:

First cut at the RandomURLsForDomainGenerator.java class and the mongodb method it needs added to MongoDBAccess. Still need to generate a domainURLs file to start testing whether the code even works. But at least it finally compiles.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33867 r33869  
    601601    try {
    602602        File countsFile = new File(args[0]);
    603        
     603        if(!countsFile.exists()) {
     604        System.err.println("File " + countsFile + " does not exist");
     605        System.exit(-1);
     606        }
     607       
    604608        CountryCodeCountsMapData mapData = new CountryCodeCountsMapData(args[0]);
    605609
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33653 r33869  
    22
    33//import org.bson.BSONObject;
    4    
     4
    55import com.mongodb.client.MongoCollection;
    66import com.mongodb.client.MongoDatabase;
    77//import com.mongodb.client.MongoIterable;
     8
     9// to use collection.find() filters like eq(), regex() etc
     10import static com.mongodb.client.model.Filters.*;
     11// to use collection.find().projection() filters like include() etc
     12import static com.mongodb.client.model.Projections.*;
     13
     14//import org.bson.conversions.Bson;
    815import com.mongodb.BasicDBObject;
    916import com.mongodb.MongoClient;
     
    1118import com.mongodb.ServerAddress;
    1219import com.mongodb.MongoClientOptions;
     20
     21import com.mongodb.Block;
    1322
    1423import org.bson.Document;
     
    2029import java.util.List;
    2130import java.util.Properties;
    22 
     31import java.util.regex.Pattern;
    2332
    2433import org.apache.log4j.Logger;
     
    5867   
    5968    static final String PROPS_FILENAME = "config.properties";
    60     public static final String WEBPAGES_COLLECTION = "webpages";
    61     public static final String WEBSITES_COLLECTION = "websites";   
     69    public static final String WEBPAGES_COLLECTION = "Webpages";
     70    public static final String WEBSITES_COLLECTION = "Websites";   
    6271
    6372    // configuration details, some with fallback values
     
    256265    */
    257266
     267    /**
     268     * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
     269     * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
     270     * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
     271     * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
     272*/
     273    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
     274   
     275    final ArrayList<String> urlsList = new ArrayList<String>();
     276
     277    // load the "webpages" db table
     278    // in mongodb, the equivalent of db tables are called 'collections'
     279   
     280   
     281    //Pattern pattern = Pattern.compile(".*"+domain+".*");
     282    String pattern = "/"+domain.replace(".", "\\.")+"/";
     283   
     284    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
     285
     286
     287    Block<Document> storeURL = new Block<Document>() {
     288        @Override
     289        public void apply(final Document document) {
     290            //System.out.println(document.toJson());
     291            String url = document.getString("URL");
     292            // add to our urlsList
     293            urlsList.add(url);
     294        }
     295        };
     296
     297
     298   
     299    // do mongodb query:
     300    // test example:
     301    //collection.find(eq("isMRI", true)).first();
     302    //
     303    // db.getCollection('Webpages').find({URL:/.*domain.*/, isMRI: true}, {URL: 1, _id: 0})
     304    collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
     305
     306   
     307    return urlsList;
     308    }
     309   
    258310    /** Currently a no-op. See https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
    259311    public void close() {}
Note: See TracChangeset for help on using the changeset viewer.