Changeset 33869

Show
Ignore:
Timestamp:
23.01.2020 22:59:46 (4 weeks ago)
Author:
ak19
Message:

First cut at the RandomURLsForDomainGenerator.java class, plus the MongoDB query method it needs, which has been added to MongoDBAccess. A domainURLs file still needs to be generated before testing whether the code even works, but at least it finally compiles.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33867 r33869  
    601601    try { 
    602602        File countsFile = new File(args[0]); 
    603          
     603        if(!countsFile.exists()) { 
     604        System.err.println("File " + countsFile + " does not exist"); 
     605        System.exit(-1); 
     606        } 
     607         
    604608        CountryCodeCountsMapData mapData = new CountryCodeCountsMapData(args[0]); 
    605609 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33653 r33869  
    22 
    33//import org.bson.BSONObject; 
    4      
     4 
    55import com.mongodb.client.MongoCollection;  
    66import com.mongodb.client.MongoDatabase;  
    77//import com.mongodb.client.MongoIterable; 
     8 
     9// to use collection.find() filters like eq(), regex() etc 
     10import static com.mongodb.client.model.Filters.*; 
     11// to use collection.find().projection() filters like include() etc 
     12import static com.mongodb.client.model.Projections.*; 
     13 
     14//import org.bson.conversions.Bson; 
    815import com.mongodb.BasicDBObject; 
    916import com.mongodb.MongoClient;  
     
    1118import com.mongodb.ServerAddress; 
    1219import com.mongodb.MongoClientOptions; 
     20 
     21import com.mongodb.Block; 
    1322 
    1423import org.bson.Document; 
     
    2029import java.util.List; 
    2130import java.util.Properties; 
    22  
     31import java.util.regex.Pattern; 
    2332 
    2433import org.apache.log4j.Logger; 
     
    5867     
    5968    static final String PROPS_FILENAME = "config.properties"; 
    60     public static final String WEBPAGES_COLLECTION = "webpages"; 
    61     public static final String WEBSITES_COLLECTION = "websites";     
     69    public static final String WEBPAGES_COLLECTION = "Webpages"; 
     70    public static final String WEBSITES_COLLECTION = "Websites";     
    6271 
    6372    // configuration details, some with fallback values 
     
    256265    */ 
    257266 
     267    /**  
     268     * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/  
     269     * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html 
     270     * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection 
     271     * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find 
     272*/ 
     273    public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) { 
     274     
     275    final ArrayList<String> urlsList = new ArrayList<String>(); 
     276 
     277    // load the "webpages" db table 
     278    // in mongodb, the equivalent of db tables are called 'collections' 
     279     
     280     
     281    //Pattern pattern = Pattern.compile(".*"+domain+".*"); 
     282    String pattern = "/"+domain.replace(".", "\\.")+"/"; 
     283     
     284    MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION); 
     285 
     286 
     287    Block<Document> storeURL = new Block<Document>() { 
     288        @Override 
     289        public void apply(final Document document) { 
     290            //System.out.println(document.toJson()); 
     291            String url = document.getString("URL"); 
     292            // add to our urlsList 
     293            urlsList.add(url); 
     294        } 
     295        }; 
     296 
     297 
     298     
     299    // do mongodb query: 
     300    // test example: 
     301    //collection.find(eq("isMRI", true)).first(); 
     302    // 
     303    // db.getCollection('Webpages').find({URL:/.*domain.*/, isMRI: true}, {URL: 1, _id: 0}) 
     304    collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL); 
     305 
     306     
     307    return urlsList; 
     308    } 
     309     
    258310    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
    259311    public void close() {}