Changeset 33653


Ignore:
Timestamp:
2019-11-12T20:51:48+13:00 (4 years ago)
Author:
ak19
Message:
  1. As suggested by Dr Bainbridge, changed the code to use Morphia as the ODM (Object Document Mapper) for MongoDB; an ODM is to MongoDB what an ORM is to an RDBMS. 2. Added the jar files needed to get this to work. 3. Further changes to store site folder names of the form ##### as the primary key of the Websites collection. However, a future commit may instead store a reference to a WebsiteInfo object (representing a JSON document in the Websites MongoDB collection) inside a WebpageInfo object. 4. The MongoDB collections are now called Websites and Webpages, not websites and webpages. 5. The geolocation of a site is now stored as a field in the Websites MongoDB collection, and containsMRI is now stored as a field in the Webpages MongoDB collection. 6. Tried out some MongoDB query commands based on what Dr Bainbridge did yesterday.
Location:
other-projects/maori-lang-detection
Files:
4 added
2 edited
3 moved

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33646 r33653  
    316 316 db.getCollection('webpages').find({"isMRI": true, "singleSentences.langCode": "mri"})
    317 317 db.getCollection('webpages').find({"singleSentences": { $elemMatch: {"langCode":"mri"} } }, {"singleSentences.$": "mri"})
    318 
    319 
     318 db.getCollection('Webpages').find({"isMRI": true, "singleSentences": { $elemMatch: {"langCode":"eng"} } }, {"singleSentences.$": "eng"}) [single English lang sentence]
     319 db.getCollection('Webpages').find({"containsMRI": true, "singleSentences": { $elemMatch: {"langCode":"mri"} } }, {"singleSentences.$": "mri"}) [gets 1st sentence of docs which have sentences containing MRI]
    320320
    321321
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33652 r33653  
    165165    }
    166166   
    167    
     167    /*
    168168    public void insertWebsiteInfo(WebsiteInfo website)
    169169    {
     
    188188               + " inserted successfully into " + WEBSITES_COLLECTION); 
    189189    }
    190 
    191     /* TODO:
    192     https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
    193190    */
     191
    194192    /**
    195193     * Inserts a web page into the mongodb. Besides page related metadata and full body text
    196194     * the language information per sentence and per 2 adjacent sentences also get stored
    197195     * into the mongodb.
    198      */   
     196     */
     197    /*
    199198    public void insertWebpageInfo(WebpageInfo webpage)
    200199    {
     
    255254    logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION); 
    256255    }
     256    */
    257257
    258258    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33652 r33653  
    1 package org.greenstone.atea;
     1package org.greenstone.atea.morphia;
    22
     3import dev.morphia.annotations.*;
     4
     5@Entity("Sentences")
    36public class SentenceInfo {
    47    public final double confidenceLevel;
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebpageInfo.java

    r33652 r33653  
    1 package org.greenstone.atea;
     1package org.greenstone.atea.morphia;
    22
     3import dev.morphia.annotations.*;
    34import java.util.ArrayList;
     5import java.util.List;
    46
     7/**
     8 * Morphia provides the Object Document Mapper for MongoDB
     9 * https://www.baeldung.com/mongodb-morphia
     10 *
     11 */
     12@Entity("Webpages")
    513public class WebpageInfo {
    614
    7     private int mriSentenceCount;
    8    
    915    /** db table ids */
     16    @Id
    1017    public final long webpageID;
    11     public final int websiteID;
     18    // TODO: should this be a "Reference" to the WebsiteInfo object instead?
     19    // See section 5.2 of https://www.baeldung.com/mongodb-morphia
     20    public final String websiteID; //int websiteID;
    1221
    1322    public final int totalSentences;
     
    2029    public final String modifiedTime;
    2130    public final String fetchTime;
    22     public final ArrayList<SentenceInfo> singleSentences;
    23     public final ArrayList<SentenceInfo> overlappingSentences;
    24    
    25     public WebpageInfo (long webpageID, int websiteID,
     31
     32    @Embedded
     33    public final List<SentenceInfo> singleSentences;
     34    @Embedded
     35    public final List<SentenceInfo> overlappingSentences;
     36
     37    private int mriSentenceCount;   
     38    private boolean containsMRI;
     39   
     40    public WebpageInfo (long webpageID, String siteID/*int websiteID,*/,
    2641            String pageText, String pageURL, boolean isMRI, int totalSentences,
    2742            String charEncoding, String modifiedTime, String fetchTime,
    28             ArrayList<SentenceInfo> singleSentences,
    29             ArrayList<SentenceInfo> overlappingSentences)
     43            List<SentenceInfo> singleSentences,
     44            List<SentenceInfo> overlappingSentences)
    3045    {
    3146
    3247    this.webpageID = webpageID;
    33     this.websiteID = websiteID;
     48    //this.websiteID = websiteID;
     49    this.websiteID = siteID;
    3450
    3551    this.totalSentences = totalSentences;
     
    5167    this.mriSentenceCount = count;
    5268    }
     69    public void setContainsMRI(boolean containsMRI) {
     70    this.containsMRI = containsMRI;
     71    }
    5372
    5473    public int getMRISentenceCount() { return this.mriSentenceCount; }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33652 r33653  
    1 package org.greenstone.atea;
     1package org.greenstone.atea.morphia;
    22
     3import dev.morphia.annotations.*;
     4
     5@Entity("Websites")
    36public class WebsiteInfo {
    4 
    5     public final int id;
     7    //public final int id;
     8    @Id
    69    public final String siteFolderName;
    710    public final String domain;
     
    1821    public final boolean urlContainsLangCodeInpath;
    1922   
    20     public WebsiteInfo(int siteCount, String siteFolderName, String domainOfSite,
     23    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite,
    2124               int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI,
    2225               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl,
    2326               String geoLocationCountryCode, boolean urlContainsLangCodeInpath)
    2427    {
    25     this.id = siteCount;
     28    //this.id = siteCount;
    2629    this.siteFolderName = siteFolderName;
    2730    this.domain = domainOfSite;
Note: See TracChangeset for help on using the changeset viewer.