Changeset 33653

Show
Ignore:
Timestamp:
12.11.2019 20:51:48 (3 weeks ago)
Author:
ak19
Message:

1. As suggested by Dr Bainbridge, made the code changes to use Morphia as the ODM for MongoDB (Object Document Mapper; an ODM is to MongoDB what an ORM is to an RDBMS). 2. Added the jar files needed to get this to work. 3. Further changes to store site folder names of the form ##### as the primary key of the Websites collection. However, a future commit may store a reference to a WebsiteInfo object (representing a JSON document in the Websites MongoDB collection) inside a WebpageInfo object. 4. The MongoDB collections are now called Websites and Webpages, not websites and webpages. 5. The geolocation of a site is now stored as a field in the Websites MongoDB collection, and containsMRI is now stored as a field in the Webpages MongoDB collection. 6. Tried out some MongoDB query commands based on what Dr Bainbridge did yesterday.

Location:
other-projects/maori-lang-detection
Files:
4 added
2 modified
3 moved

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33646 r33653  
    316316db.getCollection('webpages').find({"isMRI": true, "singleSentences.langCode": "mri"}) 
    317317db.getCollection('webpages').find({"singleSentences": { $elemMatch: {"langCode":"mri"} } }, {"singleSentences.$": "mri"}) 
    318  
    319  
     318db.getCollection('Webpages').find({"isMRI": true, "singleSentences": { $elemMatch: {"langCode":"eng"} } }, {"singleSentences.$": "eng"}) [single English lang sentence] 
     319db.getCollection('Webpages').find({"containsMRI": true, "singleSentences": { $elemMatch: {"langCode":"mri"} } }, {"singleSentences.$": "mri"}) [gets 1st sentence of docs which have sentences containing MRI] 
    320320 
    321321 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33652 r33653  
    165165    } 
    166166     
    167      
     167    /* 
    168168    public void insertWebsiteInfo(WebsiteInfo website) 
    169169    { 
     
    188188               + " inserted successfully into " + WEBSITES_COLLECTION);   
    189189    } 
    190  
    191     /* TODO: 
    192     https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 
    193190    */ 
     191 
    194192    /** 
    195193     * Inserts a web page into the mongodb. Besides page related metadata and full body text 
    196194     * the language information per sentence and per 2 adjacent sentences also get stored 
    197195     * into the mongodb. 
    198      */     
     196     */ 
     197    /* 
    199198    public void insertWebpageInfo(WebpageInfo webpage) 
    200199    { 
     
    255254    logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);   
    256255    } 
     256    */ 
    257257 
    258258    /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33634 r33653  
    1 package org.greenstone.atea; 
     1package org.greenstone.atea.morphia; 
    22 
     3import dev.morphia.annotations.*; 
     4 
     5@Entity("Sentences") 
    36public class SentenceInfo { 
    47    public final double confidenceLevel; 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebpageInfo.java

    r33651 r33653  
    1 package org.greenstone.atea; 
     1package org.greenstone.atea.morphia; 
    22 
     3import dev.morphia.annotations.*; 
    34import java.util.ArrayList; 
     5import java.util.List; 
    46 
     7/** 
     8 * Morphia provides the Object Document Mapper for MongoDB 
     9 * https://www.baeldung.com/mongodb-morphia  
     10 *  
     11 */ 
     12@Entity("Webpages") 
    513public class WebpageInfo { 
    614 
    7     private int mriSentenceCount; 
    8      
    915    /** db table ids */ 
     16    @Id 
    1017    public final long webpageID; 
    11     public final int websiteID; 
     18    // TODO: should this be a "Reference" to the WebsiteInfo object instead? 
     19    // See section 5.2 of https://www.baeldung.com/mongodb-morphia 
     20    public final String websiteID; //int websiteID; 
    1221 
    1322    public final int totalSentences; 
     
    2029    public final String modifiedTime; 
    2130    public final String fetchTime; 
    22     public final ArrayList<SentenceInfo> singleSentences; 
    23     public final ArrayList<SentenceInfo> overlappingSentences; 
    24      
    25     public WebpageInfo (long webpageID, int websiteID, 
     31 
     32    @Embedded 
     33    public final List<SentenceInfo> singleSentences; 
     34    @Embedded 
     35    public final List<SentenceInfo> overlappingSentences; 
     36 
     37    private int mriSentenceCount;     
     38    private boolean containsMRI; 
     39     
     40    public WebpageInfo (long webpageID, String siteID/*int websiteID,*/, 
    2641            String pageText, String pageURL, boolean isMRI, int totalSentences, 
    2742            String charEncoding, String modifiedTime, String fetchTime, 
    28             ArrayList<SentenceInfo> singleSentences, 
    29             ArrayList<SentenceInfo> overlappingSentences) 
     43            List<SentenceInfo> singleSentences, 
     44            List<SentenceInfo> overlappingSentences) 
    3045    { 
    3146 
    3247    this.webpageID = webpageID; 
    33     this.websiteID = websiteID; 
     48    //this.websiteID = websiteID; 
     49    this.websiteID = siteID; 
    3450 
    3551    this.totalSentences = totalSentences; 
     
    5167    this.mriSentenceCount = count; 
    5268    } 
     69    public void setContainsMRI(boolean containsMRI) { 
     70    this.containsMRI = containsMRI; 
     71    } 
    5372 
    5473    public int getMRISentenceCount() { return this.mriSentenceCount; } 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33634 r33653  
    1 package org.greenstone.atea; 
     1package org.greenstone.atea.morphia; 
    22 
     3import dev.morphia.annotations.*; 
     4 
     5@Entity("Websites") 
    36public class WebsiteInfo { 
    4  
    5     public final int id; 
     7    //public final int id; 
     8    @Id 
    69    public final String siteFolderName; 
    710    public final String domain; 
     
    1821    public final boolean urlContainsLangCodeInpath; 
    1922     
    20     public WebsiteInfo(int siteCount, String siteFolderName, String domainOfSite, 
     23    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 
    2124               int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI, 
    2225               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 
    2326               String geoLocationCountryCode, boolean urlContainsLangCodeInpath) 
    2427    { 
    25     this.id = siteCount; 
     28    //this.id = siteCount; 
    2629    this.siteFolderName = siteFolderName; 
    2730    this.domain = domainOfSite;