Changeset 33652


Ignore:
Timestamp:
2019-11-12T20:41:13+13:00 (4 years ago)
Author:
ak19
Message:

Introducing morphia subpackage

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33651 r33652  
    2424import org.apache.log4j.Logger;
    2525
     26import org.greenstone.atea.morphia.*;
     27import dev.morphia.*;
    2628
    2729/**
     
    6870    private MongoClient mongo = null;
    6971    private MongoDatabase database = null;
    70    
    71    
     72
     73    /**
     74     * MongoDB client handle via Morphia, which provides the ODM (object-document mapper)
     75     * for MongoDB.
     76    */
     77    public Datastore datastore = null;
     78   
    7279    public MongoDBAccess() throws Exception {
    7380    boolean success = false;
     
    141148    this.database = mongo.getDatabase(DB_NAME);
    142149    */
     150
     151    Morphia morphia = new Morphia();
     152    morphia.mapPackage("com.greenstone.atea.morphia");
     153    datastore = morphia.createDatastore(mongo, DB_NAME);
     154    datastore.ensureIndexes();
    143155   
    144156    }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33634 r33652  
    99import org.apache.commons.csv.*;
    1010import org.apache.log4j.Logger;
     11
     12//import org.bson.types.ObjectId;
     13   
     14import org.greenstone.atea.morphia.*;
    1115
    1216
     
    223227        String[] sentences = maoriTxtDetector.getAllSentences(text);
    224228        int totalSentences = sentences.length;
     229        int numSentencesInMRI = 0;
    225230        ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences);
    226231        ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences);
    227        
    228         WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER,
    229                                      SITE_COUNTER,
     232
     233        WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
     234                                     this.siteID/*SITE_COUNTER*/,
    230235                                     isMRI,
    231236                                     totalSentences,
     
    233238                                     overlappingSentences);
    234239
    235        
    236         mongodbAccess.insertWebpageInfo(webpage);
     240        for(SentenceInfo si : singleSentences) {
     241            if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     242            numSentencesInMRI++;
     243            }
     244        }
     245        webpage.setMRISentenceCount(numSentencesInMRI);
     246        webpage.setContainsMRI((numSentencesInMRI > 0));
     247       
     248        //mongodbAccess.insertWebpageInfo(webpage);
     249        mongodbAccess.datastore.save(webpage);
    237250        }
    238251    }
     
    291304    }
    292305   
    293     //File geoLiteCityDatFile = new File(this.getClass().getResource("GeoLiteCity.dat").getFile());
    294     //this.geoLocationCountryCode = getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
     306    File geoLiteCityDatFile = new File(this.getClass().getClassLoader().getResource("GeoLiteCity.dat").getFile());
     307    try {
     308        this.geoLocationCountryCode = Utility.getCountryCodeOfDomain(this.domainOfSite, geoLiteCityDatFile);
     309    } catch(Exception e) {
     310        e.printStackTrace();
     311        this.geoLocationCountryCode = null;
     312    }
    295313
    296314    int totalPages = pages.size(); 
    297315
    298     WebsiteInfo website = new WebsiteInfo(SITE_COUNTER, this.siteID, this.domainOfSite,
     316    WebsiteInfo website = new WebsiteInfo(/*SITE_COUNTER,*/ this.siteID, this.domainOfSite,
    299317          totalPages, this.countOfWebPagesWithBodyText, this.numPagesInMRI,
    300318          this.siteCrawledTimestamp, this.siteCrawlUnfinished, redoCrawl,
    301319          this.geoLocationCountryCode, this.urlContainsLangCodeInPath);
    302320
    303     mongodbAccess.insertWebsiteInfo(website);
    304    
     321    //mongodbAccess.insertWebsiteInfo(website);
     322    mongodbAccess.datastore.save(website);
    305323    }
    306324
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextDumpPage.java

    r33634 r33652  
    88import org.apache.log4j.Logger;
    99
     10import org.greenstone.atea.morphia.*;
    1011
    1112public class TextDumpPage {
     
    178179     */
    179180    public WebpageInfo convertStoredDataToWebpageInfo(
    180       long webpageID, int websiteID, boolean isMRI, int totalSentences,
     181      long webpageID, String siteID /*int websiteID*/, boolean isMRI, int totalSentences,
    181182      ArrayList<SentenceInfo> singleSentences, ArrayList<SentenceInfo> overlappingSentences)
    182183    {
     
    188189    String fetchTime = getFetchTime();
    189190
    190     WebpageInfo webpage = new WebpageInfo(webpageID, websiteID,
     191    WebpageInfo webpage = new WebpageInfo(webpageID, siteID/*websiteID,*/,
    191192                          pageText, pageURL, isMRI, totalSentences,
    192193                          charEncoding, modifiedTime, fetchTime,
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33651 r33652  
    2929import java.util.ArrayList;
    3030
     31import org.greenstone.atea.morphia.*;
     32
    3133/**
    3234 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
Note: See TracChangeset for help on using the changeset viewer.