Changeset 33651

Show
Ignore:
Timestamp:
12.11.2019 18:11:39 (3 weeks ago)
Author:
ak19
Message:

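1. Bugfix: overlappingSentences now works (the records were being added to sentencesList instead of overlappingSentencesList). 2. Storing the number of sentences in Maori (mriSentenceCount).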

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33645 r33651  
    180180    https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 
    181181    */ 
     182    /** 
     183     * Inserts a web page into the mongodb. Besides page-related metadata and the full body text, 
     184     * the language information per sentence and per 2 adjacent sentences also gets stored 
     185     * into the mongodb, along with the count of single sentences detected as Maori (mriSentenceCount). 
     186     */     
    182187    public void insertWebpageInfo(WebpageInfo webpage) 
    183188    { 
     189    int mri_sentence_count = 0; 
     190     
    184191    // load the webpages db 'table' 
    185192    // in mongodb, the equivalent of db tables are called 'collections' 
     
    195202        .append("fetchTime", webpage.fetchTime); 
    196203 
    197     // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT: 
    198     //document.put("singleSentences", webpage.singleSentences); 
    199     //document.put("overlappingSentences", webpage.overlappingSentences); 
    200  
    201204    // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: 
    202205    // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java 
    203206    List<BasicDBObject> sentencesList = new ArrayList<>(); 
    204207    for(SentenceInfo sentenceInfo : webpage.singleSentences) { 
    205         //sentencesList.add(new BasicDBObject("langCode", sentenceInfo.langCode)); 
    206         //sentencesList.add(new BasicDBObject("confidence", sentenceInfo.confidenceLevel)); 
    207         //sentencesList.add(new BasicDBObject("sentence", sentenceInfo.sentence)); 
     208         
    208209        BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); 
     210         
    209211        bsonRecord.put("confidence", sentenceInfo.confidenceLevel); 
    210212        bsonRecord.put("sentence", sentenceInfo.sentence); 
    211213         
    212214        sentencesList.add(bsonRecord); 
     215 
     216        if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
     217        mri_sentence_count++; 
     218        } 
     219 
    213220    }    
    214221    document.put("singleSentences", sentencesList); 
     
    216223    List<BasicDBObject> overlappingSentencesList = new ArrayList<>(); 
    217224    for(SentenceInfo sentenceInfo : webpage.overlappingSentences) { 
     225 
    218226        BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); 
    219227        bsonRecord.put("confidence", sentenceInfo.confidenceLevel); 
    220228        bsonRecord.put("sentence", sentenceInfo.sentence); 
    221          
    222         sentencesList.add(bsonRecord); 
     229         
     230        overlappingSentencesList.add(bsonRecord); 
    223231    }    
    224232    document.put("overlappingSentences", overlappingSentencesList); 
     
    226234    // also put the full text in there 
    227235    document.put("text", webpage.text); 
     236 
     237    // also store the count of sentences in MRI 
     238    webpage.setMRISentenceCount(mri_sentence_count); 
     239    document.put("mriSentenceCount", mri_sentence_count); 
     240 
    228241     
    229242    collection.insertOne(document); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33634 r33651  
    184184    for(int i = 1; i < sentences.length; i++) { 
    185185        // glue every two adjacent sentences together 
    186         String sentence = sentences[i-1]; 
     186        String doubleSentence = sentences[i-1]; 
    187187         
    188188        String separator = ". "; 
    189189        // if the sentence already ends with a terminating punctuation character, 
    190190        // then separator is just a space 
    191         sentence = sentence.trim(); 
    192         if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) { 
     191        doubleSentence = doubleSentence.trim(); 
     192        if(doubleSentence.endsWith(".") || doubleSentence.endsWith("?") || doubleSentence.endsWith("!")) { 
    193193        separator = " "; 
    194194        } 
    195         sentence = sentence + separator + sentences[i];  
     195        doubleSentence = doubleSentence + separator + sentences[i];  
    196196         
    197197        //System.err.println(doubleSentence); 
    198198 
    199         Language bestLanguage = myCategorizer.predictLanguage(sentence); 
     199        Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 
    200200        double confidence = bestLanguage.getConfidence(); 
    201201 
    202         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
     202        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 
    203203    } 
    204204 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebpageInfo.java

    r33634 r33651  
    55public class WebpageInfo { 
    66 
     7    private int mriSentenceCount; 
     8     
    79    /** db table ids */ 
    810    public final long webpageID; 
     
    1315    public final String text; 
    1416    public final String URL; 
    15     public final boolean isMRI; 
    16      
     17    public final boolean isMRI;     
     18     
    1719    public final String charEncoding; 
    1820    public final String modifiedTime; 
     
    4547     
    4648    } 
     49 
     50    public void setMRISentenceCount(int count) { 
     51    this.mriSentenceCount = count; 
     52    } 
     53 
     54    public int getMRISentenceCount() { return this.mriSentenceCount; } 
    4755}