Changeset 33651


Ignore:
Timestamp:
2019-11-12T18:11:39+13:00 (4 years ago)
Author:
ak19
Message:
  1. Bugfix: overlappingSentences now works. 2. Storing numSentencesInMaori
Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java

    r33645 r33651  
    180180    https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex
    181181    */
     182    /**
     183     * Inserts a web page into MongoDB. Besides the page-related metadata and the full body text,
     184     * the language information per sentence and per pair of adjacent sentences also gets stored
     185     * in MongoDB.
     186     */   
    182187    public void insertWebpageInfo(WebpageInfo webpage)
    183188    {
     189    int mri_sentence_count = 0;
     190   
    184191    // load the webpages db 'table'
    185192    // in mongodb, the equivalent of db tables are called 'collections'
     
    195202        .append("fetchTime", webpage.fetchTime);
    196203
    197     // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT:
    198     //document.put("singleSentences", webpage.singleSentences);
    199     //document.put("overlappingSentences", webpage.overlappingSentences);
    200 
    201204    // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
    202205    // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
    203206    List<BasicDBObject> sentencesList = new ArrayList<>();
    204207    for(SentenceInfo sentenceInfo : webpage.singleSentences) {
    205         //sentencesList.add(new BasicDBObject("langCode", sentenceInfo.langCode));
    206         //sentencesList.add(new BasicDBObject("confidence", sentenceInfo.confidenceLevel));
    207         //sentencesList.add(new BasicDBObject("sentence", sentenceInfo.sentence));
     208       
    208209        BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
     210       
    209211        bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
    210212        bsonRecord.put("sentence", sentenceInfo.sentence);
    211213       
    212214        sentencesList.add(bsonRecord);
     215
     216        if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     217        mri_sentence_count++;
     218        }
     219
    213220    }   
    214221    document.put("singleSentences", sentencesList);
     
    216223    List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
    217224    for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
     225
    218226        BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
    219227        bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
    220228        bsonRecord.put("sentence", sentenceInfo.sentence);
    221        
    222         sentencesList.add(bsonRecord);
     229       
     230        overlappingSentencesList.add(bsonRecord);
    223231    }   
    224232    document.put("overlappingSentences", overlappingSentencesList);
     
    226234    // also put the full text in there
    227235    document.put("text", webpage.text);
     236
     237    // also store the count of sentences in MRI
     238    webpage.setMRISentenceCount(mri_sentence_count);
     239    document.put("mriSentenceCount", mri_sentence_count);
     240
    228241   
    229242    collection.insertOne(document);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33634 r33651  
    184184    for(int i = 1; i < sentences.length; i++) {
    185185        // glue every two adjacent sentences together
    186         String sentence = sentences[i-1];
     186        String doubleSentence = sentences[i-1];
    187187       
    188188        String separator = ". ";
    189189        // if the sentence already ends with a terminating punctuation character,
    190190        // then separator is just a space
    191         sentence = sentence.trim();
    192         if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
     191        doubleSentence = doubleSentence.trim();
     192        if(doubleSentence.endsWith(".") || doubleSentence.endsWith("?") || doubleSentence.endsWith("!")) {
    193193        separator = " ";
    194194        }
    195         sentence = sentence + separator + sentences[i];
     195        doubleSentence = doubleSentence + separator + sentences[i];
    196196       
    197197        //System.err.println(sentence);
    198198
    199         Language bestLanguage = myCategorizer.predictLanguage(sentence);
     199        Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
    200200        double confidence = bestLanguage.getConfidence();
    201201
    202         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     202        sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));
    203203    }
    204204
  • other-projects/maori-lang-detection/src/org/greenstone/atea/WebpageInfo.java

    r33634 r33651  
    55public class WebpageInfo {
    66
     7    private int mriSentenceCount;
     8   
    79    /** db table ids */
    810    public final long webpageID;
     
    1315    public final String text;
    1416    public final String URL;
    15     public final boolean isMRI;
    16    
     17    public final boolean isMRI;   
     18   
    1719    public final String charEncoding;
    1820    public final String modifiedTime;
     
    4547   
    4648    }
     49
     50    public void setMRISentenceCount(int count) {
     51    this.mriSentenceCount = count;
     52    }
     53
     54    public int getMRISentenceCount() { return this.mriSentenceCount; }
    4755}
Note: See TracChangeset for help on using the changeset viewer.