Changeset 33674 for other-projects


Ignore:
Timestamp:
2019-11-15T00:21:31+13:00 (4 years ago)
Author:
ak19
Message:

Changes to support storing only the top 5 predicted langcodes and their confidence values per sentence/overlapping sentence (storing all 103 made some documents, like those of site 00006, too big to go into MongoDB). Have re-run NutchTextDumpToMongoDB to send the new form of the docs into MongoDB.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33657 r33674  
    243243
    244244        for(SentenceInfo si : singleSentences) {
    245             if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     245            LanguageInfo bestLanguage = si.languagesInfo[0];
     246            if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
    246247            numSentencesInMRI++;
    247248            }
     
    251252       
    252253        //mongodbAccess.insertWebpageInfo(webpage);
     254        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
    253255        mongodbAccess.datastore.save(webpage);
    254256        }
    255257    }
    256258    }
    257 
    258     /*
    259     public void printSiteStats() {
    260    
    261    
    262     logger.info("------------- " + this.siteID + " SITE STATS -----------");
    263 
    264     logger.info("SITE DOMAIN: " + this.domainOfSite);
    265     logger.info("Total number of web pages in site: " + pages.size());
    266     logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());
    267    
    268     if(pagesInMRI.size() > 0) {
    269         logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");
    270         for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {
    271         logger.info(mriWebPageInfo.toString());
    272         }
    273     }
    274 
    275     logger.info("                      -----------                   ");
    276     if(pagesContainingMRI.size() > 0) {     
    277         logger.info("The following pages weren't detected as primarily being in Māori");
    278         logger.info("But still contained sentences detected as Māori");
    279         for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {
    280         logger.info(mriWebPageInfo.toString());
    281         }
    282        
    283     } else {
    284         logger.info("No further pages detected as containing any sentences in MRI");       
    285     }
    286     logger.info("                      -----------                   ");
    287     }
    288     */
    289 
    290259   
    291260
     
    328297
    329298    //mongodbAccess.insertWebsiteInfo(website);
     299    // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia
    330300    mongodbAccess.datastore.save(website);
    331301    }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33652 r33674  
    2929import java.util.ArrayList;
    3030
    31 import org.greenstone.atea.morphia.*;
     31import org.greenstone.atea.morphia.SentenceInfo;
     32import org.greenstone.atea.morphia.LanguageInfo;
    3233
    3334/**
     
    5253     */
    5354    public final double MINIMUM_CONFIDENCE;
     55
     56    /** Number of language and confidence results to return for storing in MongoDB
     57     * MongoDB runs out of space if storing too many, as we store this info per sentence
     59     * and a long text document presumably becomes a very large MongoDB document */
     59    public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model
    5460   
    5561    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */
     
    168174        //System.err.println(sentence);
    169175
    170         Language bestLanguage = myCategorizer.predictLanguage(sentence);
    171         double confidence = bestLanguage.getConfidence();
    172 
    173         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     176        //Language bestLanguage = myCategorizer.predictLanguage(sentence);
     177        //double confidence = bestLanguage.getConfidence();
     178        //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
     179       
     180        Language languages[] = myCategorizer.predictLanguages(sentence);
     181        // languages array already sorted in order of descending confidence
     182        LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
     183        for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
     184        String langCode = languages[j].getLang();
     185        double confidence = languages[j].getConfidence();
     186        languagesInfo[j] = new LanguageInfo(confidence, langCode);
     187        }
     188       
     189        sentencesList.add(new SentenceInfo(sentence, languagesInfo));
     190       
    174191    }
    175192
     
    199216        //System.err.println(sentence);
    200217
    201         Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
    202         double confidence = bestLanguage.getConfidence();
    203 
    204         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));
     218        //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
     219        //double confidence = bestLanguage.getConfidence();
     220        //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));
     221       
     222        Language languages[] = myCategorizer.predictLanguages(doubleSentence);
     223        // languages array already sorted in order of descending confidence
     224        LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
     225       
     226        for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
     227        String langCode = languages[j].getLang();
     228        double confidence = languages[j].getConfidence();
     229        languagesInfo[j] = new LanguageInfo(confidence, langCode);
     230        }
     231        sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo));
     232       
    205233    }
    206234
     
    305333        text.append(line + "\n"); // add back (unix style) line ending
    306334    }
     335
    307336    return isTextInLanguage(langCode, text.toString());
    308337    }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33653 r33674  
    11package org.greenstone.atea.morphia;
     2
     3import java.util.Map;
     4import java.util.HashMap;
    25
    36import dev.morphia.annotations.*;
    47
     8
     9@Entity("Sentences")
     10public class SentenceInfo {
     11
     12    public final String sentence;
     13    public final Map<String, Double> languageToConfidenceMap;
     14    @Embedded
     15    public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs
     16
     17   
     18    public SentenceInfo(String sentence, LanguageInfo[] languages) {
     19    this.sentence = sentence;
     20    this.languagesInfo = languages;
     21
     22    // let's store (langCode -> confidence) lookup in Map:
     23    this.languageToConfidenceMap = new HashMap<String, Double>();   
     24    for(LanguageInfo li : languages) {
     25        String langCode = li.langCode;
     26        Double confidence = new Double(li.confidenceLevel);
     27        languageToConfidenceMap.put(langCode, confidence);
     28    }
     29    }
     30   
     31}
     32
     33// BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE:
     34/*
    535@Entity("Sentences")
    636public class SentenceInfo {
    737    public final double confidenceLevel;
    8     /** 3 letter lang code */
     38    // 3 letter lang code
    939    public final String langCode;
    1040    public final String sentence;
     
    1646    }
    1747}
     48*/
Note: See TracChangeset for help on using the changeset viewer.