Changeset 33674

Show
Ignore:
Timestamp:
15.11.2019 00:21:31 (3 weeks ago)
Author:
ak19
Message:

Changes to support storing only the top 5 predicted langcodes and their confidence values per sentence/overlapping sentence (storing all 103 made some documents, like those of site 00006, too big to go into MongoDB). Have re-run NutchTextDumpToMongoDB to send the new form of the docs into MongoDB.

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
1 added
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33657 r33674  
    243243 
    244244        for(SentenceInfo si : singleSentences) { 
    245             if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
     245            LanguageInfo bestLanguage = si.languagesInfo[0]; 
     246            if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
    246247            numSentencesInMRI++; 
    247248            } 
     
    251252         
    252253        //mongodbAccess.insertWebpageInfo(webpage); 
     254        // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia 
    253255        mongodbAccess.datastore.save(webpage); 
    254256        } 
    255257    } 
    256258    } 
    257  
    258     /* 
    259     public void printSiteStats() { 
    260      
    261      
    262     logger.info("------------- " + this.siteID + " SITE STATS -----------"); 
    263  
    264     logger.info("SITE DOMAIN: " + this.domainOfSite); 
    265     logger.info("Total number of web pages in site: " + pages.size()); 
    266     logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size()); 
    267      
    268     if(pagesInMRI.size() > 0) { 
    269         logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence"); 
    270         for(MRIWebPageStats mriWebPageInfo : pagesInMRI) { 
    271         logger.info(mriWebPageInfo.toString()); 
    272         } 
    273     } 
    274  
    275     logger.info("                      -----------                   "); 
    276     if(pagesContainingMRI.size() > 0) {      
    277         logger.info("The following pages weren't detected as primarily being in Māori"); 
    278         logger.info("But still contained sentences detected as Māori"); 
    279         for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) { 
    280         logger.info(mriWebPageInfo.toString()); 
    281         } 
    282          
    283     } else { 
    284         logger.info("No further pages detected as containing any sentences in MRI");        
    285     } 
    286     logger.info("                      -----------                   "); 
    287     } 
    288     */ 
    289  
    290259     
    291260 
     
    328297 
    329298    //mongodbAccess.insertWebsiteInfo(website); 
     299    // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia 
    330300    mongodbAccess.datastore.save(website); 
    331301    } 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33652 r33674  
    2929import java.util.ArrayList; 
    3030 
    31 import org.greenstone.atea.morphia.*; 
     31import org.greenstone.atea.morphia.SentenceInfo; 
     32import org.greenstone.atea.morphia.LanguageInfo; 
    3233 
    3334/** 
     
    5253     */ 
    5354    public final double MINIMUM_CONFIDENCE; 
     55 
     56    /** Number of language and confidence results to return for storing in MongoDB 
     57     * MongoDB runs out of space if storing too many, as we store this info per sentence 
     58     * and a long text document presumably becomes a very large MongoDB document */ 
     59    public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model 
    5460     
    5561    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */ 
     
    168174        //System.err.println(sentence); 
    169175 
    170         Language bestLanguage = myCategorizer.predictLanguage(sentence); 
    171         double confidence = bestLanguage.getConfidence(); 
    172  
    173         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
     176        //Language bestLanguage = myCategorizer.predictLanguage(sentence); 
     177        //double confidence = bestLanguage.getConfidence(); 
     178        //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 
     179         
     180        Language languages[] = myCategorizer.predictLanguages(sentence); 
     181        // languages array already sorted in order of descending confidence 
     182        LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 
     183        for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 
     184        String langCode = languages[j].getLang(); 
     185        double confidence = languages[j].getConfidence(); 
     186        languagesInfo[j] = new LanguageInfo(confidence, langCode); 
     187        } 
     188         
     189        sentencesList.add(new SentenceInfo(sentence, languagesInfo)); 
     190         
    174191    } 
    175192 
     
    199216        //System.err.println(sentence); 
    200217 
    201         Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 
    202         double confidence = bestLanguage.getConfidence(); 
    203  
    204         sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 
     218        //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 
     219        //double confidence = bestLanguage.getConfidence(); 
     220        //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 
     221         
     222        Language languages[] = myCategorizer.predictLanguages(doubleSentence); 
     223        // languages array already sorted in order of descending confidence 
     224        LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 
     225         
     226        for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 
     227        String langCode = languages[j].getLang(); 
     228        double confidence = languages[j].getConfidence(); 
     229        languagesInfo[j] = new LanguageInfo(confidence, langCode); 
     230        } 
     231        sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo)); 
     232         
    205233    } 
    206234 
     
    305333        text.append(line + "\n"); // add back (unix style) line ending 
    306334    } 
     335 
    307336    return isTextInLanguage(langCode, text.toString()); 
    308337    } 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33653 r33674  
    11package org.greenstone.atea.morphia; 
     2 
     3import java.util.Map; 
     4import java.util.HashMap; 
    25 
    36import dev.morphia.annotations.*; 
    47 
     8 
     9@Entity("Sentences") 
     10public class SentenceInfo { 
     11 
     12    public final String sentence; 
     13    public final Map<String, Double> languageToConfidenceMap; 
     14    @Embedded 
     15    public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 
     16 
     17     
     18    public SentenceInfo(String sentence, LanguageInfo[] languages) { 
     19    this.sentence = sentence; 
     20    this.languagesInfo = languages; 
     21 
     22    // let's store (langCode -> confidence) lookup in Map: 
     23    this.languageToConfidenceMap = new HashMap<String, Double>();    
     24    for(LanguageInfo li : languages) { 
     25        String langCode = li.langCode; 
     26        Double confidence = new Double(li.confidenceLevel); 
     27        languageToConfidenceMap.put(langCode, confidence); 
     28    } 
     29    } 
     30     
     31} 
     32 
     33// BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE: 
     34/* 
    535@Entity("Sentences") 
    636public class SentenceInfo { 
    737    public final double confidenceLevel; 
    8     /** 3 letter lang code */ 
     38    // 3 letter lang code 
    939    public final String langCode; 
    1040    public final String sentence; 
     
    1646    } 
    1747} 
     48*/