Changeset 33698

Show
Ignore:
Timestamp:
15.11.2019 23:14:48 (3 weeks ago)
Author:
ak19
Message:

Links to more reading

Location:
other-projects/maori-lang-detection
Files:
6 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33675 r33698  
    346346https://docs.mongodb.com/manual/reference/method/db.collection.find/ 
    347347https://docs.mongodb.com/manual/reference/method/db.collection.find/#find-projection 
     348https://stackoverflow.com/questions/39641925/mongodb-aggregation-framework-to-get-frequencies-of-fields-values 
     349 
     350https://exploratory.io/note/kanaugust/0961813761939766 
     351https://docs.mongodb.com/manual/tutorial/project-fields-from-query-results/ 
     352https://docs.mongodb.com/manual/aggregation/ 
     353 
    348354 
    349355Mongo Studio 3T documentation: 
     
    407413 
    408414 
    409  
     415# PROJECTION: 
     416db.getCollection('Websites').find({geoLocationCountryCode: {$ne:"nz"}}, {geoLocationCountryCode:1, urlContainsLangCodeInpath: 1}) 
     417 
     418https://docs.mongodb.com/manual/aggregation/ 
     419db.orders.aggregate([ 
     420   { $match: { status: "A" } }, 
     421   { $group: { _id: "$cust_id", total: { $sum: "$amount" } } } 
     422]) 
     423db.Websites.aggregate([ { $match: { urlContainsLangCodeInpath: true } }, { $group: { _id: "$geoLocationCountryCode", total: { $sum: 1 } } } ]) 
     424 
     425 
     426AIMS: 
    410427* Identify where Maori language is online. 
    411428* How can we identify high quality sites that would be good for a corpus. 
    412429(Related work for other languages to quantifiably answer that) 
    413430 
     431 
     432 
     433 
     434data-preparation 
     435docs 
     436 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33674 r33698  
    8181    /** keep a list to store the text of each page */ 
    8282    private ArrayList<TextDumpPage> pages; 
     83 
     84 
     85     
     86    /** Number of language and confidence results to return for storing in MongoDB. 
     87     * MongoDB runs out of space if storing too many, as we store this info per sentence 
     88     * and a long text document presumably becomes a very large MongoDB document. */ 
     89    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model 
     90 
    8391     
    8492    private boolean isStartOfNewWebPageRecord(String prevLine, String line) { 
     
    232240        int totalSentences = sentences.length; 
    233241        int numSentencesInMRI = 0; 
    234         ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences); 
    235         ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences); 
     242        ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES); 
     243        ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES); 
    236244 
    237245        WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/, 
     
    242250                                     overlappingSentences); 
    243251 
     252         
    244253        for(SentenceInfo si : singleSentences) { 
    245             LanguageInfo bestLanguage = si.languagesInfo[0]; 
    246             if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
     254            //LanguageInfo bestLanguage = si.languagesInfo[0]; 
     255            //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
     256            if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 
    247257            numSentencesInMRI++; 
    248258            } 
    249259        } 
     260         
     261         
    250262        webpage.setMRISentenceCount(numSentencesInMRI); 
    251263        webpage.setContainsMRI((numSentencesInMRI > 0)); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33674 r33698  
    5353     */ 
    5454    public final double MINIMUM_CONFIDENCE; 
    55  
    56     /** Number of language and confidence results to return for storing in MongoDB 
    57      * MongoDB runs out of space if storing too many, as we store this info per sentence 
    58      * and a long text document becomes a very large MongoDB document presumable*/ 
    59     public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model 
    6055     
    6156    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */ 
     
    162157    } 
    163158  
    164     public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) { 
     159    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) { 
    165160 
    166161    if(sentences == null) { 
     
    194189    } 
    195190 
    196     public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) { 
     191    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) { 
    197192     
    198193    if(sentences == null) { 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/LanguageInfo.java

    r33674 r33698  
    55@Entity("Languages") 
    66public class LanguageInfo { 
    7     public final double confidenceLevel; 
     7    public final double confidence; 
    88    /** 3 letter lang code */ 
    99    public final String langCode; 
    1010    public LanguageInfo(double confidence, String langCode) { 
    11         this.confidenceLevel = confidence; 
     11        this.confidence = confidence; 
    1212        this.langCode = langCode;        
    1313    } 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33674 r33698  
    44import java.util.HashMap; 
    55 
     6// morphia is used to map Java objects to mongodb 
    67import dev.morphia.annotations.*; 
    78 
     
    1112 
    1213    public final String sentence; 
    13     public final Map<String, Double> languageToConfidenceMap; 
    14     @Embedded 
    15     public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 
    16  
     14    //public final Map<String, Double> languageToConfidenceMap; 
     15    public final String bestLangCode; 
     16    public final double bestLangConfidence;     
     17    //@Embedded 
     18    //public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 
     19     
    1720     
    1821    public SentenceInfo(String sentence, LanguageInfo[] languages) { 
    19     this.sentence = sentence; 
    20     this.languagesInfo = languages; 
     22    this.sentence = sentence;    
     23    if(languages.length >= 1) { 
     24        this.bestLangCode = languages[0].langCode; 
     25        this.bestLangConfidence = languages[0].confidence; 
     26    } else { 
     27        this.bestLangCode = null; 
     28        this.bestLangConfidence = 0.0; 
     29    } 
    2130 
     31    //this.languagesInfo = null; 
     32    //this.languagesInfo = languages; 
     33 
     34     
    2235    // let's store (langCode -> confidence) lookup in Map: 
    23     this.languageToConfidenceMap = new HashMap<String, Double>();    
     36    //this.languageToConfidenceMap = new HashMap<String, Double>(); 
     37    /* 
    2438    for(LanguageInfo li : languages) { 
    2539        String langCode = li.langCode; 
    26         Double confidence = new Double(li.confidenceLevel); 
     40        Double confidence = new Double(li.confidence); 
    2741        languageToConfidenceMap.put(langCode, confidence); 
    2842    } 
     43    */ 
    2944    } 
    3045     
    3146} 
    3247 
    33 // BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE: 
    34 /* 
    35 @Entity("Sentences") 
    36 public class SentenceInfo { 
    37     public final double confidenceLevel; 
    38     // 3 letter lang code 
    39     public final String langCode; 
    40     public final String sentence; 
    41      
    42     public SentenceInfo(double confidence, String langCode, String sentence) { 
    43     this.confidenceLevel = confidence; 
    44     this.langCode = langCode; 
    45     this.sentence = sentence; 
    46     } 
    47 } 
    48 */ 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33653 r33698  
    1919     
    2020    public final String geoLocationCountryCode; 
    21     public final boolean urlContainsLangCodeInpath; 
     21    public final boolean urlContainsLangCodeInPath; 
    2222     
    2323    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 
    2424               int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI, 
    2525               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 
    26                String geoLocationCountryCode, boolean urlContainsLangCodeInpath) 
     26               String geoLocationCountryCode, boolean urlContainsLangCodeInPath) 
    2727    { 
    2828    //this.id = siteCount; 
     
    3939     
    4040    this.geoLocationCountryCode = geoLocationCountryCode; 
    41     this.urlContainsLangCodeInpath = urlContainsLangCodeInpath; 
     41    this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 
    4242    } 
    4343}