Changeset 33698


Ignore:
Timestamp:
2019-11-15T23:14:48+13:00 (4 years ago)
Author:
ak19
Message:

Links to more reading

Location:
other-projects/maori-lang-detection
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/MoreReading/mongodb.txt

    r33675 r33698  
    346346https://docs.mongodb.com/manual/reference/method/db.collection.find/
    347347https://docs.mongodb.com/manual/reference/method/db.collection.find/#find-projection
     348https://stackoverflow.com/questions/39641925/mongodb-aggregation-framework-to-get-frequencies-of-fields-values
     349
     350https://exploratory.io/note/kanaugust/0961813761939766
     351https://docs.mongodb.com/manual/tutorial/project-fields-from-query-results/
     352https://docs.mongodb.com/manual/aggregation/
     353
    348354
    349355Mongo Studio 3T documentation:
     
    407413
    408414
    409 
     415# PROJECTION:
     416db.getCollection('Websites').find({geoLocationCountryCode: {$ne:"nz"}}, {geoLocationCountryCode:1, urlContainsLangCodeInpath: 1})
     417
     418https://docs.mongodb.com/manual/aggregation/
     419db.orders.aggregate([
     420   { $match: { status: "A" } },
     421   { $group: { _id: "$cust_id", total: { $sum: "$amount" } } }
     422])
     423db.Websites.aggregate([{ $match: {urlContainsLangCodeInpath: true} }, { $group: { _id: "$geoLocationCountryCode", total: { $sum: 1 } } }])
     424
     425
     426AIMS:
    410427* Identify where Maori language is online.
    411428* How can we identify high quality sites that would be good for a corpus.
    412429(Related work for other languages to quantifiably answer that)
    413430
     431
     432
     433
     434data-preparation
     435docs
     436
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33674 r33698  
    8181    /** keep a list to store the text of each page */
    8282    private ArrayList<TextDumpPage> pages;
     83
     84
     85   
     86    /** Number of language and confidence results to return for storing in MongoDB
     87     * MongoDB runs out of space if storing too many, as we store this info per sentence
     88     * and a long text document becomes a very large MongoDB document, presumably */
     89    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
     90
    8391   
    8492    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
     
    232240        int totalSentences = sentences.length;
    233241        int numSentencesInMRI = 0;
    234         ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences);
    235         ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences);
     242        ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
     243        ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);
    236244
    237245        WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
     
    242250                                     overlappingSentences);
    243251
     252       
    244253        for(SentenceInfo si : singleSentences) {
    245             LanguageInfo bestLanguage = si.languagesInfo[0];
    246             if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     254            //LanguageInfo bestLanguage = si.languagesInfo[0];
     255            //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     256            if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
    247257            numSentencesInMRI++;
    248258            }
    249259        }
     260       
     261       
    250262        webpage.setMRISentenceCount(numSentencesInMRI);
    251263        webpage.setContainsMRI((numSentencesInMRI > 0));
  • other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java

    r33674 r33698  
    5353     */
    5454    public final double MINIMUM_CONFIDENCE;
    55 
    56     /** Number of language and confidence results to return for storing in MongoDB
    57      * MongoDB runs out of space if storing too many, as we store this info per sentence
    58      * and a long text document becomes a very large MongoDB document presumable*/
    59     public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model
    6055   
    6156    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */
     
    162157    }
    163158 
    164     public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {
     159    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) {
    165160
    166161    if(sentences == null) {
     
    194189    }
    195190
    196     public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {
     191    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) {
    197192   
    198193    if(sentences == null) {
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/LanguageInfo.java

    r33674 r33698  
    55@Entity("Languages")
    66public class LanguageInfo {
    7     public final double confidenceLevel;
     7    public final double confidence;
    88    /** 3 letter lang code */
    99    public final String langCode;
    1010    public LanguageInfo(double confidence, String langCode) {
    11         this.confidenceLevel = confidence;
     11        this.confidence = confidence;
    1212        this.langCode = langCode;       
    1313    }
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java

    r33674 r33698  
    44import java.util.HashMap;
    55
     6// morphia is used to map Java objects to mongodb
    67import dev.morphia.annotations.*;
    78
     
    1112
    1213    public final String sentence;
    13     public final Map<String, Double> languageToConfidenceMap;
    14     @Embedded
    15     public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs
    16 
     14    //public final Map<String, Double> languageToConfidenceMap;
     15    public final String bestLangCode;
     16    public final double bestLangConfidence;   
     17    //@Embedded
     18    //public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs
     19   
    1720   
    1821    public SentenceInfo(String sentence, LanguageInfo[] languages) {
    19     this.sentence = sentence;
    20     this.languagesInfo = languages;
     22    this.sentence = sentence;   
     23    if(languages.length >= 1) {
     24        this.bestLangCode = languages[0].langCode;
     25        this.bestLangConfidence = languages[0].confidence;
     26    } else {
     27        this.bestLangCode = null;
     28        this.bestLangConfidence = 0.0;
     29    }
    2130
     31    //this.languagesInfo = null;
     32    //this.languagesInfo = languages;
     33
     34   
    2235    // let's store (langCode -> confidence) lookup in Map:
    23     this.languageToConfidenceMap = new HashMap<String, Double>();   
     36    //this.languageToConfidenceMap = new HashMap<String, Double>();
     37    /*
    2438    for(LanguageInfo li : languages) {
    2539        String langCode = li.langCode;
    26         Double confidence = new Double(li.confidenceLevel);
     40        Double confidence = new Double(li.confidence);
    2741        languageToConfidenceMap.put(langCode, confidence);
    2842    }
     43    */
    2944    }
    3045   
    3146}
    3247
    33 // BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE:
    34 /*
    35 @Entity("Sentences")
    36 public class SentenceInfo {
    37     public final double confidenceLevel;
    38     // 3 letter lang code
    39     public final String langCode;
    40     public final String sentence;
    41    
    42     public SentenceInfo(double confidence, String langCode, String sentence) {
    43     this.confidenceLevel = confidence;
    44     this.langCode = langCode;
    45     this.sentence = sentence;
    46     }
    47 }
    48 */
  • other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java

    r33653 r33698  
    1919   
    2020    public final String geoLocationCountryCode;
    21     public final boolean urlContainsLangCodeInpath;
     21    public final boolean urlContainsLangCodeInPath;
    2222   
    2323    public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite,
    2424               int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI,
    2525               long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl,
    26                String geoLocationCountryCode, boolean urlContainsLangCodeInpath)
     26               String geoLocationCountryCode, boolean urlContainsLangCodeInPath)
    2727    {
    2828    //this.id = siteCount;
     
    3939   
    4040    this.geoLocationCountryCode = geoLocationCountryCode;
    41     this.urlContainsLangCodeInpath = urlContainsLangCodeInpath;
     41    this.urlContainsLangCodeInPath = urlContainsLangCodeInPath;
    4242    }
    4343}
Note: See TracChangeset for help on using the changeset viewer.