Ignore:
Timestamp:
2019-11-15T23:14:48+13:00 (4 years ago)
Author:
ak19
Message:

Links to more reading

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java

    r33674 r33698  
    8181    /** keep a list to store the text of each page */
    8282    private ArrayList<TextDumpPage> pages;
     83
     84
     85   
     86    /** Number of language and confidence results to return for storing in MongoDB.
     87     * MongoDB runs out of space if storing too many, as we store this info per sentence
     88     * and a long text document presumably becomes a very large MongoDB document. */
     89    private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model
     90
    8391   
    8492    private boolean isStartOfNewWebPageRecord(String prevLine, String line) {
     
    232240        int totalSentences = sentences.length;
    233241        int numSentencesInMRI = 0;
    234         ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences);
    235         ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences);
     242        ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES);
     243        ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES);
    236244
    237245        WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/,
     
    242250                                     overlappingSentences);
    243251
     252       
    244253        for(SentenceInfo si : singleSentences) {
    245             LanguageInfo bestLanguage = si.languagesInfo[0];
    246             if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     254            //LanguageInfo bestLanguage = si.languagesInfo[0];
     255            //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
     256            if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
    247257            numSentencesInMRI++;
    248258            }
    249259        }
     260       
     261       
    250262        webpage.setMRISentenceCount(numSentencesInMRI);
    251263        webpage.setContainsMRI((numSentencesInMRI > 0));
Note: See TracChangeset for help on using the changeset viewer.