Changeset 33698 for other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
- Timestamp:
- 2019-11-15T23:14:48+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33674 r33698 81 81 /** keep a list to store the text of each page */ 82 82 private ArrayList<TextDumpPage> pages; 83 84 85 86 /** Number of language and confidence results to return for storing in MongoDB. 87 * MongoDB runs out of space if storing too many, as we store this info per sentence 88 * and a long text document becomes a very large MongoDB document presumably*/ 89 private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model 90 83 91 84 92 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { … … 232 240 int totalSentences = sentences.length; 233 241 int numSentencesInMRI = 0; 234 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences ); 235 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences ); 242 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES); 243 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES); 236 244 237 245 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/, … … 242 250 overlappingSentences); 243 251 252 244 253 for(SentenceInfo si : singleSentences) { 245 LanguageInfo bestLanguage = si.languagesInfo[0]; 246 if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 254 //LanguageInfo bestLanguage = si.languagesInfo[0]; 255 //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 256 if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 247 257 numSentencesInMRI++; 248 258 } 249 259 } 260 261 250 262 webpage.setMRISentenceCount(numSentencesInMRI); 251 263 webpage.setContainsMRI((numSentencesInMRI > 0));
Note:
See TracChangeset
for help on using the changeset viewer.