Changeset 33674 for other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
- Timestamp: 2019-11-15T00:21:31+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33652 r33674 29 29 import java.util.ArrayList; 30 30 31 import org.greenstone.atea.morphia.*; 31 import org.greenstone.atea.morphia.SentenceInfo; 32 import org.greenstone.atea.morphia.LanguageInfo; 32 33 33 34 /** … … 52 53 */ 53 54 public final double MINIMUM_CONFIDENCE; 55 56 /** Number of language and confidence results to return for storing in MongoDB 57 * MongoDB runs out of space if storing too many, as we store this info per sentence 58 * and a long text document becomes a very large MongoDB document presumable*/ 59 public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model 54 60 55 61 /** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */ … … 168 174 //System.err.println(sentence); 169 175 170 Language bestLanguage = myCategorizer.predictLanguage(sentence); 171 double confidence = bestLanguage.getConfidence(); 172 173 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 176 //Language bestLanguage = myCategorizer.predictLanguage(sentence); 177 //double confidence = bestLanguage.getConfidence(); 178 //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 179 180 Language languages[] = myCategorizer.predictLanguages(sentence); 181 // languages array already sorted in order of descending confidence 182 LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 183 for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 184 String langCode = languages[j].getLang(); 185 double confidence = languages[j].getConfidence(); 186 languagesInfo[j] = new LanguageInfo(confidence, langCode); 187 } 188 189 sentencesList.add(new SentenceInfo(sentence, languagesInfo)); 190 174 191 } 175 192 … … 199 216 //System.err.println(sentence); 200 217 201 Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 202 double confidence = bestLanguage.getConfidence(); 203 204 
sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 218 //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 219 //double confidence = bestLanguage.getConfidence(); 220 //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 221 222 Language languages[] = myCategorizer.predictLanguages(doubleSentence); 223 // languages array already sorted in order of descending confidence 224 LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 225 226 for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 227 String langCode = languages[j].getLang(); 228 double confidence = languages[j].getConfidence(); 229 languagesInfo[j] = new LanguageInfo(confidence, langCode); 230 } 231 sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo)); 232 205 233 } 206 234 … … 305 333 text.append(line + "\n"); // add back (unix style) line ending 306 334 } 335 307 336 return isTextInLanguage(langCode, text.toString()); 308 337 }
Note: See TracChangeset for help on using the changeset viewer.