Changeset 33674
- Timestamp:
- 2019-11-15T00:21:31+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33657 r33674 243 243 244 244 for(SentenceInfo si : singleSentences) { 245 if(si.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 245 LanguageInfo bestLanguage = si.languagesInfo[0]; 246 if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 246 247 numSentencesInMRI++; 247 248 } … … 251 252 252 253 //mongodbAccess.insertWebpageInfo(webpage); 254 // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia 253 255 mongodbAccess.datastore.save(webpage); 254 256 } 255 257 } 256 258 } 257 258 /*259 public void printSiteStats() {260 261 262 logger.info("------------- " + this.siteID + " SITE STATS -----------");263 264 logger.info("SITE DOMAIN: " + this.domainOfSite);265 logger.info("Total number of web pages in site: " + pages.size());266 logger.info("Of these, the number of pages in Māori (mri) were: " + this.pagesInMRI.size());267 268 if(pagesInMRI.size() > 0) {269 logger.info("The following were the pages detected by OpenNLP as being in Māori with " + maoriTxtDetector.MINIMUM_CONFIDENCE + " confidence");270 for(MRIWebPageStats mriWebPageInfo : pagesInMRI) {271 logger.info(mriWebPageInfo.toString());272 }273 }274 275 logger.info(" ----------- ");276 if(pagesContainingMRI.size() > 0) {277 logger.info("The following pages weren't detected as primarily being in Māori");278 logger.info("But still contained sentences detected as Māori");279 for(MRIWebPageStats mriWebPageInfo : pagesContainingMRI) {280 logger.info(mriWebPageInfo.toString());281 }282 283 } else {284 logger.info("No further pages detected as containing any sentences in MRI");285 }286 logger.info(" ----------- ");287 }288 */289 290 259 291 260 … … 328 297 329 298 //mongodbAccess.insertWebsiteInfo(website); 299 // Uses morphia to save to mongodb, see https://www.baeldung.com/mongodb-morphia 330 300 mongodbAccess.datastore.save(website); 331 301 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33652 r33674 29 29 import java.util.ArrayList; 30 30 31 import org.greenstone.atea.morphia.*; 31 import org.greenstone.atea.morphia.SentenceInfo; 32 import org.greenstone.atea.morphia.LanguageInfo; 32 33 33 34 /** … … 52 53 */ 53 54 public final double MINIMUM_CONFIDENCE; 55 56 /** Number of language and confidence results to return for storing in MongoDB 57 * MongoDB runs out of space if storing too many, as we store this info per sentence 58 * and a long text document becomes a very large MongoDB document presumably*/ 59 public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model 54 60 55 61 /** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */ … … 168 174 //System.err.println(sentence); 169 175 170 Language bestLanguage = myCategorizer.predictLanguage(sentence); 171 double confidence = bestLanguage.getConfidence(); 172 173 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 176 //Language bestLanguage = myCategorizer.predictLanguage(sentence); 177 //double confidence = bestLanguage.getConfidence(); 178 //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence)); 179 180 Language languages[] = myCategorizer.predictLanguages(sentence); 181 // languages array already sorted in order of descending confidence 182 LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 183 for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 184 String langCode = languages[j].getLang(); 185 double confidence = languages[j].getConfidence(); 186 languagesInfo[j] = new LanguageInfo(confidence, langCode); 187 } 188 189 sentencesList.add(new SentenceInfo(sentence, languagesInfo)); 190 174 191 } 175 192 … … 199 216 //System.err.println(sentence); 200 217 201 Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 202 double confidence = bestLanguage.getConfidence(); 203 204 
sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 218 //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 219 //double confidence = bestLanguage.getConfidence(); 220 //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 221 222 Language languages[] = myCategorizer.predictLanguages(doubleSentence); 223 // languages array already sorted in order of descending confidence 224 LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES]; 225 226 for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) { 227 String langCode = languages[j].getLang(); 228 double confidence = languages[j].getConfidence(); 229 languagesInfo[j] = new LanguageInfo(confidence, langCode); 230 } 231 sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo)); 232 205 233 } 206 234 … … 305 333 text.append(line + "\n"); // add back (unix style) line ending 306 334 } 335 307 336 return isTextInLanguage(langCode, text.toString()); 308 337 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java
r33653 r33674 1 1 package org.greenstone.atea.morphia; 2 3 import java.util.Map; 4 import java.util.HashMap; 2 5 3 6 import dev.morphia.annotations.*; 4 7 8 9 @Entity("Sentences") 10 public class SentenceInfo { 11 12 public final String sentence; 13 public final Map<String, Double> languageToConfidenceMap; 14 @Embedded 15 public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 16 17 18 public SentenceInfo(String sentence, LanguageInfo[] languages) { 19 this.sentence = sentence; 20 this.languagesInfo = languages; 21 22 // let's store (langCode -> confidence) lookup in Map: 23 this.languageToConfidenceMap = new HashMap<String, Double>(); 24 for(LanguageInfo li : languages) { 25 String langCode = li.langCode; 26 Double confidence = new Double(li.confidenceLevel); 27 languageToConfidenceMap.put(langCode, confidence); 28 } 29 } 30 31 } 32 33 // BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE: 34 /* 5 35 @Entity("Sentences") 6 36 public class SentenceInfo { 7 37 public final double confidenceLevel; 8 / ** 3 letter lang code */38 // 3 letter lang code 9 39 public final String langCode; 10 40 public final String sentence; … … 16 46 } 17 47 } 48 */
Note:
See TracChangeset
for help on using the changeset viewer.