Changeset 33698 for other-projects
- Timestamp:
- 2019-11-15T23:14:48+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/MoreReading/mongodb.txt
r33675 r33698 346 346 https://docs.mongodb.com/manual/reference/method/db.collection.find/ 347 347 https://docs.mongodb.com/manual/reference/method/db.collection.find/#find-projection 348 https://stackoverflow.com/questions/39641925/mongodb-aggregation-framework-to-get-frequencies-of-fields-values 349 350 https://exploratory.io/note/kanaugust/0961813761939766 351 https://docs.mongodb.com/manual/tutorial/project-fields-from-query-results/ 352 https://docs.mongodb.com/manual/aggregation/ 353 348 354 349 355 Mongo Studio 3T documentation: … … 407 413 408 414 409 415 # PROJECTION: 416 db.getCollection('Websites').find({geoLocationCountryCode: {$ne:"nz"}}, {geoLocationCountryCode:1, urlContainsLangCodeInpath: 1}) 417 418 https://docs.mongodb.com/manual/aggregation/ 419 db.orders.aggregate([ 420 { $match: { status: "A" } }, 421 { $group: { _id: "$cust_id", total: { $sum: "$amount" } } } 422 ]) 423 db.Websites.aggregate({ $match:{urlContainsLangCodeInpath:true}}, $group: {geoLocationCountryCode:1, total: $count}) 424 425 426 AIMS: 410 427 * Identify where Maori language is online. 411 428 * How can we identify high quality sites that would be good for a corpus. 412 429 (Related work for other languages to quantifiably answer that) 413 430 431 432 433 434 data-preparation 435 docs 436 -
other-projects/maori-lang-detection/src/org/greenstone/atea/NutchTextDumpToMongoDB.java
r33674 r33698 81 81 /** keep a list to store the text of each page */ 82 82 private ArrayList<TextDumpPage> pages; 83 84 85 86 /** Number of language and confidence results to return for storing in MongoDB 87 * MongoDB runs out of space if storing too many, as we store this info per sentence 88 * and a long text document becomes a very large MongoDB document presumably*/ 89 private static final int NUM_TOP_LANGUAGES = 3; // 103 max, in current version of opennlp lang model 90 83 91 84 92 private boolean isStartOfNewWebPageRecord(String prevLine, String line) { … … 232 240 int totalSentences = sentences.length; 233 241 int numSentencesInMRI = 0; 234 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences );235 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences );242 ArrayList<SentenceInfo> singleSentences = maoriTxtDetector.getAllSentencesInfo(sentences, NUM_TOP_LANGUAGES); 243 ArrayList<SentenceInfo> overlappingSentences = maoriTxtDetector.getAllOverlappingSentencesInfo(sentences, NUM_TOP_LANGUAGES); 236 244 237 245 WebpageInfo webpage = page.convertStoredDataToWebpageInfo(WEBPAGE_COUNTER/*new ObjectId()*/, … … 242 250 overlappingSentences); 243 251 252 244 253 for(SentenceInfo si : singleSentences) { 245 LanguageInfo bestLanguage = si.languagesInfo[0]; 246 if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 254 //LanguageInfo bestLanguage = si.languagesInfo[0]; 255 //if(bestLanguage.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 256 if(si.bestLangCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 247 257 numSentencesInMRI++; 248 258 } 249 259 } 260 261 250 262 webpage.setMRISentenceCount(numSentencesInMRI); 251 263 webpage.setContainsMRI((numSentencesInMRI > 0)); -
other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33674 r33698 53 53 */ 54 54 public final double MINIMUM_CONFIDENCE; 55 56 /** Number of language and confidence results to return for storing in MongoDB57 * MongoDB runs out of space if storing too many, as we store this info per sentence58 * and a long text document becomes a very large MongoDB document presumable*/59 public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model60 55 61 56 /** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */ … … 162 157 } 163 158 164 public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences ) {159 public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) { 165 160 166 161 if(sentences == null) { … … 194 189 } 195 190 196 public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences ) {191 public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) { 197 192 198 193 if(sentences == null) { -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/LanguageInfo.java
r33674 r33698 5 5 @Entity("Languages") 6 6 public class LanguageInfo { 7 public final double confidenceLevel; 7 public final double confidence; 8 8 /** 3 letter lang code */ 9 9 public final String langCode; 10 10 public LanguageInfo(double confidence, String langCode) { 11 this.confidenceLevel = confidence; 11 this.confidence = confidence; 12 12 this.langCode = langCode; 13 13 } -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/SentenceInfo.java
r33674 r33698 4 4 import java.util.HashMap; 5 5 6 // morphia is used to map Java objects to mongodb 6 7 import dev.morphia.annotations.*; 7 8 … … 11 12 12 13 public final String sentence; 13 public final Map<String, Double> languageToConfidenceMap; 14 @Embedded 15 public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 16 14 //public final Map<String, Double> languageToConfidenceMap; 15 public final String bestLangCode; 16 public final double bestLangConfidence; 17 //@Embedded 18 //public final LanguageInfo[] languagesInfo; // array of langCode and confidence value pairs 19 17 20 18 21 public SentenceInfo(String sentence, LanguageInfo[] languages) { 19 this.sentence = sentence; 20 this.languagesInfo = languages; 22 this.sentence = sentence; 23 if(languages.length >= 1) { 24 this.bestLangCode = languages[0].langCode; 25 this.bestLangConfidence = languages[0].confidence; 26 } else { 27 this.bestLangCode = null; 28 this.bestLangConfidence = 0.0; 29 } 21 30 31 //this.languagesInfo = null; 32 //this.languagesInfo = languages; 33 34 22 35 // let's store (langCode -> confidence) lookup in Map: 23 this.languageToConfidenceMap = new HashMap<String, Double>(); 36 //this.languageToConfidenceMap = new HashMap<String, Double>(); 37 /* 24 38 for(LanguageInfo li : languages) { 25 39 String langCode = li.langCode; 26 Double confidence = new Double(li.confidence Level);40 Double confidence = new Double(li.confidence); 27 41 languageToConfidenceMap.put(langCode, confidence); 28 42 } 43 */ 29 44 } 30 45 31 46 } 32 47 33 // BACK WHEN WE ONLY STORED THE BEST PREDICTED LANGUAGE META FOR EACH SENTENCE:34 /*35 @Entity("Sentences")36 public class SentenceInfo {37 public final double confidenceLevel;38 // 3 letter lang code39 public final String langCode;40 public final String sentence;41 42 public SentenceInfo(double confidence, String langCode, String sentence) {43 this.confidenceLevel = confidence;44 this.langCode = langCode;45 this.sentence = 
sentence;46 }47 }48 */ -
other-projects/maori-lang-detection/src/org/greenstone/atea/morphia/WebsiteInfo.java
r33653 r33698 19 19 20 20 public final String geoLocationCountryCode; 21 public final boolean urlContainsLangCodeInpath; 21 public final boolean urlContainsLangCodeInPath; 22 22 23 23 public WebsiteInfo(/*int siteCount,*/ String siteFolderName, String domainOfSite, 24 24 int totalPages, int countOfWebPagesWithBodyText, int numPagesInMRI, 25 25 long siteCrawledTimestamp, boolean siteCrawlUnfinished, boolean redoCrawl, 26 String geoLocationCountryCode, boolean urlContainsLangCodeInpath) 26 String geoLocationCountryCode, boolean urlContainsLangCodeInPath) 27 27 { 28 28 //this.id = siteCount; … … 39 39 40 40 this.geoLocationCountryCode = geoLocationCountryCode; 41 this.urlContainsLangCodeInpath = urlContainsLangCodeInpath; 41 this.urlContainsLangCodeInPath = urlContainsLangCodeInPath; 42 42 } 43 43 }
Note:
See TracChangeset
for help on using the changeset viewer.