Changeset 33651
- Timestamp:
- 2019-11-12T18:11:39+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
- 3 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java
r33645 r33651 180 180 https://stackoverflow.com/questions/39433775/mongodb-java-inserting-throws-org-bson-codecs-configuration-codecconfigurationex 181 181 */ 182 /** 183 * Inserts a web page into the mongodb. Besides page related metadata and full body text 184 * the language information per sentence and per 2 adjacent sentences also get stored 185 * into the mongodb. 186 */ 182 187 public void insertWebpageInfo(WebpageInfo webpage) 183 188 { 189 int mri_sentence_count = 0; 190 184 191 // load the webpages db 'table' 185 192 // in mongodb, the equivalent of db tables are called 'collections' … … 195 202 .append("fetchTime", webpage.fetchTime); 196 203 197 // DOESN'T WORK, AS EXPECTED, BUT DIDN'T KNOW HOW TO DO IT:198 //document.put("singleSentences", webpage.singleSentences);199 //document.put("overlappingSentences", webpage.overlappingSentences);200 201 204 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER: 202 205 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java 203 206 List<BasicDBObject> sentencesList = new ArrayList<>(); 204 207 for(SentenceInfo sentenceInfo : webpage.singleSentences) { 205 //sentencesList.add(new BasicDBObject("langCode", sentenceInfo.langCode)); 206 //sentencesList.add(new BasicDBObject("confidence", sentenceInfo.confidenceLevel)); 207 //sentencesList.add(new BasicDBObject("sentence", sentenceInfo.sentence)); 208 208 209 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); 210 209 211 bsonRecord.put("confidence", sentenceInfo.confidenceLevel); 210 212 bsonRecord.put("sentence", sentenceInfo.sentence); 211 213 212 214 sentencesList.add(bsonRecord); 215 216 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) { 217 mri_sentence_count++; 218 } 219 213 220 } 214 221 document.put("singleSentences", sentencesList); … … 216 223 List<BasicDBObject> overlappingSentencesList = new ArrayList<>(); 217 224 for(SentenceInfo sentenceInfo : 
webpage.overlappingSentences) { 225 218 226 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode); 219 227 bsonRecord.put("confidence", sentenceInfo.confidenceLevel); 220 228 bsonRecord.put("sentence", sentenceInfo.sentence); 221 222 sentencesList.add(bsonRecord);229 230 overlappingSentencesList.add(bsonRecord); 223 231 } 224 232 document.put("overlappingSentences", overlappingSentencesList); … … 226 234 // also put the full text in there 227 235 document.put("text", webpage.text); 236 237 // also store the count of sentences in MRI 238 webpage.setMRISentenceCount(mri_sentence_count); 239 document.put("mriSentenceCount", mri_sentence_count); 240 228 241 229 242 collection.insertOne(document); -
other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java
r33634 r33651 184 184 for(int i = 1; i < sentences.length; i++) { 185 185 // glue every two adjacent sentences together 186 String sentence = sentences[i-1];186 String doubleSentence = sentences[i-1]; 187 187 188 188 String separator = ". "; 189 189 // if the sentence already ends with a terminating punctuation character, 190 190 // then separator is just a space 191 sentence = sentence.trim();192 if( sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {191 doubleSentence = doubleSentence.trim(); 192 if(doubleSentence.endsWith(".") || doubleSentence.endsWith("?") || doubleSentence.endsWith("!")) { 193 193 separator = " "; 194 194 } 195 sentence = sentence + separator + sentences[i];195 doubleSentence = doubleSentence + separator + sentences[i]; 196 196 197 197 //System.err.println(sentence); 198 198 199 Language bestLanguage = myCategorizer.predictLanguage( sentence);199 Language bestLanguage = myCategorizer.predictLanguage(doubleSentence); 200 200 double confidence = bestLanguage.getConfidence(); 201 201 202 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));202 sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence)); 203 203 } 204 204 -
other-projects/maori-lang-detection/src/org/greenstone/atea/WebpageInfo.java
r33634 r33651 5 5 public class WebpageInfo { 6 6 7 private int mriSentenceCount; 8 7 9 /** db table ids */ 8 10 public final long webpageID; … … 13 15 public final String text; 14 16 public final String URL; 15 public final boolean isMRI; 16 17 public final boolean isMRI; 18 17 19 public final String charEncoding; 18 20 public final String modifiedTime; … … 45 47 46 48 } 49 50 public void setMRISentenceCount(int count) { 51 this.mriSentenceCount = count; 52 } 53 54 public int getMRISentenceCount() { return this.mriSentenceCount; } 47 55 }
Note: See TracChangeset for help on using the changeset viewer.