source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33674

Last change on this file since 33674 was 33674, checked in by ak19, 4 years ago

Changes to support the top 5 predicted langcodes and their confidence values per sentence/overlapping sentence (storing all 103 made some documents, such as those of site 00006, too big to go into MongoDB). Have re-run NutchTextDumpToMongDB to send the new form of the docs into MongoDB.

File size: 18.0 KB
/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in a given
 * language or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 *
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 *
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */

package org.greenstone.atea;

import java.io.*;
import opennlp.tools.langdetect.*;
import opennlp.tools.sentdetect.*;
import opennlp.tools.util.*;

import java.util.ArrayList;

import org.greenstone.atea.morphia.SentenceInfo;
import org.greenstone.atea.morphia.LanguageInfo;

/**
 * Export the OPENNLP_HOME environment variable to point to your Apache OpenNLP installation.
 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
 * (which is the language detection model zipped up and renamed to the .bin extension).
 *
 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
 *
 * At present, only the subclass MaoriTextDetector.java has a main method that can be run.
 * (See also the usage sketch in the comment following the constructors below.)
 *
 */
public class TextLanguageDetector {

    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;

    /**
     * Configurable cut-off: the minimum confidence value; a prediction with confidence
     * greater than or equal to this means the best predicted language is acceptable
     * to the user of TextLanguageDetector.
     */
    public final double MINIMUM_CONFIDENCE;

    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if too many are stored, as we store this info per sentence
     * and a long text document then presumably becomes a very large MongoDB document. */
    public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model

    /** silentMode set to true means TextLanguageDetector won't print helpful messages while running. Set to false to see them. */
    public final boolean silentMode;

    private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;

    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
    private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";

    /**
     * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
     * Created once in the constructor, it can be used as often as needed thereafter.
     */
    private LanguageDetector myCategorizer = null;

    /**
     * The Sentence Detection object that does the sentence splitting for the language
     * the sentence model was trained for.
     */
    private SentenceDetectorME sentenceDetector = null;


    /** Constructor with the default confidence for language detection.
     * Does not create the sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode) throws Exception {
        this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
    }

    /** Constructor with configurable min_confidence for language detection
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
        this.silentMode = silentMode;
        this.MINIMUM_CONFIDENCE = min_confidence;

        // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
        String langDetectModelPath = System.getenv("OPENNLP_HOME");
        if(langDetectModelPath == null) {
            throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
        }
        langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
        File langDetectModelBinFile = new File(langDetectModelPath);
        if(!langDetectModelBinFile.exists()) {
            throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file 'langdetect-183.bin' in it.");
        }


        // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
        try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {

            LanguageDetectorModel model = new LanguageDetectorModel(modelIn);

            // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
            this.myCategorizer = new LanguageDetectorME(model);
        } /*catch(Exception e) {
            e.printStackTrace();
        }*/

        // instantiating function should handle critical exceptions. Constructors shouldn't.

    }

    /** More general constructor that additionally can load up the sentence detector model
     * for other languages, as long as the provided trained sentence model .bin file exists
     * in the OPENNLP_MODELS_RELATIVE_PATH folder. */
    public TextLanguageDetector(boolean silentMode, double min_confidence,
                                String sentenceModelFileName) throws Exception
    {
        this(silentMode, min_confidence);

        // 3. Set up our sentence model and SentenceDetector object
        String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
            + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
        File sentenceModelBinFile = new File(sentenceModelPath);
        if(!sentenceModelBinFile.exists()) {
            throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file " + sentenceModelFileName + " in it.");
        }
        try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
            // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
            SentenceModel sentenceModel = new SentenceModel(modelIn);
            this.sentenceDetector = new SentenceDetectorME(sentenceModel);

        } // instantiating function should handle this critical exception
    }
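
    /* A minimal usage sketch from a caller's point of view. The variable someText is a
     * placeholder, and the sentence model file name is the "mri-sent_trained.bin" default
     * mentioned in the constructor above; adjust both to your setup:
     *
     *    TextLanguageDetector detector =
     *        new TextLanguageDetector(false, TextLanguageDetector.DEFAULT_MINIMUM_CONFIDENCE,
     *                                 "mri-sent_trained.bin");
     *    String[] sentences = detector.getAllSentences(someText);
     *    ArrayList<SentenceInfo> sentencesInfo = detector.getAllSentencesInfo(sentences);
     *    boolean isMaori = detector.isTextInLanguage("mri", someText);
     */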

    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
     * into sentences? What if the text is in another language or a mix of languages?
     * Doesn't this assume that all languages split sentences alike? */
    public String[] getAllSentences(String text) {

        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        String[] sentences = sentenceDetector.sentDetect(text);
        return sentences;
    }

    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            //Language bestLanguage = myCategorizer.predictLanguage(sentence);
            //double confidence = bestLanguage.getConfidence();
            //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));

            Language[] languages = myCategorizer.predictLanguages(sentence);
            // languages array already sorted in order of descending confidence
            LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
            for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
                String langCode = languages[j].getLang();
                double confidence = languages[j].getConfidence();
                languagesInfo[j] = new LanguageInfo(confidence, langCode);
            }

            sentencesList.add(new SentenceInfo(sentence, languagesInfo));

        }

        return sentencesList;
    }

    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 1; i < sentences.length; i++) {
            // glue every two adjacent sentences together
            String doubleSentence = sentences[i-1];

            String separator = ". ";
            // if the sentence already ends with a terminating punctuation character,
            // then the separator is just a space
            doubleSentence = doubleSentence.trim();
            if(doubleSentence.endsWith(".") || doubleSentence.endsWith("?") || doubleSentence.endsWith("!")) {
                separator = " ";
            }
            doubleSentence = doubleSentence + separator + sentences[i];

            //System.err.println(doubleSentence);

            //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
            //double confidence = bestLanguage.getConfidence();
            //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));

            Language[] languages = myCategorizer.predictLanguages(doubleSentence);
            // languages array already sorted in order of descending confidence
            LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];

            for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
                String langCode = languages[j].getLang();
                double confidence = languages[j].getConfidence();
                languagesInfo[j] = new LanguageInfo(confidence, langCode);
            }
            sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo));

        }

        return sentencesList;
    }

    /**
     * Function that takes a text and returns those of its sentences that are in the requested language.
     * The Sentence Detection Model for the langCode passed in to this function needs to have been
     * set up in this class' constructor for the output to make sense for that language.
     * @param text: the string of text from which sentences in the requested
     * language are to be identified and returned.
     * @param langCode: 3 letter code of the requested language
     * @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected,
     * even when the language detector has already determined the requested language to be the primary one
     * for that sentence. The confidence cutoff provides an additional check.
     * @return null if no Sentence Detection Model was set up in the constructor,
     * else returns an ArrayList where:
     * - the first element is the total number of sentences in the text parameter
     * - remaining elements are the sentences in the text parameter that were in the
     * requested language.
     */
    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
    {
        // big assumption here: that we can split the incoming text into sentences
        // for any language using the sentence model trained for a given language (that of
        // langCode), despite not knowing what language each sentence in the text param is in.
        // Hinges on sentence detection in langCode being similar to all others?


        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        // we'll be storing just those sentences in text that are in the denoted language code
        ArrayList<String> sentencesInLang = new ArrayList<String>();
        // OpenNLP language detection works best with a minimum of 2 sentences
        // See https://opennlp.apache.org/news/model-langdetect-183.html
        // "It is important to note that this model is trained for and works well with
        // longer texts that have at least 2 sentences or more from the same language."

        // For evaluating single languages, I used a very small data set and found that
        // if the primary language detected is MRI AND the confidence is >= 0.1, the
        // results reasonably appear to be in te reo Māori.

        String[] sentences = sentenceDetector.sentDetect(text);
        if(sentences == null) {
            sentencesInLang.add("0"); // to indicate 0 sentences in the requested language
            return sentencesInLang;
        }

        // add in the first element: how many sentences there were in text.
        sentencesInLang.add(Integer.toString(sentences.length));

        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
                //System.err.println("Adding sentence: " + sentence + "\n");
                sentencesInLang.add(sentence);
            } //else {
                //System.err.println("SKIPPING sentence: " + sentence + "\n");
            //}
        }
        return sentencesInLang;
    }
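
    /* Sketch of consuming getAllSentencesInLanguage()'s return value, assuming the detector
     * was constructed with a sentence model (otherwise the method returns null); text and
     * cutoff are placeholder variables:
     *
     *    ArrayList<String> result = detector.getAllSentencesInLanguage("mri", text, cutoff);
     *    if(result != null) {
     *        int totalSentences = Integer.parseInt(result.get(0)); // first element: sentence count
     *        int numMriSentences = result.size() - 1;              // rest: sentences detected as mri
     *    }
     */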


    /** @param langCode is the 3 letter language code, ISO 639-2/3
     * https://www.loc.gov/standards/iso639-2/php/code_list.php
     * https://en.wikipedia.org/wiki/ISO_639-3
     * @return true if the input text is in the language denoted by langCode (e.g. mri for Maori)
     * with MINIMUM_CONFIDENCE level of confidence (if set, else DEFAULT_MINIMUM_CONFIDENCE level of confidence).
     */
    public boolean isTextInLanguage(String langCode, String text) {
        // Get the most probable language
        Language bestLanguage = myCategorizer.predictLanguage(text);
        doPrint("Best language: " + bestLanguage.getLang());
        doPrint("Best language confidence: " + bestLanguage.getConfidence());

        return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Handle "smaller" textfiles/streams of text read in.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        StringBuilder text = new StringBuilder();
        String line = null;


        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending
        }

        return isTextInLanguage(langCode, text.toString());
    }
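
    /* Sketch of calling the BufferedReader overload on a plain-text file; filePath is a
     * placeholder for the file to check:
     *
     *    try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
     *        boolean inMaori = detector.isTextInLanguage("mri", reader);
     *    }
     */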


    /**
     * Rudimentary attempt to deal with very large files.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        final int NUM_LINES = 100; // arbitrary: read 100 lines, predict the language, calculate the confidence

        StringBuilder text = new StringBuilder();
        String line = null;

        double cumulativeConfidence = 0;
        int numLoops = 0;

        int i = 0;
        String language = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending

            i++; // count of lines read in the current block of NUM_LINES


            if(i == NUM_LINES) { // arbitrary 100 lines read: predict the language, calculate the confidence

                Language bestLanguage = myCategorizer.predictLanguage(text.toString());
                if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
                    doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
                }
                language = bestLanguage.getLang();
                cumulativeConfidence += bestLanguage.getConfidence();

                doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");

                // finished analysing the language of NUM_LINES of text
                text = new StringBuilder();
                i = 0;
                numLoops++;
            }
        }

        // process any (remaining) text that was fewer than NUM_LINES lines
        if(!text.toString().equals("")) {
            // the remaining i lines were already appended to text inside the loop above

            Language bestLanguage = myCategorizer.predictLanguage(text.toString());

            if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
                doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
            }
            language = bestLanguage.getLang();
            cumulativeConfidence += bestLanguage.getConfidence();
            doPrintErr("Best predicted language for the final " + i + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");
        }


        int totalLinesRead = numLoops * NUM_LINES + i; // not used
        double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average, as the text processed outside the loop may have fewer lines than NUM_LINES


        return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
     * UNUSED.
     */
    public void predictedLanguages(String text) {
        // Get an array with the most probable languages

        Language[] languages = myCategorizer.predictLanguages(text);

        if(languages == null || languages.length <= 0) {
            doPrintErr("No languages predicted for the input text");
        } else {
            for(int i = 0; i < languages.length; i++) {
                doPrint("Language prediction " + i + ": " + languages[i]);
            }
        }

    }

    public void doPrint(String msg) {
        if(!this.silentMode) System.out.println(msg);
    }
    public void doPrintErr(String msg) {
        if(!this.silentMode) System.err.println(msg);
    }

}