source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33633

Last change on this file since 33633 was 33633, checked in by ak19, 4 years ago
  1. TextLanguageDetector now has methods for collecting all sentences and all overlapping sentences. 2. Renaming NutchTextDumpProcessor.java to NutchTextDumpToCSV.java, since there a new class, NutchTextDumpToMongoDB.java that needs slightly different data structures.
File size: 16.7 KB
Line 
/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in a given
 * language or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 *
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 *
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */
21
22package org.greenstone.atea;
23
24import java.io.*;
25import opennlp.tools.langdetect.*;
26import opennlp.tools.sentdetect.*;
27import opennlp.tools.util.*;
28
29import java.util.ArrayList;
30
31/**
32 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
33 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
34 * (which is the language detection model zipped up and renamed to .bin extension).
35 *
36 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
37 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
38 *
39 * Only the subclass MaoriTextDetector.java has a main method at present that can be run.
40 *
41 */
42public class TextLanguageDetector {
43
44 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
45
46 /**
47 * Configurable: cut off minimum confidence value,
48 * greater or equal to which determines that the best predicted language is
49 * acceptable to user of TextLanguageDetector.
50 */
51 public final double MINIMUM_CONFIDENCE;
52
53 /** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */
54 public final boolean silentMode;
55
56 private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;
57
58 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
59 private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";
60
61 /**
62 * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
63 * Created once in the constructor, can be used as often as needed thereafter.
64 */
65 private LanguageDetector myCategorizer = null;
66
67 /**
68 * The Sentence Detection object that does the sentence splitting for the language
69 * the sentece model was trained for.
70 */
71 private SentenceDetectorME sentenceDetector = null;
72
73
74 /** Constructor with default confidence for language detection.
75 * Does not create sentence model, just the language detection model.
76 */
77 public TextLanguageDetector(boolean silentMode) throws Exception {
78 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
79 }
80
81 /** Constructor with configurable min_confidence for language detection
82 * Does not create sentence model, just the language detection model.
83 */
84 public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
85 this.silentMode = silentMode;
86 this.MINIMUM_CONFIDENCE = min_confidence;
87
88 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
89 String langDetectModelPath = System.getenv("OPENNLP_HOME");
90 if(System.getenv("OPENNLP_HOME") == null) {
91 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
92 }
93 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
94 File langDetectModelBinFile = new File(langDetectModelPath);
95 if(!langDetectModelBinFile.exists()) {
96 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
97 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
98 + "\n\t*** with the model file 'langdetect-183.bin' in it.");
99 }
100
101
102 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
103 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
104 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
105 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
106
107 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
108
109 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
110 this.myCategorizer = new LanguageDetectorME(model);
111 }/*catch(Exception e) {
112 e.printStackTrace();
113 }*/
114
115 // instantiating function should handle critical exceptions. Constructors shouldn't.
116
117 }
118
119 /** More general constructor that additionally can load up the sentence detector model
120 * for other languages, as long as the provided trained sentence model .bin file exists
121 * in the OPENNLP_MODELS_RELATIVE_PATH folder. */
122 public TextLanguageDetector(boolean silentMode, double min_confidence,
123 String sentenceModelFileName) throws Exception
124 {
125 this(silentMode, min_confidence);
126
127 // 3. Set up our sentence model and SentenceDetector object
128 String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
129 + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
130 File sentenceModelBinFile = new File(sentenceModelPath);
131 if(!sentenceModelBinFile.exists()) {
132 throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
133 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
134 + "\n\t*** with the model file "+sentenceModelFileName+" in it.");
135 }
136 try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
137 // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
138 SentenceModel sentenceModel = new SentenceModel(modelIn);
139 this.sentenceDetector = new SentenceDetectorME(sentenceModel);
140
141 } // instantiating function should handle this critical exception
142 }
143
144 /** inner class */
145 public class SentenceInfo {
146 public final double confidenceLevel;
147 /** 3 letter lang code */
148 public final String langCode;
149 public final String sentence;
150
151 public SentenceInfo(double confidence, String langCode, String sentence) {
152 confidenceLevel = confidence;
153 this.langCode = langCode;
154 this.sentence = sentence;
155 }
156 }
157
158 /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
159 * into sentences? What if the text in any other language or a mix of languages?
160 * Doesn't this assume that all languages split sentences alike? */
161 public String[] getAllSentences(String text) {
162
163 // This function doesn't work if the sentenceDetector object wasn't set up
164 if(sentenceDetector == null) return null;
165
166 String[] sentences = sentenceDetector.sentDetect(text);
167 return sentences;
168 }
169
170 public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {
171
172 if(sentences == null) {
173 return null;
174 }
175
176 ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
177 for(int i = 0; i < sentences.length; i++) {
178 String sentence = sentences[i];
179
180 //System.err.println(sentence);
181
182 Language bestLanguage = myCategorizer.predictLanguage(sentence);
183 double confidence = bestLanguage.getConfidence();
184
185 sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence));
186 }
187
188 return sentencesList;
189 }
190
191 public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {
192
193 if(sentences == null) {
194 return null;
195 }
196
197 ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
198 for(int i = 1; i < sentences.length; i++) {
199 // glue every two adjacent sentences together
200 String sentence = sentences[i-1];
201
202 String separator = ". ";
203 // if the sentence already ends with a terminating punctuation character,
204 // then separator is just a space
205 sentence = sentence.trim();
206 if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
207 separator = " ";
208 }
209 sentence = sentence + separator + sentence[i];
210
211 //System.err.println(sentence);
212
213 Language bestLanguage = myCategorizer.predictLanguage(sentence);
214 double confidence = bestLanguage.getConfidence();
215
216 sentencesList.add(new SentenceInfo(confidence, bestLanguage, sentence));
217 }
218
219 return sentencesList;
220 }
221
222 /**
223 * In this class' constructor, need to have set up the Sentence Detection Model
224 * for the langCode passed in to this function in order for the output to make
225 * sense for that language.
226 * Function that takes a text and returns those sentences in the requested language.
227 * @param text: the string of text from which sentences in the requested
228 * language are to be identified and returned.
229 * @param langCode: 3 letter code of requested language
230 * @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected
231 * even if the language detector determined the requested language as the primary one
232 * for that sentence. The confidence cutoff provides an additional check.
233 * @return null if no Sentence Detection Model set up in constructor
234 * else returns an ArrayList where:
235 * - the first element is the total number of sentences in the text parameter
236 * - remaining elements are the sentences in the text parameter that were in the
237 * requested language.
238 */
239 public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
240 {
241 // big assumption here: that we can split incoming text into sentences
242 // for any language using the sentence model trained for a given language (that of
243 // langCode), despite not knowing what language each sentence in the text param are in.
244 // Hinges on sentence detection in langCode being similar to all others?
245
246
247 // This function doesn't work if the sentenceDetector object wasn't set up
248 if(sentenceDetector == null) return null;
249
250 // we'll be storing just those sentences in text that are in the denoted language code
251 ArrayList<String> sentencesInLang = new ArrayList<String>();
252 // OpenNLP language detection works best with a minimum of 2 sentences
253 // See https://opennlp.apache.org/news/model-langdetect-183.html
254 // "It is important to note that this model is trained for and works well with
255 // longer texts that have at least 2 sentences or more from the same language."
256
257 // For evaluating single languages, I used a very small data set and found that
258 // if the primary language detected is MRI AND if the confidence is >= 0.1, the
259 // results appear reasonably to be in te reo Māori.
260
261 String[] sentences = sentenceDetector.sentDetect(text);
262 if(sentences == null) {
263 sentencesInLang.add("0"); // to indicate 0 sentences in requested language
264 return sentencesInLang;
265 }
266
267 // add in first element: how many sentences there were in text.
268 sentencesInLang.add(Integer.toString(sentences.length));
269
270 for(int i = 0; i < sentences.length; i++) {
271 String sentence = sentences[i];
272
273 //System.err.println(sentence);
274
275 Language bestLanguage = myCategorizer.predictLanguage(sentence);
276 double confidence = bestLanguage.getConfidence();
277
278 if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
279 //System.err.println("Adding sentence: " + sentence + "\n");
280 sentencesInLang.add(sentence);
281 } //else {
282 //System.err.println("SKIPPING sentence: " + sentence + "\n");
283 //}
284 }
285 return sentencesInLang;
286 }
287
288
289 /** @param langCode is 3 letter language code, ISO 639-2/3
290 * https://www.loc.gov/standards/iso639-2/php/code_list.php
291 * https://en.wikipedia.org/wiki/ISO_639-3
292 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
293 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
294 */
295 public boolean isTextInLanguage(String langCode, String text) {
296 // Get the most probable language
297 Language bestLanguage = myCategorizer.predictLanguage(text);
298 doPrint("Best language: " + bestLanguage.getLang());
299 doPrint("Best language confidence: " + bestLanguage.getConfidence());
300
301 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
302 }
303
304
305 /**
306 * Handle "smaller" textfiles/streams of text read in.
307 * Return value is the same as for isTextInLanguage(String langCode, String text);
308 */
309 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
310 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
311
312 StringBuilder text = new StringBuilder();
313 String line = null;
314
315
316 while((line = reader.readLine()) != null) { // readLine removes newline separator
317 text.append(line + "\n"); // add back (unix style) line ending
318 }
319 return isTextInLanguage(langCode, text.toString());
320 }
321
322
323 /**
324 * Rudimentary attempt to deal with very large files.
325 * Return value is the same as for isTextInLanguage(String langCode, String text);
326 */
327 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
328 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
329
330 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
331
332 StringBuilder text = new StringBuilder();
333 String line = null;
334
335 double cumulativeConfidence = 0;
336 int numLoops = 0;
337
338 int i = 0;
339 String language = null;
340
341 while((line = reader.readLine()) != null) { // readLine removes newline separator
342 text.append(line + "\n"); // add back (unix style) line ending
343
344 i++; // read nth line of numLoop
345
346
347 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
348
349 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
350 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
351 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
352 }
353 language = bestLanguage.getLang();
354 cumulativeConfidence += bestLanguage.getConfidence();
355
356 doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
357
358 // finished analysing language of NUM_LINES of text
359 text = new StringBuilder();
360 i = 0;
361 numLoops++;
362 }
363 }
364
365 // process any (remaining) text that was less than n NUM_LINES
366 if(!text.toString().equals("")) {
367 text.append(line + "\n"); // add back (unix style) line ending
368 i++;
369
370 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
371
372 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
373 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
374 }
375 language = bestLanguage.getLang();
376 cumulativeConfidence += bestLanguage.getConfidence();
377 doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
378 }
379
380
381 int totalLinesRead = numLoops * NUM_LINES + i; // not used
382 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
383
384
385 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
386 }
387
388
389 /**
390 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
391 * UNUSED.
392 */
393 public void predictedLanguages(String text) {
394 // Get an array with the most probable languages
395
396 Language[] languages = myCategorizer.predictLanguages(text);
397
398 if(languages == null || languages.length <= 0) {
399 doPrintErr("No languages predicted for the input text");
400 } else {
401 for(int i = 0; i < languages.length; i++) {
402 doPrint("Language prediction " + i + ": " + languages[i]);
403 }
404 }
405
406 }
407
408 public void doPrint(String msg) {
409 if(!this.silentMode) System.out.println(msg);
410 }
411 public void doPrintErr(String msg) {
412 if(!this.silentMode) System.err.println(msg);
413 }
414
415}
Note: See TracBrowser for help on using the repository browser.