Context Navigation

TextLanguageDetector.java@ 33790

Last change on this file since 33790 was 33698, checked in by ak19, 5 years ago
Links to more reading
File size: 17.7 KB

Line
1	/**
2	* Class that uses OpenNLP with the Language Detection Model to determine, with a default
3	* or configurable level of confidence, whether text (from a file or stdin) is in a given
4	* language or not.
5	* Internal functions can be used for detecting any of the 103 languages currently supported by
6	* the OpenNLP Language Detection Model.
7	*
8	* http://opennlp.apache.org/news/model-langdetect-183.html
9	* language detector model: http://opennlp.apache.org/models.html
10	* Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
11	* Use of Apache OpenNLP in general:
12	* http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
13	* Use of OpenNLP for language detection:
14	* http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
15	*
16	* This code was based on the information and sample code at the above links and the links dispersed throughout this file.
17	* See also the accompanying README file.
18	*
19	* July 2019
20	*/
21
22	package org.greenstone.atea;
23
24	import java.io.*;
25	import opennlp.tools.langdetect.*;
26	import opennlp.tools.sentdetect.*;
27	import opennlp.tools.util.*;
28
29	import java.util.ArrayList;
30
31	import org.greenstone.atea.morphia.SentenceInfo;
32	import org.greenstone.atea.morphia.LanguageInfo;
33
34	/**
35	* EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
36	* Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
37	* (which is the language detection model zipped up and renamed to .bin extension).
38	*
39	* Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
40	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
41	*
42	* Only the subclass MaoriTextDetector.java has a main method at present that can be run.
43	*
44	*/
45	public class TextLanguageDetector {
46
47	public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
48
49	/**
50	* Configurable: cut off minimum confidence value,
51	* greater or equal to which determines that the best predicted language is
52	* acceptable to user of TextLanguageDetector.
53	*/
54	public final double MINIMUM_CONFIDENCE;
55
56	/** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */
57	public final boolean silentMode;
58
59	private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;
60
61	/** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
62	private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";
63
64	/**
65	* The LanguageDetectorModel object that will do the actual language detection/prediction for us.
66	* Created once in the constructor, can be used as often as needed thereafter.
67	*/
68	private LanguageDetector myCategorizer = null;
69
70	/**
71	* The Sentence Detection object that does the sentence splitting for the language
72	* the sentece model was trained for.
73	*/
74	private SentenceDetectorME sentenceDetector = null;
75
76
77	/** Constructor with default confidence for language detection.
78	* Does not create sentence model, just the language detection model.
79	*/
80	public TextLanguageDetector(boolean silentMode) throws Exception {
81	this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
82	}
83
84	/** Constructor with configurable min_confidence for language detection
85	* Does not create sentence model, just the language detection model.
86	*/
87	public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
88	this.silentMode = silentMode;
89	this.MINIMUM_CONFIDENCE = min_confidence;
90
91	// 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
92	String langDetectModelPath = System.getenv("OPENNLP_HOME");
93	if(System.getenv("OPENNLP_HOME") == null) {
94	throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
95	}
96	langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
97	File langDetectModelBinFile = new File(langDetectModelPath);
98	if(!langDetectModelBinFile.exists()) {
99	throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
100	+ "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
101	+ "\n\t*** with the model file 'langdetect-183.bin' in it.");
102	}
103
104
105	// 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
106	// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
107	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
108	try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
109
110	LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
111
112	// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
113	this.myCategorizer = new LanguageDetectorME(model);
114	}/*catch(Exception e) {
115	e.printStackTrace();
116	}*/
117
118	// instantiating function should handle critical exceptions. Constructors shouldn't.
119
120	}
121
122	/** More general constructor that additionally can load up the sentence detector model
123	* for other languages, as long as the provided trained sentence model .bin file exists
124	* in the OPENNLP_MODELS_RELATIVE_PATH folder. */
125	public TextLanguageDetector(boolean silentMode, double min_confidence,
126	String sentenceModelFileName) throws Exception
127	{
128	this(silentMode, min_confidence);
129
130	// 3. Set up our sentence model and SentenceDetector object
131	String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
132	+ OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
133	File sentenceModelBinFile = new File(sentenceModelPath);
134	if(!sentenceModelBinFile.exists()) {
135	throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
136	+ "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
137	+ "\n\t*** with the model file "+sentenceModelFileName+" in it.");
138	}
139	try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
140	// https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
141	SentenceModel sentenceModel = new SentenceModel(modelIn);
142	this.sentenceDetector = new SentenceDetectorME(sentenceModel);
143
144	} // instantiating function should handle this critical exception
145	}
146
147	/** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
148	* into sentences? What if the text in any other language or a mix of languages?
149	* Doesn't this assume that all languages split sentences alike? */
150	public String[] getAllSentences(String text) {
151
152	// This function doesn't work if the sentenceDetector object wasn't set up
153	if(sentenceDetector == null) return null;
154
155	String[] sentences = sentenceDetector.sentDetect(text);
156	return sentences;
157	}
158
159	public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) {
160
161	if(sentences == null) {
162	return null;
163	}
164
165	ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
166	for(int i = 0; i < sentences.length; i++) {
167	String sentence = sentences[i];
168
169	//System.err.println(sentence);
170
171	//Language bestLanguage = myCategorizer.predictLanguage(sentence);
172	//double confidence = bestLanguage.getConfidence();
173	//sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
174
175	Language languages[] = myCategorizer.predictLanguages(sentence);
176	// languages array already sorted in order of descending confidence
177	LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
178	for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
179	String langCode = languages[j].getLang();
180	double confidence = languages[j].getConfidence();
181	languagesInfo[j] = new LanguageInfo(confidence, langCode);
182	}
183
184	sentencesList.add(new SentenceInfo(sentence, languagesInfo));
185
186	}
187
188	return sentencesList;
189	}
190
191	public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences, int NUM_TOP_LANGUAGES) {
192
193	if(sentences == null) {
194	return null;
195	}
196
197	ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
198	for(int i = 1; i < sentences.length; i++) {
199	// glue every two adjacent sentences together
200	String doubleSentence = sentences[i-1];
201
202	String separator = ". ";
203	// if the sentence already ends with a terminating punctuation character,
204	// then separator is just a space
205	doubleSentence = doubleSentence.trim();
206	if(doubleSentence.endsWith(".") \|\| doubleSentence.endsWith("?") \|\| doubleSentence.endsWith("!")) {
207	separator = " ";
208	}
209	doubleSentence = doubleSentence + separator + sentences[i];
210
211	//System.err.println(sentence);
212
213	//Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
214	//double confidence = bestLanguage.getConfidence();
215	//sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));
216
217	Language languages[] = myCategorizer.predictLanguages(doubleSentence);
218	// languages array already sorted in order of descending confidence
219	LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
220
221	for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
222	String langCode = languages[j].getLang();
223	double confidence = languages[j].getConfidence();
224	languagesInfo[j] = new LanguageInfo(confidence, langCode);
225	}
226	sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo));
227
228	}
229
230	return sentencesList;
231	}
232
233	/**
234	* In this class' constructor, need to have set up the Sentence Detection Model
235	* for the langCode passed in to this function in order for the output to make
236	* sense for that language.
237	* Function that takes a text and returns those sentences in the requested language.
238	* @param text: the string of text from which sentences in the requested
239	* language are to be identified and returned.
240	* @param langCode: 3 letter code of requested language
241	* @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected
242	* even if the language detector determined the requested language as the primary one
243	* for that sentence. The confidence cutoff provides an additional check.
244	* @return null if no Sentence Detection Model set up in constructor
245	* else returns an ArrayList where:
246	* - the first element is the total number of sentences in the text parameter
247	* - remaining elements are the sentences in the text parameter that were in the
248	* requested language.
249	*/
250	public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
251	{
252	// big assumption here: that we can split incoming text into sentences
253	// for any language using the sentence model trained for a given language (that of
254	// langCode), despite not knowing what language each sentence in the text param are in.
255	// Hinges on sentence detection in langCode being similar to all others?
256
257
258	// This function doesn't work if the sentenceDetector object wasn't set up
259	if(sentenceDetector == null) return null;
260
261	// we'll be storing just those sentences in text that are in the denoted language code
262	ArrayList<String> sentencesInLang = new ArrayList<String>();
263	// OpenNLP language detection works best with a minimum of 2 sentences
264	// See https://opennlp.apache.org/news/model-langdetect-183.html
265	// "It is important to note that this model is trained for and works well with
266	// longer texts that have at least 2 sentences or more from the same language."
267
268	// For evaluating single languages, I used a very small data set and found that
269	// if the primary language detected is MRI AND if the confidence is >= 0.1, the
270	// results appear reasonably to be in te reo MÄori.
271
272	String[] sentences = sentenceDetector.sentDetect(text);
273	if(sentences == null) {
274	sentencesInLang.add("0"); // to indicate 0 sentences in requested language
275	return sentencesInLang;
276	}
277
278	// add in first element: how many sentences there were in text.
279	sentencesInLang.add(Integer.toString(sentences.length));
280
281	for(int i = 0; i < sentences.length; i++) {
282	String sentence = sentences[i];
283
284	//System.err.println(sentence);
285
286	Language bestLanguage = myCategorizer.predictLanguage(sentence);
287	double confidence = bestLanguage.getConfidence();
288
289	if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
290	//System.err.println("Adding sentence: " + sentence + "\n");
291	sentencesInLang.add(sentence);
292	} //else {
293	//System.err.println("SKIPPING sentence: " + sentence + "\n");
294	//}
295	}
296	return sentencesInLang;
297	}
298
299
300	/** @param langCode is 3 letter language code, ISO 639-2/3
301	* https://www.loc.gov/standards/iso639-2/php/code_list.php
302	* https://en.wikipedia.org/wiki/ISO_639-3
303	* @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
304	* else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
305	*/
306	public boolean isTextInLanguage(String langCode, String text) {
307	// Get the most probable language
308	Language bestLanguage = myCategorizer.predictLanguage(text);
309	doPrint("Best language: " + bestLanguage.getLang());
310	doPrint("Best language confidence: " + bestLanguage.getConfidence());
311
312	return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
313	}
314
315
316	/**
317	* Handle "smaller" textfiles/streams of text read in.
318	* Return value is the same as for isTextInLanguage(String langCode, String text);
319	*/
320	public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
321	// https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
322
323	StringBuilder text = new StringBuilder();
324	String line = null;
325
326
327	while((line = reader.readLine()) != null) { // readLine removes newline separator
328	text.append(line + "\n"); // add back (unix style) line ending
329	}
330
331	return isTextInLanguage(langCode, text.toString());
332	}
333
334
335	/**
336	* Rudimentary attempt to deal with very large files.
337	* Return value is the same as for isTextInLanguage(String langCode, String text);
338	*/
339	public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
340	// https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
341
342	final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
343
344	StringBuilder text = new StringBuilder();
345	String line = null;
346
347	double cumulativeConfidence = 0;
348	int numLoops = 0;
349
350	int i = 0;
351	String language = null;
352
353	while((line = reader.readLine()) != null) { // readLine removes newline separator
354	text.append(line + "\n"); // add back (unix style) line ending
355
356	i++; // read nth line of numLoop
357
358
359	if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
360
361	Language bestLanguage = myCategorizer.predictLanguage(text.toString());
362	if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
363	doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
364	}
365	language = bestLanguage.getLang();
366	cumulativeConfidence += bestLanguage.getConfidence();
367
368	doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
369
370	// finished analysing language of NUM_LINES of text
371	text = new StringBuilder();
372	i = 0;
373	numLoops++;
374	}
375	}
376
377	// process any (remaining) text that was less than n NUM_LINES
378	if(!text.toString().equals("")) {
379	text.append(line + "\n"); // add back (unix style) line ending
380	i++;
381
382	Language bestLanguage = myCategorizer.predictLanguage(text.toString());
383
384	if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
385	doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
386	}
387	language = bestLanguage.getLang();
388	cumulativeConfidence += bestLanguage.getConfidence();
389	doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
390	}
391
392
393	int totalLinesRead = numLoops * NUM_LINES + i; // not used
394	double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
395
396
397	return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
398	}
399
400
401	/**
402	* Prints to STDOUT the predicted languages of the input text in order of descending confidence.
403	* UNUSED.
404	*/
405	public void predictedLanguages(String text) {
406	// Get an array with the most probable languages
407
408	Language[] languages = myCategorizer.predictLanguages(text);
409
410	if(languages == null \|\| languages.length <= 0) {
411	doPrintErr("No languages predicted for the input text");
412	} else {
413	for(int i = 0; i < languages.length; i++) {
414	doPrint("Language prediction " + i + ": " + languages[i]);
415	}
416	}
417
418	}
419
420	public void doPrint(String msg) {
421	if(!this.silentMode) System.out.println(msg);
422	}
423	public void doPrintErr(String msg) {
424	if(!this.silentMode) System.err.println(msg);
425	}
426
427	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33790

Download in other formats: