Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33586

Last change on this file since 33586 was 33586, checked in by ak19, 5 years ago
Refactored MaoriTextDetector.java class into more general TextLanguageDetector.java superclass and just the MRI-specific methods, constructors and member vars remaining in MaoriTextDetector.java. Easier to read code. Makes superclass reusable for other languages that need a similar treatment.
File size: 14.0 KB

Line
1	/**
2	* Class that uses OpenNLP with the Language Detection Model to determine, with a default
3	* or configurable level of confidence, whether text (from a file or stdin) is in a given
4	* language or not.
5	* Internal functions can be used for detecting any of the 103 languages currently supported by
6	* the OpenNLP Language Detection Model.
7	*
8	* http://opennlp.apache.org/news/model-langdetect-183.html
9	* language detector model: http://opennlp.apache.org/models.html
10	* Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
11	* Use of Apache OpenNLP in general:
12	* http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
13	* Use of OpenNLP for language detection:
14	* http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
15	*
16	* This code was based on the information and sample code at the above links and the links dispersed throughout this file.
17	* See also the accompanying README file.
18	*
19	* July 2019
20	*/
21
22	package org.greenstone.atea;
23
24	import java.io.*;
25	import opennlp.tools.langdetect.*;
26	import opennlp.tools.sentdetect.*;
27	import opennlp.tools.util.*;
28
29	import java.util.ArrayList;
30
31	/**
32	* EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
33	* Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
34	* (which is the language detection model zipped up and renamed to .bin extension).
35	*
36	* Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
37	* maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
38	*
39	* Only the subclass MaoriTextDetector.java has a main method at present that can be run.
40	*
41	*/
42	public class TextLanguageDetector {
43
44	public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
45
46	/**
47	* Configurable: cut off minimum confidence value,
48	* greater or equal to which determines that the best predicted language is
49	* acceptable to user of TextLanguageDetector.
50	*/
51	public final double MINIMUM_CONFIDENCE;
52
53	/** silentMode set to false means TextLanguageDetector won't print helpful messages while running. Set to true to run silently. */
54	public final boolean silentMode;
55
56	private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;
57
58	/** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
59	private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";
60
61	/**
62	* The LanguageDetectorModel object that will do the actual language detection/prediction for us.
63	* Created once in the constructor, can be used as often as needed thereafter.
64	*/
65	private LanguageDetector myCategorizer = null;
66
67	/**
68	* The Sentence Detection object that does the sentence splitting for the language
69	* the sentece model was trained for.
70	*/
71	private SentenceDetectorME sentenceDetector = null;
72
73
74	/** Constructor with default confidence for language detection.
75	* Does not create sentence model, just the language detection model.
76	*/
77	public TextLanguageDetector(boolean silentMode) throws Exception {
78	this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
79	}
80
81	/** Constructor with configurable min_confidence for language detection
82	* Does not create sentence model, just the language detection model.
83	*/
84	public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
85	this.silentMode = silentMode;
86	this.MINIMUM_CONFIDENCE = min_confidence;
87
88	// 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
89	String langDetectModelPath = System.getenv("OPENNLP_HOME");
90	if(System.getenv("OPENNLP_HOME") == null) {
91	throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
92	}
93	langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
94	File langDetectModelBinFile = new File(langDetectModelPath);
95	if(!langDetectModelBinFile.exists()) {
96	throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
97	+ "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
98	+ "\n\t*** with the model file 'langdetect-183.bin' in it.");
99	}
100
101
102	// 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
103	// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
104	// https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
105	try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
106
107	LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
108
109	// http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
110	this.myCategorizer = new LanguageDetectorME(model);
111	}/*catch(Exception e) {
112	e.printStackTrace();
113	}*/
114
115	// instantiating function should handle critical exceptions. Constructors shouldn't.
116
117	}
118
119	/** More general constructor that additionally can load up the sentence detector model
120	* for other languages, as long as the provided trained sentence model .bin file exists
121	* in the OPENNLP_MODELS_RELATIVE_PATH folder. */
122	public TextLanguageDetector(boolean silentMode, double min_confidence,
123	String sentenceModelFileName) throws Exception
124	{
125	this(silentMode, min_confidence);
126
127	// 3. Set up our sentence model and SentenceDetector object
128	String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
129	+ OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
130	File sentenceModelBinFile = new File(sentenceModelPath);
131	if(!sentenceModelBinFile.exists()) {
132	throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
133	+ "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
134	+ "\n\t*** with the model file "+sentenceModelFileName+" in it.");
135	}
136	try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
137	// https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
138	SentenceModel sentenceModel = new SentenceModel(modelIn);
139	this.sentenceDetector = new SentenceDetectorME(sentenceModel);
140
141	} // instantiating function should handle this critical exception
142	}
143
144	/**
145	* In this class' constructor, need to have set up the Sentence Detection Model
146	* for the langCode passed in to this function in order for the output to make
147	* sense for that language.
148	* Function that takes a text and returns those sentences in the requested language.
149	* @param text: the string of text from which sentences in the requested
150	* language are to be identified and returned.
151	* @param langCode: 3 letter code of requested language
152	* @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected
153	* even if the language detector determined the requested language as the primary one
154	* for that sentence. The confidence cutoff provides an additional check.
155	* @return null if no Sentence Detection Model set up in constructor
156	* else returns an ArrayList of sentences in the text parameter that are
157	* in the requested language.
158	*/
159	public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
160	{
161	// big assumption here: that we can split incoming text into sentences
162	// for any language using the sentence model trained for a given language (that of
163	// langCode), despite not knowing what language each sentence in the text param are in.
164	// Hinges on sentence detection in langCode being similar to all others?
165
166
167	// This function doesn't work if the sentenceDetector object wasn't set up
168	if(sentenceDetector == null) return null;
169
170	// we'll be storing just those sentences in text that are in the denoted language code
171	ArrayList<String> sentencesInLang = new ArrayList<String>();
172	// OpenNLP language detection works best with a minimum of 2 sentences
173	// See https://opennlp.apache.org/news/model-langdetect-183.html
174	// "It is important to note that this model is trained for and works well with
175	// longer texts that have at least 2 sentences or more from the same language."
176
177	// For evaluating single languages, I used a very small data set and found that
178	// if the primary language detected is MRI AND if the confidence is >= 0.1, the
179	// results appear reasonably to be in te reo MÄori.
180
181	String[] sentences = sentenceDetector.sentDetect(text);
182
183	for(int i = 0; i < sentences.length; i++) {
184	String sentence = sentences[i];
185
186	//System.err.println(sentence);
187
188	Language bestLanguage = myCategorizer.predictLanguage(sentence);
189	double confidence = bestLanguage.getConfidence();
190
191	if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
192	//System.err.println("Adding sentence: " + sentence + "\n");
193	sentencesInLang.add(sentence);
194	} //else {
195	//System.err.println("SKIPPING sentence: " + sentence + "\n");
196	//}
197	}
198	return sentencesInLang;
199	}
200
201
202	/** @param langCode is 3 letter language code, ISO 639-2/3
203	* https://www.loc.gov/standards/iso639-2/php/code_list.php
204	* https://en.wikipedia.org/wiki/ISO_639-3
205	* @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
206	* else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
207	*/
208	public boolean isTextInLanguage(String langCode, String text) {
209	// Get the most probable language
210	Language bestLanguage = myCategorizer.predictLanguage(text);
211	doPrint("Best language: " + bestLanguage.getLang());
212	doPrint("Best language confidence: " + bestLanguage.getConfidence());
213
214	return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
215	}
216
217
218	/**
219	* Handle "smaller" textfiles/streams of text read in.
220	* Return value is the same as for isTextInLanguage(String langCode, String text);
221	*/
222	public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
223	// https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
224
225	StringBuilder text = new StringBuilder();
226	String line = null;
227
228
229	while((line = reader.readLine()) != null) { // readLine removes newline separator
230	text.append(line + "\n"); // add back (unix style) line ending
231	}
232	return isTextInLanguage(langCode, text.toString());
233	}
234
235
236	/**
237	* Rudimentary attempt to deal with very large files.
238	* Return value is the same as for isTextInLanguage(String langCode, String text);
239	*/
240	public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
241	// https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
242
243	final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
244
245	StringBuilder text = new StringBuilder();
246	String line = null;
247
248	double cumulativeConfidence = 0;
249	int numLoops = 0;
250
251	int i = 0;
252	String language = null;
253
254	while((line = reader.readLine()) != null) { // readLine removes newline separator
255	text.append(line + "\n"); // add back (unix style) line ending
256
257	i++; // read nth line of numLoop
258
259
260	if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
261
262
263	Language bestLanguage = myCategorizer.predictLanguage(text.toString());
264	if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
265	doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
266	}
267	language = bestLanguage.getLang();
268	cumulativeConfidence += bestLanguage.getConfidence();
269
270	doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
271
272	// finished analysing language of NUM_LINES of text
273	text = new StringBuilder();
274	i = 0;
275	numLoops++;
276	}
277	}
278
279	// process any (remaining) text that was less than n NUM_LINES
280	if(!text.toString().equals("")) {
281	text.append(line + "\n"); // add back (unix style) line ending
282	i++;
283
284	Language bestLanguage = myCategorizer.predictLanguage(text.toString());
285
286	if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
287	doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
288	}
289	language = bestLanguage.getLang();
290	cumulativeConfidence += bestLanguage.getConfidence();
291	doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
292	}
293
294
295	int totalLinesRead = numLoops * NUM_LINES + i; // not used
296	double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
297
298
299	return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
300	}
301
302
303	/**
304	* Prints to STDOUT the predicted languages of the input text in order of descending confidence.
305	* UNUSED.
306	*/
307	public void predictedLanguages(String text) {
308	// Get an array with the most probable languages
309
310	Language[] languages = myCategorizer.predictLanguages(text);
311
312	if(languages == null \|\| languages.length <= 0) {
313	doPrintErr("No languages predicted for the input text");
314	} else {
315	for(int i = 0; i < languages.length; i++) {
316	doPrint("Language prediction " + i + ": " + languages[i]);
317	}
318	}
319
320	}
321
322	public void doPrint(String msg) {
323	if(!this.silentMode) System.out.println(msg);
324	}
325	public void doPrintErr(String msg) {
326	if(!this.silentMode) System.err.println(msg);
327	}
328
329	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: