source: other-projects/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33674

Last change on this file since 33674 was 33674, checked in by ak19, 4 years ago

Changes to support the top 5 predicted langcodes and their confidence values per sentence/overlapping sentence (storing all 103 made some documents, such as those of site 00006, too big to go into MongoDB). Have re-run NutchTextDumpToMongDB to send the new form of the docs into MongoDB.

File size: 18.0 KB
/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in a given
 * language or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 *
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 *
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */

package org.greenstone.atea;

import java.io.*;
import opennlp.tools.langdetect.*;
import opennlp.tools.sentdetect.*;
import opennlp.tools.util.*;

import java.util.ArrayList;

import org.greenstone.atea.morphia.SentenceInfo;
import org.greenstone.atea.morphia.LanguageInfo;

/**
 * Export the OPENNLP_HOME environment variable to point to your Apache OpenNLP installation.
 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
 * (which is the language detection model zipped up and renamed to the .bin extension).
 *
 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
 *
 * At present, only the subclass MaoriTextDetector.java has a main method that can be run.
 * (See also the usage sketch in the comment following the constructors below.)
 *
 */
public class TextLanguageDetector {

    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;

    /**
     * Configurable cut-off: the minimum confidence value; a prediction with confidence
     * greater than or equal to this means the best predicted language is acceptable
     * to the user of TextLanguageDetector.
     */
    public final double MINIMUM_CONFIDENCE;

    /** Number of language and confidence results to return for storing in MongoDB.
     * MongoDB runs out of space if too many are stored, as we store this info per sentence
     * and a long text document then presumably becomes a very large MongoDB document. */
    public final int NUM_TOP_LANGUAGES = 5; // 103 max, in current version of opennlp lang model

    /** silentMode set to true means TextLanguageDetector won't print helpful messages while running. Set to false to see them. */
    public final boolean silentMode;

    private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;

    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
    private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";

    /**
     * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
     * Created once in the constructor, it can be used as often as needed thereafter.
     */
    private LanguageDetector myCategorizer = null;

    /**
     * The Sentence Detection object that does the sentence splitting for the language
     * the sentence model was trained for.
     */
    private SentenceDetectorME sentenceDetector = null;


    /** Constructor with the default confidence for language detection.
     * Does not create the sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode) throws Exception {
        this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
    }

    /** Constructor with configurable min_confidence for language detection
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
        this.silentMode = silentMode;
        this.MINIMUM_CONFIDENCE = min_confidence;

        // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
        String langDetectModelPath = System.getenv("OPENNLP_HOME");
        if(langDetectModelPath == null) {
            throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
        }
        langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
        File langDetectModelBinFile = new File(langDetectModelPath);
        if(!langDetectModelBinFile.exists()) {
            throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file 'langdetect-183.bin' in it.");
        }


        // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
        try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {

            LanguageDetectorModel model = new LanguageDetectorModel(modelIn);

            // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
            this.myCategorizer = new LanguageDetectorME(model);
        } /*catch(Exception e) {
            e.printStackTrace();
        }*/

        // instantiating function should handle critical exceptions. Constructors shouldn't.

    }

    /** More general constructor that additionally can load up the sentence detector model
     * for other languages, as long as the provided trained sentence model .bin file exists
     * in the OPENNLP_MODELS_RELATIVE_PATH folder. */
    public TextLanguageDetector(boolean silentMode, double min_confidence,
                                String sentenceModelFileName) throws Exception
    {
        this(silentMode, min_confidence);

        // 3. Set up our sentence model and SentenceDetector object
        String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
            + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
        File sentenceModelBinFile = new File(sentenceModelPath);
        if(!sentenceModelBinFile.exists()) {
            throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file " + sentenceModelFileName + " in it.");
        }
        try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
            // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
            SentenceModel sentenceModel = new SentenceModel(modelIn);
            this.sentenceDetector = new SentenceDetectorME(sentenceModel);

        } // instantiating function should handle this critical exception
    }
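
    /* A minimal usage sketch from a caller's point of view. The variable someText is a
     * placeholder, and the sentence model file name is the "mri-sent_trained.bin" default
     * mentioned in the constructor above; adjust both to your setup:
     *
     *    TextLanguageDetector detector =
     *        new TextLanguageDetector(false, TextLanguageDetector.DEFAULT_MINIMUM_CONFIDENCE,
     *                                 "mri-sent_trained.bin");
     *    String[] sentences = detector.getAllSentences(someText);
     *    ArrayList<SentenceInfo> sentencesInfo = detector.getAllSentencesInfo(sentences);
     *    boolean isMaori = detector.isTextInLanguage("mri", someText);
     */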

    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
     * into sentences? What if the text is in another language or a mix of languages?
     * Doesn't this assume that all languages split sentences alike? */
    public String[] getAllSentences(String text) {

        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        String[] sentences = sentenceDetector.sentDetect(text);
        return sentences;
    }

    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            //Language bestLanguage = myCategorizer.predictLanguage(sentence);
            //double confidence = bestLanguage.getConfidence();
            //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));

            Language[] languages = myCategorizer.predictLanguages(sentence);
            // languages array already sorted in order of descending confidence
            LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];
            for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
                String langCode = languages[j].getLang();
                double confidence = languages[j].getConfidence();
                languagesInfo[j] = new LanguageInfo(confidence, langCode);
            }

            sentencesList.add(new SentenceInfo(sentence, languagesInfo));

        }

        return sentencesList;
    }

    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 1; i < sentences.length; i++) {
            // glue every two adjacent sentences together
            String doubleSentence = sentences[i-1];

            String separator = ". ";
            // if the sentence already ends with a terminating punctuation character,
            // then the separator is just a space
            doubleSentence = doubleSentence.trim();
            if(doubleSentence.endsWith(".") || doubleSentence.endsWith("?") || doubleSentence.endsWith("!")) {
                separator = " ";
            }
            doubleSentence = doubleSentence + separator + sentences[i];

            //System.err.println(doubleSentence);

            //Language bestLanguage = myCategorizer.predictLanguage(doubleSentence);
            //double confidence = bestLanguage.getConfidence();
            //sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), doubleSentence));

            Language[] languages = myCategorizer.predictLanguages(doubleSentence);
            // languages array already sorted in order of descending confidence
            LanguageInfo[] languagesInfo = new LanguageInfo[NUM_TOP_LANGUAGES];

            for(int j = 0; j < languages.length && j < NUM_TOP_LANGUAGES; j++) {
                String langCode = languages[j].getLang();
                double confidence = languages[j].getConfidence();
                languagesInfo[j] = new LanguageInfo(confidence, langCode);
            }
            sentencesList.add(new SentenceInfo(doubleSentence, languagesInfo));

        }

        return sentencesList;
    }

    /**
     * Function that takes a text and returns those of its sentences that are in the requested language.
     * The Sentence Detection Model for the langCode passed in to this function needs to have been
     * set up in this class' constructor for the output to make sense for that language.
     * @param text: the string of text from which sentences in the requested
     * language are to be identified and returned.
     * @param langCode: 3 letter code of the requested language
     * @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected,
     * even when the language detector has already determined the requested language to be the primary one
     * for that sentence. The confidence cutoff provides an additional check.
     * @return null if no Sentence Detection Model was set up in the constructor,
     * else returns an ArrayList where:
     * - the first element is the total number of sentences in the text parameter
     * - remaining elements are the sentences in the text parameter that were in the
     * requested language.
     */
    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
    {
        // big assumption here: that we can split the incoming text into sentences
        // for any language using the sentence model trained for a given language (that of
        // langCode), despite not knowing what language each sentence in the text param is in.
        // Hinges on sentence detection in langCode being similar to all others?


        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        // we'll be storing just those sentences in text that are in the denoted language code
        ArrayList<String> sentencesInLang = new ArrayList<String>();
        // OpenNLP language detection works best with a minimum of 2 sentences
        // See https://opennlp.apache.org/news/model-langdetect-183.html
        // "It is important to note that this model is trained for and works well with
        // longer texts that have at least 2 sentences or more from the same language."

        // For evaluating single languages, I used a very small data set and found that
        // if the primary language detected is MRI AND the confidence is >= 0.1, the
        // results reasonably appear to be in te reo Māori.

        String[] sentences = sentenceDetector.sentDetect(text);
        if(sentences == null) {
            sentencesInLang.add("0"); // to indicate 0 sentences in the requested language
            return sentencesInLang;
        }

        // add in the first element: how many sentences there were in text.
        sentencesInLang.add(Integer.toString(sentences.length));

        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
                //System.err.println("Adding sentence: " + sentence + "\n");
                sentencesInLang.add(sentence);
            } //else {
                //System.err.println("SKIPPING sentence: " + sentence + "\n");
            //}
        }
        return sentencesInLang;
    }
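
    /* Sketch of consuming getAllSentencesInLanguage()'s return value, assuming the detector
     * was constructed with a sentence model (otherwise the method returns null); text and
     * cutoff are placeholder variables:
     *
     *    ArrayList<String> result = detector.getAllSentencesInLanguage("mri", text, cutoff);
     *    if(result != null) {
     *        int totalSentences = Integer.parseInt(result.get(0)); // first element: sentence count
     *        int numMriSentences = result.size() - 1;              // rest: sentences detected as mri
     *    }
     */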


    /** @param langCode is the 3 letter language code, ISO 639-2/3
     * https://www.loc.gov/standards/iso639-2/php/code_list.php
     * https://en.wikipedia.org/wiki/ISO_639-3
     * @return true if the input text is in the language denoted by langCode (e.g. mri for Maori)
     * with MINIMUM_CONFIDENCE level of confidence (if set, else DEFAULT_MINIMUM_CONFIDENCE level of confidence).
     */
    public boolean isTextInLanguage(String langCode, String text) {
        // Get the most probable language
        Language bestLanguage = myCategorizer.predictLanguage(text);
        doPrint("Best language: " + bestLanguage.getLang());
        doPrint("Best language confidence: " + bestLanguage.getConfidence());

        return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Handle "smaller" textfiles/streams of text read in.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        StringBuilder text = new StringBuilder();
        String line = null;


        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending
        }

        return isTextInLanguage(langCode, text.toString());
    }
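
    /* Sketch of calling the BufferedReader overload on a plain-text file; filePath is a
     * placeholder for the file to check:
     *
     *    try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
     *        boolean inMaori = detector.isTextInLanguage("mri", reader);
     *    }
     */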


    /**
     * Rudimentary attempt to deal with very large files.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        final int NUM_LINES = 100; // arbitrary: read 100 lines, predict the language, calculate the confidence

        StringBuilder text = new StringBuilder();
        String line = null;

        double cumulativeConfidence = 0;
        int numLoops = 0;

        int i = 0;
        String language = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending

            i++; // count of lines read in the current block of NUM_LINES


            if(i == NUM_LINES) { // arbitrary 100 lines read: predict the language, calculate the confidence

                Language bestLanguage = myCategorizer.predictLanguage(text.toString());
                if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
                    doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
                }
                language = bestLanguage.getLang();
                cumulativeConfidence += bestLanguage.getConfidence();

                doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");

                // finished analysing the language of NUM_LINES of text
                text = new StringBuilder();
                i = 0;
                numLoops++;
            }
        }

        // process any (remaining) text that was fewer than NUM_LINES lines
        if(!text.toString().equals("")) {
            // the remaining i lines were already appended to text inside the loop above

            Language bestLanguage = myCategorizer.predictLanguage(text.toString());

            if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
                doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
            }
            language = bestLanguage.getLang();
            cumulativeConfidence += bestLanguage.getConfidence();
            doPrintErr("Best predicted language for the final " + i + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");
        }


        int totalLinesRead = numLoops * NUM_LINES + i; // not used
        double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average, as the text processed outside the loop may have fewer lines than NUM_LINES


        return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
     * UNUSED.
     */
    public void predictedLanguages(String text) {
        // Get an array with the most probable languages

        Language[] languages = myCategorizer.predictLanguages(text);

        if(languages == null || languages.length <= 0) {
            doPrintErr("No languages predicted for the input text");
        } else {
            for(int i = 0; i < languages.length; i++) {
                doPrint("Language prediction " + i + ": " + languages[i]);
            }
        }

    }

    public void doPrint(String msg) {
        if(!this.silentMode) System.out.println(msg);
    }
    public void doPrintErr(String msg) {
        if(!this.silentMode) System.err.println(msg);
    }

}