source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the online examples in using the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error. Nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where the authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

File size: 16.4 KB
/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in a given
 * language or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 *
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 *
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */

package org.greenstone.atea;

import java.io.*;
import opennlp.tools.langdetect.*;
import opennlp.tools.sentdetect.*;
import opennlp.tools.util.*;

import java.util.ArrayList;

/**
 * Export the OPENNLP_HOME environment variable to point to your Apache OpenNLP installation.
 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
 * (which is the language detection model zipped up and renamed to a .bin extension).
 *
 * Then, to compile this program, run the following from the "src" folder (the folder containing the org/greenstone/atea package hierarchy):
 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
 *
 * At present, only the subclass MaoriTextDetector.java has a main method that can be run.
 */
public class TextLanguageDetector {

    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;

    /**
     * Configurable minimum confidence cut-off: the best predicted language is only
     * considered acceptable to the user of TextLanguageDetector if its confidence is
     * greater than or equal to this value.
     */
    public final double MINIMUM_CONFIDENCE;

    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */
    public final boolean silentMode;

    private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;

    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
    private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";

    /**
     * The LanguageDetector object that will do the actual language detection/prediction for us.
     * Created once in the constructor, it can be used as often as needed thereafter.
     */
    private LanguageDetector myCategorizer = null;

    /**
     * The Sentence Detection object that does the sentence splitting for the language
     * the sentence model was trained for.
     */
    private SentenceDetectorME sentenceDetector = null;

    /** Constructor with default confidence for language detection.
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode) throws Exception {
        this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
    }

    /** Constructor with configurable min_confidence for language detection.
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
        this.silentMode = silentMode;
        this.MINIMUM_CONFIDENCE = min_confidence;

        // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists).
        String langDetectModelPath = System.getenv("OPENNLP_HOME");
        if(System.getenv("OPENNLP_HOME") == null) {
            throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
        }
        langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
        File langDetectModelBinFile = new File(langDetectModelPath);
        if(!langDetectModelBinFile.exists()) {
            throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file 'langdetect-183.bin' in it.");
        }


        // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
        try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {

            LanguageDetectorModel model = new LanguageDetectorModel(modelIn);

            // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
            this.myCategorizer = new LanguageDetectorME(model);
        }/*catch(Exception e) {
            e.printStackTrace();
        }*/

        // instantiating function should handle critical exceptions. Constructors shouldn't.

    }
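
    /* For reference, a sketch of the on-disk layout that the constructor above checks for,
     * pieced together from the setup notes at the top of this file (the jar version is
     * simply whatever your OpenNLP installation ships with):
     *
     *   $OPENNLP_HOME/
     *       lib/opennlp-tools-1.9.1.jar    (used on the compile classpath)
     *       models/langdetect-183.bin      (the language detection model)
     *       models/<sentence model>.bin    (optional, e.g. mri-sent_trained.bin, needed only
     *                                       for the three-argument constructor below)
     */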

    /** More general constructor that additionally can load up the sentence detector model
     * for other languages, as long as the provided trained sentence model .bin file exists
     * in the OPENNLP_MODELS_RELATIVE_PATH folder. */
    public TextLanguageDetector(boolean silentMode, double min_confidence,
                                String sentenceModelFileName) throws Exception
    {
        this(silentMode, min_confidence);

        // 3. Set up our sentence model and SentenceDetector object
        String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
            + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
        File sentenceModelBinFile = new File(sentenceModelPath);
        if(!sentenceModelBinFile.exists()) {
            throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file "+sentenceModelFileName+" in it.");
        }
        try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
            // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
            SentenceModel sentenceModel = new SentenceModel(modelIn);
            this.sentenceDetector = new SentenceDetectorME(sentenceModel);

        } // instantiating function should handle this critical exception
    }
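
    /* A minimal usage sketch (assumptions: the models are installed as described above, and
     * the text variable holds the caller's own input; the concrete values passed here are
     * only illustrative, not prescribed by this class):
     *
     *   TextLanguageDetector detector =
     *       new TextLanguageDetector(false, TextLanguageDetector.DEFAULT_MINIMUM_CONFIDENCE,
     *                                "mri-sent_trained.bin");
     *   String text = "...";                                      // caller's own text to check
     *   boolean isMaori = detector.isTextInLanguage("mri", text); // langCode is an ISO 639-3 code
     *   ArrayList<String> mriSentences =
     *       detector.getAllSentencesInLanguage("mri", text, 0.1); // 0.1 cutoff per the notes further below
     */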

    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
     * into sentences? What if the text is in any other language or a mix of languages?
     * Doesn't this assume that all languages split sentences alike? */
    public String[] getAllSentences(String text) {

        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        String[] sentences = sentenceDetector.sentDetect(text);
        return sentences;
    }

    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
        }

        return sentencesList;
    }

    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 1; i < sentences.length; i++) {
            // glue every two adjacent sentences together
            String sentence = sentences[i-1];

            String separator = ". ";
            // if the sentence already ends with a terminating punctuation character,
            // then the separator is just a space
            sentence = sentence.trim();
            if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
                separator = " ";
            }
            sentence = sentence + separator + sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
        }

        return sentencesList;
    }

    /**
     * Takes a text and returns those of its sentences that are in the requested language.
     * The Sentence Detection Model for the langCode passed in to this function needs to
     * have been set up in this class' constructor for the output to make sense for that
     * language.
     * @param text: the string of text from which sentences in the requested
     * language are to be identified and returned.
     * @param langCode: 3 letter code of the requested language
     * @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected,
     * even if the language detector determined the requested language as the primary one
     * for that sentence. The confidence cutoff provides an additional check.
     * @return null if no Sentence Detection Model was set up in the constructor,
     * else an ArrayList where:
     * - the first element is the total number of sentences in the text parameter
     * - the remaining elements are the sentences in the text parameter that were in the
     * requested language.
     */
    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
    {
        // big assumption here: that we can split the incoming text into sentences
        // for any language using the sentence model trained for a given language (that of
        // langCode), despite not knowing what language each sentence in the text param is in.
        // Hinges on sentence detection in langCode being similar to all other languages?


        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        // we'll be storing just those sentences in text that are in the denoted language code
        ArrayList<String> sentencesInLang = new ArrayList<String>();

        // OpenNLP language detection works best with a minimum of 2 sentences.
        // See https://opennlp.apache.org/news/model-langdetect-183.html
        // "It is important to note that this model is trained for and works well with
        // longer texts that have at least 2 sentences or more from the same language."

        // For evaluating single languages, I used a very small data set and found that
        // if the primary language detected is MRI AND the confidence is >= 0.1, the
        // results reasonably appear to be in te reo Māori.

        String[] sentences = sentenceDetector.sentDetect(text);
        if(sentences == null) {
            sentencesInLang.add("0"); // to indicate 0 sentences in the requested language
            return sentencesInLang;
        }

        // add in the first element: how many sentences there were in text.
        sentencesInLang.add(Integer.toString(sentences.length));

        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
                //System.err.println("Adding sentence: " + sentence + "\n");
                sentencesInLang.add(sentence);
            } //else {
                //System.err.println("SKIPPING sentence: " + sentence + "\n");
            //}
        }
        return sentencesInLang;
    }
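
    /* A minimal sketch (hypothetical caller-side code, assuming 'detector' is an instance of
     * this class constructed with a sentence model) of how the list returned above is meant
     * to be unpacked, following the contract documented in the javadoc:
     *
     *   ArrayList<String> result = detector.getAllSentencesInLanguage("mri", text, 0.1);
     *   if(result != null) {
     *       int totalSentences = Integer.parseInt(result.get(0)); // first element: total sentence count
     *       int numMatching = result.size() - 1;                  // remaining elements: matching sentences
     *       for(int i = 1; i < result.size(); i++) {
     *           System.out.println(result.get(i));
     *       }
     *   }
     */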


    /** @param langCode is the 3 letter language code, ISO 639-2/3
     * https://www.loc.gov/standards/iso639-2/php/code_list.php
     * https://en.wikipedia.org/wiki/ISO_639-3
     * @return true if the input text is in the language denoted by langCode (e.g. mri for Maori)
     * with at least MINIMUM_CONFIDENCE confidence (DEFAULT_MINIMUM_CONFIDENCE if no custom
     * value was set).
     */
    public boolean isTextInLanguage(String langCode, String text) {
        // Get the most probable language
        Language bestLanguage = myCategorizer.predictLanguage(text);
        doPrint("Best language: " + bestLanguage.getLang());
        doPrint("Best language confidence: " + bestLanguage.getConfidence());

        return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Handles "smaller" text files/streams of text read in.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        StringBuilder text = new StringBuilder();
        String line = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending
        }
        return isTextInLanguage(langCode, text.toString());
    }
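
    /* A minimal caller-side sketch (the file path is only an example, and 'detector' is
     * assumed to be an instance of this class) of feeding a text file to the two
     * reader-based methods; the same approach works for stdin via
     * new BufferedReader(new InputStreamReader(System.in)):
     *
     *   try (BufferedReader reader = new BufferedReader(new FileReader("/path/to/textfile.txt"))) {
     *       boolean isMaori = detector.isTextInLanguage("mri", reader);
     *   }
     *
     * For very large files, isLargeTextInLanguage(langCode, reader) below can be used instead.
     */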


    /**
     * Rudimentary attempt to deal with very large files.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        final int NUM_LINES = 100; // arbitrary chunk size: read 100 lines, predict language, accumulate confidence

        StringBuilder text = new StringBuilder();
        String line = null;

        double cumulativeConfidence = 0;
        int numLoops = 0;

        int i = 0;
        String language = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending

            i++; // count lines read into the current chunk

            if(i == NUM_LINES) { // NUM_LINES lines read: predict language, accumulate confidence

                Language bestLanguage = myCategorizer.predictLanguage(text.toString());
                if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current chunk differs from predicted lang of previous chunk
                    doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
                }
                language = bestLanguage.getLang();
                cumulativeConfidence += bestLanguage.getConfidence();

                doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");

                // finished analysing the language of NUM_LINES of text
                text = new StringBuilder();
                i = 0;
                numLoops++;
            }
        }

        // process any remaining text that was less than NUM_LINES lines
        // (the leftover lines are already in the text StringBuilder; line is null at this point)
        if(!text.toString().equals("")) {

            Language bestLanguage = myCategorizer.predictLanguage(text.toString());

            if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current chunk differs from predicted lang of previous chunk
                doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
            }
            language = bestLanguage.getLang();
            cumulativeConfidence += bestLanguage.getConfidence();
            doPrintErr("Best predicted language for final " + i + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");
        }


        int totalLinesRead = numLoops * NUM_LINES + i; // not used
        int numPredictions = numLoops + (i > 0 ? 1 : 0); // the final, shorter chunk counts as one more prediction
        if(language == null) { // nothing was read from the reader
            return false;
        }
        double avgConfidence = cumulativeConfidence/numPredictions; // average confidence per chunk (the final chunk may have fewer lines than NUM_LINES)

        return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
     * UNUSED.
     */
    public void predictedLanguages(String text) {
        // Get an array with the most probable languages

        Language[] languages = myCategorizer.predictLanguages(text);

        if(languages == null || languages.length <= 0) {
            doPrintErr("No languages predicted for the input text");
        } else {
            for(int i = 0; i < languages.length; i++) {
                doPrint("Language prediction " + i + ": " + languages[i]);
            }
        }

    }

    public void doPrint(String msg) {
        if(!this.silentMode) System.out.println(msg);
    }
    public void doPrintErr(String msg) {
        if(!this.silentMode) System.err.println(msg);
    }

}