source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/TextLanguageDetector.java@ 33634

Last change on this file since 33634 was 33634, checked in by ak19, 4 years ago

Rewrote NutchTextDumpProcessor as NutchTextDumpToMongoDB.java, which uses MongoDBAccess that now has insertWebpageInfo() and insertWebsiteInfo(). However, testing has been unsuccessful locally, despite the fact that authentication should be working, as I'm following the online examples in using the Credential object. It supposedly connects to the database, but database.listCollections() fails with an Unauthorized error. Nothing subsequent can be expected to work. I could do my preliminary testing against a small sample subset of crawled sites on vagrant, where there is no authentication set up, but what if someone else wants to run this one day against a mongodb where the authentication is set up (the way TSG set it up for the mongodb they gave me access to)? Then it still wouldn't work.

File size: 16.4 KB
/**
 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
 * or configurable level of confidence, whether text (from a file or stdin) is in a given
 * language or not.
 * Internal functions can be used for detecting any of the 103 languages currently supported by
 * the OpenNLP Language Detection Model.
 *
 * http://opennlp.apache.org/news/model-langdetect-183.html
 * language detector model: http://opennlp.apache.org/models.html
 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
 * Use of Apache OpenNLP in general:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
 * Use of OpenNLP for language detection:
 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
 *
 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
 * See also the accompanying README file.
 *
 * July 2019
 */

package org.greenstone.atea;

import java.io.*;
import opennlp.tools.langdetect.*;
import opennlp.tools.sentdetect.*;
import opennlp.tools.util.*;

import java.util.ArrayList;

/**
 * Export the OPENNLP_HOME environment variable to point to your Apache OpenNLP installation.
 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
 * (which is the language detection model zipped up and renamed to a .bin extension).
 *
 * Then, to compile this program, run the following from the "src" folder (the folder containing the org/greenstone/atea package hierarchy):
 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/TextLanguageDetector.java
 *
 * At present, only the subclass MaoriTextDetector.java has a main method that can be run.
 */
public class TextLanguageDetector {

    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;

    /**
     * Configurable minimum confidence cut-off: the best predicted language is only
     * considered acceptable to the user of TextLanguageDetector if its confidence is
     * greater than or equal to this value.
     */
    public final double MINIMUM_CONFIDENCE;

    /** silentMode set to false means TextLanguageDetector will print helpful messages while running. Set to true to run silently. */
    public final boolean silentMode;

    private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;

    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
    private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";

    /**
     * The LanguageDetector object that will do the actual language detection/prediction for us.
     * Created once in the constructor, it can be used as often as needed thereafter.
     */
    private LanguageDetector myCategorizer = null;

    /**
     * The Sentence Detection object that does the sentence splitting for the language
     * the sentence model was trained for.
     */
    private SentenceDetectorME sentenceDetector = null;

    /** Constructor with default confidence for language detection.
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode) throws Exception {
        this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
    }

    /** Constructor with configurable min_confidence for language detection.
     * Does not create sentence model, just the language detection model.
     */
    public TextLanguageDetector(boolean silentMode, double min_confidence) throws Exception {
        this.silentMode = silentMode;
        this.MINIMUM_CONFIDENCE = min_confidence;

        // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists).
        String langDetectModelPath = System.getenv("OPENNLP_HOME");
        if(System.getenv("OPENNLP_HOME") == null) {
            throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
        }
        langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
        File langDetectModelBinFile = new File(langDetectModelPath);
        if(!langDetectModelBinFile.exists()) {
            throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file 'langdetect-183.bin' in it.");
        }


        // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
        // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
        try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {

            LanguageDetectorModel model = new LanguageDetectorModel(modelIn);

            // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
            this.myCategorizer = new LanguageDetectorME(model);
        }/*catch(Exception e) {
            e.printStackTrace();
        }*/

        // instantiating function should handle critical exceptions. Constructors shouldn't.

    }
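
    /* For reference, a sketch of the on-disk layout that the constructor above checks for,
     * pieced together from the setup notes at the top of this file (the jar version is
     * simply whatever your OpenNLP installation ships with):
     *
     *   $OPENNLP_HOME/
     *       lib/opennlp-tools-1.9.1.jar    (used on the compile classpath)
     *       models/langdetect-183.bin      (the language detection model)
     *       models/<sentence model>.bin    (optional, e.g. mri-sent_trained.bin, needed only
     *                                       for the three-argument constructor below)
     */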

    /** More general constructor that additionally can load up the sentence detector model
     * for other languages, as long as the provided trained sentence model .bin file exists
     * in the OPENNLP_MODELS_RELATIVE_PATH folder. */
    public TextLanguageDetector(boolean silentMode, double min_confidence,
                                String sentenceModelFileName) throws Exception
    {
        this(silentMode, min_confidence);

        // 3. Set up our sentence model and SentenceDetector object
        String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
            + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
        File sentenceModelBinFile = new File(sentenceModelPath);
        if(!sentenceModelBinFile.exists()) {
            throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
                + "\n\t*** with the model file "+sentenceModelFileName+" in it.");
        }
        try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
            // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
            SentenceModel sentenceModel = new SentenceModel(modelIn);
            this.sentenceDetector = new SentenceDetectorME(sentenceModel);

        } // instantiating function should handle this critical exception
    }
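
    /* A minimal usage sketch (assumptions: the models are installed as described above, and
     * the text variable holds the caller's own input; the concrete values passed here are
     * only illustrative, not prescribed by this class):
     *
     *   TextLanguageDetector detector =
     *       new TextLanguageDetector(false, TextLanguageDetector.DEFAULT_MINIMUM_CONFIDENCE,
     *                                "mri-sent_trained.bin");
     *   String text = "...";                                      // caller's own text to check
     *   boolean isMaori = detector.isTextInLanguage("mri", text); // langCode is an ISO 639-3 code
     *   ArrayList<String> mriSentences =
     *       detector.getAllSentencesInLanguage("mri", text, 0.1); // 0.1 cutoff per the notes further below
     */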

    /** TODO: Is it sensible to use the Maori Language Sentence Model to split the text
     * into sentences? What if the text is in any other language or a mix of languages?
     * Doesn't this assume that all languages split sentences alike? */
    public String[] getAllSentences(String text) {

        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        String[] sentences = sentenceDetector.sentDetect(text);
        return sentences;
    }

    public ArrayList<SentenceInfo> getAllSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
        }

        return sentencesList;
    }

    public ArrayList<SentenceInfo> getAllOverlappingSentencesInfo(String[] sentences) {

        if(sentences == null) {
            return null;
        }

        ArrayList<SentenceInfo> sentencesList = new ArrayList<SentenceInfo>();
        for(int i = 1; i < sentences.length; i++) {
            // glue every two adjacent sentences together
            String sentence = sentences[i-1];

            String separator = ". ";
            // if the sentence already ends with a terminating punctuation character,
            // then the separator is just a space
            sentence = sentence.trim();
            if(sentence.endsWith(".") || sentence.endsWith("?") || sentence.endsWith("!")) {
                separator = " ";
            }
            sentence = sentence + separator + sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            sentencesList.add(new SentenceInfo(confidence, bestLanguage.getLang(), sentence));
        }

        return sentencesList;
    }

    /**
     * Takes a text and returns those of its sentences that are in the requested language.
     * The Sentence Detection Model for the langCode passed in to this function needs to
     * have been set up in this class' constructor for the output to make sense for that
     * language.
     * @param text: the string of text from which sentences in the requested
     * language are to be identified and returned.
     * @param langCode: 3 letter code of the requested language
     * @param confidenceCutoff: minimum confidence for a SINGLE sentence to be selected,
     * even if the language detector determined the requested language as the primary one
     * for that sentence. The confidence cutoff provides an additional check.
     * @return null if no Sentence Detection Model was set up in the constructor,
     * else an ArrayList where:
     * - the first element is the total number of sentences in the text parameter
     * - the remaining elements are the sentences in the text parameter that were in the
     * requested language.
     */
    public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
    {
        // big assumption here: that we can split the incoming text into sentences
        // for any language using the sentence model trained for a given language (that of
        // langCode), despite not knowing what language each sentence in the text param is in.
        // Hinges on sentence detection in langCode being similar to all other languages?


        // This function doesn't work if the sentenceDetector object wasn't set up
        if(sentenceDetector == null) return null;

        // we'll be storing just those sentences in text that are in the denoted language code
        ArrayList<String> sentencesInLang = new ArrayList<String>();

        // OpenNLP language detection works best with a minimum of 2 sentences.
        // See https://opennlp.apache.org/news/model-langdetect-183.html
        // "It is important to note that this model is trained for and works well with
        // longer texts that have at least 2 sentences or more from the same language."

        // For evaluating single languages, I used a very small data set and found that
        // if the primary language detected is MRI AND the confidence is >= 0.1, the
        // results reasonably appear to be in te reo Māori.

        String[] sentences = sentenceDetector.sentDetect(text);
        if(sentences == null) {
            sentencesInLang.add("0"); // to indicate 0 sentences in the requested language
            return sentencesInLang;
        }

        // add in the first element: how many sentences there were in text.
        sentencesInLang.add(Integer.toString(sentences.length));

        for(int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];

            //System.err.println(sentence);

            Language bestLanguage = myCategorizer.predictLanguage(sentence);
            double confidence = bestLanguage.getConfidence();

            if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
                //System.err.println("Adding sentence: " + sentence + "\n");
                sentencesInLang.add(sentence);
            } //else {
                //System.err.println("SKIPPING sentence: " + sentence + "\n");
            //}
        }
        return sentencesInLang;
    }
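
    /* A minimal sketch (hypothetical caller-side code, assuming 'detector' is an instance of
     * this class constructed with a sentence model) of how the list returned above is meant
     * to be unpacked, following the contract documented in the javadoc:
     *
     *   ArrayList<String> result = detector.getAllSentencesInLanguage("mri", text, 0.1);
     *   if(result != null) {
     *       int totalSentences = Integer.parseInt(result.get(0)); // first element: total sentence count
     *       int numMatching = result.size() - 1;                  // remaining elements: matching sentences
     *       for(int i = 1; i < result.size(); i++) {
     *           System.out.println(result.get(i));
     *       }
     *   }
     */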


    /** @param langCode is the 3 letter language code, ISO 639-2/3
     * https://www.loc.gov/standards/iso639-2/php/code_list.php
     * https://en.wikipedia.org/wiki/ISO_639-3
     * @return true if the input text is in the language denoted by langCode (e.g. mri for Maori)
     * with at least MINIMUM_CONFIDENCE confidence (DEFAULT_MINIMUM_CONFIDENCE if no custom
     * value was set).
     */
    public boolean isTextInLanguage(String langCode, String text) {
        // Get the most probable language
        Language bestLanguage = myCategorizer.predictLanguage(text);
        doPrint("Best language: " + bestLanguage.getLang());
        doPrint("Best language confidence: " + bestLanguage.getConfidence());

        return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Handles "smaller" text files/streams of text read in.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        StringBuilder text = new StringBuilder();
        String line = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending
        }
        return isTextInLanguage(langCode, text.toString());
    }
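
    /* A minimal caller-side sketch (the file path is only an example, and 'detector' is
     * assumed to be an instance of this class) of feeding a text file to the two
     * reader-based methods; the same approach works for stdin via
     * new BufferedReader(new InputStreamReader(System.in)):
     *
     *   try (BufferedReader reader = new BufferedReader(new FileReader("/path/to/textfile.txt"))) {
     *       boolean isMaori = detector.isTextInLanguage("mri", reader);
     *   }
     *
     * For very large files, isLargeTextInLanguage(langCode, reader) below can be used instead.
     */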


    /**
     * Rudimentary attempt to deal with very large files.
     * The return value is the same as for isTextInLanguage(String langCode, String text).
     */
    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
        // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file

        final int NUM_LINES = 100; // arbitrary chunk size: read 100 lines, predict language, accumulate confidence

        StringBuilder text = new StringBuilder();
        String line = null;

        double cumulativeConfidence = 0;
        int numLoops = 0;

        int i = 0;
        String language = null;

        while((line = reader.readLine()) != null) { // readLine removes the newline separator
            text.append(line + "\n"); // add back a (unix style) line ending

            i++; // count lines read into the current chunk

            if(i == NUM_LINES) { // NUM_LINES lines read: predict language, accumulate confidence

                Language bestLanguage = myCategorizer.predictLanguage(text.toString());
                if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current chunk differs from predicted lang of previous chunk
                    doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
                }
                language = bestLanguage.getLang();
                cumulativeConfidence += bestLanguage.getConfidence();

                doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");

                // finished analysing the language of NUM_LINES of text
                text = new StringBuilder();
                i = 0;
                numLoops++;
            }
        }

        // process any remaining text that was less than NUM_LINES lines
        // (the leftover lines are already in the text StringBuilder; line is null at this point)
        if(!text.toString().equals("")) {

            Language bestLanguage = myCategorizer.predictLanguage(text.toString());

            if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current chunk differs from predicted lang of previous chunk
                doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
            }
            language = bestLanguage.getLang();
            cumulativeConfidence += bestLanguage.getConfidence();
            doPrintErr("Best predicted language for final " + i + " lines: " + language + " (confidence: " + bestLanguage.getConfidence() + ")");
        }


        int totalLinesRead = numLoops * NUM_LINES + i; // not used
        int numPredictions = numLoops + (i > 0 ? 1 : 0); // the final, shorter chunk counts as one more prediction
        if(language == null) { // nothing was read from the reader
            return false;
        }
        double avgConfidence = cumulativeConfidence/numPredictions; // average confidence per chunk (the final chunk may have fewer lines than NUM_LINES)

        return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
    }


    /**
     * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
     * UNUSED.
     */
    public void predictedLanguages(String text) {
        // Get an array with the most probable languages

        Language[] languages = myCategorizer.predictLanguages(text);

        if(languages == null || languages.length <= 0) {
            doPrintErr("No languages predicted for the input text");
        } else {
            for(int i = 0; i < languages.length; i++) {
                doPrint("Language prediction " + i + ": " + languages[i]);
            }
        }

    }

    public void doPrint(String msg) {
        if(!this.silentMode) System.out.println(msg);
    }
    public void doPrintErr(String msg) {
        if(!this.silentMode) System.err.println(msg);
    }

}