source: gs3-extensions/maori-lang-detection/src/MaoriTextDetector.java@ 33355

Last change on this file since 33355 was 33355, checked in by ak19, 5 years ago

Changes for adding in the new gen_SentenceDetection_model.sh script, which automates generating a Sentence Detector model for the Maori language, mri-sent_trained.bin, trained on the mri-sent.train file generated by appropriately formatting the 100k Maori sentences file from the opennlp corpus 2011

File size: 18.0 KB
Line 
1/**
2 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
3 * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
4 * Internal functions can be used for detecting any of the 103 languages currently supported by
5 * the OpenNLP Language Detection Model.
6 *
7 * http://opennlp.apache.org/news/model-langdetect-183.html
8 * language detector model: http://opennlp.apache.org/models.html
9 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
10 * Use of Apache OpenNLP in general:
11 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
12 * Use of OpenNLP for language detection:
13 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
14 *
15 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
16 * See also the accompanying README file.
17 *
18 * July 2019
19 */
20
21import java.io.*;
22import opennlp.tools.langdetect.*;
23import opennlp.tools.util.*;
24
25/**
26 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
27 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
28 * (which is the language detection model zipped up and renamed to .bin extension).
29 *
30 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
31 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriTextDetector.java
32 *
33 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
34 *
35 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector --help
36 *
37 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector --file <full/path/to/textfile>
38 *
39 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector -
40 * Press enter. This variant of the program expects text to stream in from standard input.
41 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
42 *
43 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
44 * Also has information on how to run this class if it's in a Java package.
45 */
46public class MaoriTextDetector {
47 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
48 public static final String MAORI_3LETTER_CODE = "mri";
49 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
50
51 /** Configurable: cut off minimum confidence value,
52 greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */
53 public final double MINIMUM_CONFIDENCE;
54
55 /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */
56 public final boolean silentMode;
57
58 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
59 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
60
61 /**
62 * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
63 * Created once in the constructor, can be used as often as needed thereafter.
64 */
65 private LanguageDetector myCategorizer = null;
66
67 /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
68 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
69
70 /** test input string for a negative result */
71 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
72
73
74 public MaoriTextDetector(boolean silentMode) throws Exception {
75 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
76 }
77
78 public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
79 this.silentMode = silentMode;
80 this.MINIMUM_CONFIDENCE = min_confidence;
81
82 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
83 String langDetectModelPath = System.getenv("OPENNLP_HOME");
84 if(System.getenv("OPENNLP_HOME") == null) {
85 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
86 }
87 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
88 File langDetectModelBinFile = new File(langDetectModelPath);
89 if(!langDetectModelBinFile.exists()) {
90 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
91 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
92 }
93
94
95 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
96 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
97 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
98 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
99
100 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
101
102 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
103 this.myCategorizer = new LanguageDetectorME(model);
104 }/*catch(Exception e) {
105 e.printStackTrace();
106 }*/
107
108 // instantiating function should handle critical exceptions. Constructors shouldn't.
109 }
110
111 /**
112 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
113 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
114 */
115 public boolean isTextInMaori(String text) {
116 return isTextInLanguage(MAORI_3LETTER_CODE, text);
117 }
118
119 /** @param langCode is 3 letter language code, ISO 639-2/3
120 * https://www.loc.gov/standards/iso639-2/php/code_list.php
121 * https://en.wikipedia.org/wiki/ISO_639-3
122 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
123 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
124 */
125 public boolean isTextInLanguage(String langCode, String text) {
126 // Get the most probable language
127 Language bestLanguage = myCategorizer.predictLanguage(text);
128 doPrint("Best language: " + bestLanguage.getLang());
129 doPrint("Best language confidence: " + bestLanguage.getConfidence());
130
131 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
132 }
133
134
135 /**
136 * Handle "smaller" textfiles/streams of text read in.
137 * Return value is the same as for isTextInMaori(String text);
138 */
139 public boolean isTextInMaori(BufferedReader reader) throws Exception {
140 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
141 }
142 /**
143 * Handle "smaller" textfiles/streams of text read in.
144 * Return value is the same as for isTextInLanguage(String langCode, String text);
145 */
146 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
147 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
148
149 StringBuilder text = new StringBuilder();
150 String line = null;
151
152
153 while((line = reader.readLine()) != null) { // readLine removes newline separator
154 text.append(line + "\n"); // add back (unix style) line ending
155 }
156 return isTextInLanguage(langCode, text.toString());
157 }
158
159 /*
160 * Need better handling of "larger" textfiles/streams of text read in:
161 * what if multiple languages with high confidence every NUM_LINES read in?
162 * Does this mean the file is multi-lingual with each section dominated by a different language?
163 * How best to convey such information to the user?
164 */
165 /**
166 * Rudimentary attempt to deal with very large files.
167 * Return value is the same as for isTextInMaori(String text);
168 */
169 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
170 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
171 }
172
173 /**
174 * Rudimentary attempt to deal with very large files.
175 * Return value is the same as for isTextInLanguage(String langCode, String text);
176 */
177 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
178 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
179
180 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
181
182 StringBuilder text = new StringBuilder();
183 String line = null;
184
185 double cumulativeConfidence = 0;
186 int numLoops = 0;
187
188 int i = 0;
189 String language = null;
190
191 while((line = reader.readLine()) != null) { // readLine removes newline separator
192 text.append(line + "\n"); // add back (unix style) line ending
193
194 i++; // read nth line of numLoop
195
196
197 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
198
199
200 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
201 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
202 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
203 }
204 language = bestLanguage.getLang();
205 cumulativeConfidence += bestLanguage.getConfidence();
206
207 doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
208
209 // finished analysing language of NUM_LINES of text
210 text = new StringBuilder();
211 i = 0;
212 numLoops++;
213 }
214 }
215
216 // process any (remaining) text that was less than n NUM_LINES
217 if(!text.toString().equals("")) {
218 text.append(line + "\n"); // add back (unix style) line ending
219 i++;
220
221 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
222
223 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
224 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
225 }
226 language = bestLanguage.getLang();
227 cumulativeConfidence += bestLanguage.getConfidence();
228 doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
229 }
230
231
232 int totalLinesRead = numLoops * NUM_LINES + i; // not used
233 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
234
235
236 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
237 }
238
239
240 /**
241 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
242 * UNUSED.
243 */
244 public void predictedLanguages(String text) {
245 // Get an array with the most probable languages
246
247 Language[] languages = myCategorizer.predictLanguages(text);
248
249 if(languages == null || languages.length <= 0) {
250 doPrintErr("No languages predicted for the input text");
251 } else {
252 for(int i = 0; i < languages.length; i++) {
253 doPrint("Language prediction " + i + ": " + languages[i]);
254 }
255 }
256
257 }
258
259 public void doPrint(String msg) {
260 doPrint(this.silentMode, msg);
261 }
262 public void doPrintErr(String msg) {
263 doPrintErr(this.silentMode, msg);
264 }
265
266 /********** STATIC METHODS *************/
267
268 public static void doPrint(boolean runSilent, String msg) {
269 if(!runSilent) System.out.println(msg);
270 }
271 public static void doPrintErr(boolean runSilent, String msg) {
272 if(!runSilent) System.err.println(msg);
273 }
274
275 public static void printUsage() {
276 System.err.println("Run this program with:");
277 System.err.println("\t--help (-h)\tfor seeing this usage message again");
278 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
279 System.err.println("\t--file (-f)\tto provide an input file path");
280 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
281 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
282 System.err.println("\t\tof the predicted language that will be considered acceptable.");
283 System.err.println();
284 System.err.println("This program terminates with exit value:");
285 System.err.println("\t0 if the input text is in Maori");
286 System.err.println("\t1 if input text is not in Maori");
287 System.err.println();
288 System.err.println("\t-1 if the input arguments were wrong");
289 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
290 System.err.println("\t2 if the user asked to run this program with --help/-h.");
291 System.err.println();
292 }
293
294 /**
295 * The main program exits with:
296 * 0 if text is in Maori;
297 * 1 if text is not in Maori;
298 *
299 * -1 if the input arguments were wrong
300 * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
301 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
302 * 2 if the user asked to run this program with --help/-h.
303 */
304 public static void main(String args[]) {
305 int returnVal = -1;
306
307 // 1. Check input arguments
308 boolean printUsage = false;
309 boolean readFromStdIn = false;
310 File inFile = null;
311 boolean runSilent = false;
312 double minConfidence = -1;
313
314 for (int i = 0; !printUsage && i < args.length; i++) {
315
316 // check for help first and quit after printing usage
317 if(args[i].equals("--help") || args[i].equals("-h")) {
318 printUsage = true;
319 returnVal = 2;
320 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
321 runSilent = true;
322 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
323 i++;
324 if(i >= args.length) {
325 doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
326 printUsage = true;
327 returnVal = -1;
328 } else {
329 try {
330 minConfidence = Double.parseDouble(args[i]);
331 if(minConfidence < 0 || minConfidence > 1) {
332 throw new NumberFormatException("Number out of range, must be between 0-1");
333 }
334 } catch(NumberFormatException nfe) {
335 doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
336 printUsage = true;
337 returnVal = -1;
338 }
339 }
340 } else if(args[i].equals("-")) {
341 readFromStdIn = true;
342 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
343 } else if(args[i].equals("--file") || args[i].equals("-f")) {
344 i++;
345 if(i >= args.length) {
346 doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
347 printUsage = true;
348 returnVal = -1;
349 } else {
350 String filePath = args[i];
351 inFile = new File(filePath);
352 if(!inFile.isFile()) {
353 doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
354 printUsage = true;
355 returnVal = -1;
356 }
357 }
358 } else { // unrecognised input argument
359 doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
360 printUsage = true;
361 returnVal = -1;
362 }
363 }
364
365 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
366 doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
367 printUsage = true;
368 returnVal = -1;
369 }
370
371 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
372 doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
373 printUsage = true;
374 returnVal = -1;
375 }
376
377 if(printUsage) {
378 // If not running silent print usage.
379 // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
380 if(returnVal == 2 || !runSilent) {
381 printUsage();
382 }
383 System.exit(returnVal);
384 }
385
386
387 // 2. Finally, we can now do the actual language detection
388 try {
389 MaoriTextDetector maoriTextDetector = null;
390 if(minConfidence == -1) {
391 maoriTextDetector = new MaoriTextDetector(runSilent);
392 } else {
393 maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
394 }
395
396 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
397 boolean textIsInMaori = false;
398
399 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
400 if(inFile != null) {
401 doPrint(runSilent, "Reading text from file " + inFile.getPath());
402 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
403 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
404 } // let outer try deal with any file/reading exceptions
405 }
406 else if (readFromStdIn) {
407 doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
408 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
409 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
410 } // let outer try deal with any file/reading exceptions
411 }
412
413 if(textIsInMaori) {
414 returnVal = 0;
415 } else {
416 returnVal = 1;
417 }
418
419 } catch(Exception e) {
420 e.printStackTrace();
421
422 } finally {
423 doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
424 System.exit(returnVal);
425 }
426 }
427
428}
Note: See TracBrowser for help on using the repository browser.