source: gs3-extensions/maori-lang-detection/src/MaoriTextDetector.java@ 33350

Last change on this file since 33350 was 33350, checked in by ak19, 5 years ago

Better comments. Tested macronised vs unmacronised Māori language test string and both are detected as mri, but the unmacronised is detected with lower confidence. Added a note on that in the README.

File size: 17.9 KB
Line 
1/**
2 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
3 * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
4 * Internal functions can be used for detecting any of the 103 languages currently supported by
5 * the OpenNLP Language Detection Model.
6 *
7 * http://opennlp.apache.org/news/model-langdetect-183.html
8 * language detector model: http://opennlp.apache.org/models.html
9 * Use of Apache OpenNLP in general:
10 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
11 * Use of OpenNLP for language detection:
12 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
13 *
14 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
15 * See also the accompanying README file.
16 *
17 * July 2019
18 */
19
20import java.io.*;
21import opennlp.tools.langdetect.*;
22import opennlp.tools.util.*;
23
24/**
25 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
26 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
27 * (which is the language detection model zipped up and renamed to .bin extension).
28 *
29 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
30 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriTextDetector.java
31 *
32 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
33 *
34 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector --help
35 *
36 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector --file <full/path/to/textfile>
37 *
38 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriTextDetector -
39 * Press enter. This variant of the program expects text to stream in from standard input.
40 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
41 *
42 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
43 * Also has information on how to run this class if it's in a Java package.
44 */
45public class MaoriTextDetector {
46 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
47 public static final String MAORI_3LETTER_CODE = "mri";
48 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
49
50 /** Configurable: cut off minimum confidence value,
51 greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */
52 public final double MINIMUM_CONFIDENCE;
53
54 /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */
55 public final boolean silentMode;
56
57 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
58 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
59
60 /**
61 * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
62 * Created once in the constructor, can be used as often as needed thereafter.
63 */
64 private LanguageDetector myCategorizer = null;
65
66 /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
67 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
68
69 /** test input string for a negative result */
70 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
71
72
73 public MaoriTextDetector(boolean silentMode) throws Exception {
74 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
75 }
76
77 public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
78 this.silentMode = silentMode;
79 this.MINIMUM_CONFIDENCE = min_confidence;
80
81 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
82 String langDetectModelPath = System.getenv("OPENNLP_HOME");
83 if(System.getenv("OPENNLP_HOME") == null) {
84 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
85 }
86 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
87 File langDetectModelBinFile = new File(langDetectModelPath);
88 if(!langDetectModelBinFile.exists()) {
89 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
90 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
91 }
92
93
94 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
95 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
96 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
97 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
98
99 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
100
101 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
102 this.myCategorizer = new LanguageDetectorME(model);
103 }/*catch(Exception e) {
104 e.printStackTrace();
105 }*/
106
107 // instantiating function should handle critical exceptions. Constructors shouldn't.
108 }
109
110 /**
111 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
112 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
113 */
114 public boolean isTextInMaori(String text) {
115 return isTextInLanguage(MAORI_3LETTER_CODE, text);
116 }
117
118 /** @param langCode is 3 letter language code, ISO 639-2/3
119 * https://www.loc.gov/standards/iso639-2/php/code_list.php
120 * https://en.wikipedia.org/wiki/ISO_639-3
121 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
122 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
123 */
124 public boolean isTextInLanguage(String langCode, String text) {
125 // Get the most probable language
126 Language bestLanguage = myCategorizer.predictLanguage(text);
127 doPrint("Best language: " + bestLanguage.getLang());
128 doPrint("Best language confidence: " + bestLanguage.getConfidence());
129
130 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
131 }
132
133
134 /**
135 * Handle "smaller" textfiles/streams of text read in.
136 * Return value is the same as for isTextInMaori(String text);
137 */
138 public boolean isTextInMaori(BufferedReader reader) throws Exception {
139 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
140 }
141 /**
142 * Handle "smaller" textfiles/streams of text read in.
143 * Return value is the same as for isTextInLanguage(String langCode, String text);
144 */
145 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
146 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
147
148 StringBuilder text = new StringBuilder();
149 String line = null;
150
151
152 while((line = reader.readLine()) != null) { // readLine removes newline separator
153 text.append(line + "\n"); // add back (unix style) line ending
154 }
155 return isTextInLanguage(langCode, text.toString());
156 }
157
158 /*
159 * Need better handling of "larger" textfiles/streams of text read in:
160 * what if multiple languages with high confidence every NUM_LINES read in?
161 * Does this mean the file is multi-lingual with each section dominated by a different language?
162 * How best to convey such information to the user?
163 */
164 /**
165 * Rudimentary attempt to deal with very large files.
166 * Return value is the same as for isTextInMaori(String text);
167 */
168 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
169 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
170 }
171
172 /**
173 * Rudimentary attempt to deal with very large files.
174 * Return value is the same as for isTextInLanguage(String langCode, String text);
175 */
176 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
177 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
178
179 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
180
181 StringBuilder text = new StringBuilder();
182 String line = null;
183
184 double cumulativeConfidence = 0;
185 int numLoops = 0;
186
187 int i = 0;
188 String language = null;
189
190 while((line = reader.readLine()) != null) { // readLine removes newline separator
191 text.append(line + "\n"); // add back (unix style) line ending
192
193 i++; // read nth line of numLoop
194
195
196 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
197
198
199 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
200 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
201 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
202 }
203 language = bestLanguage.getLang();
204 cumulativeConfidence += bestLanguage.getConfidence();
205
206 doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
207
208 // finished analysing language of NUM_LINES of text
209 text = new StringBuilder();
210 i = 0;
211 numLoops++;
212 }
213 }
214
215 // process any (remaining) text that was less than n NUM_LINES
216 if(!text.toString().equals("")) {
217 text.append(line + "\n"); // add back (unix style) line ending
218 i++;
219
220 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
221
222 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
223 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
224 }
225 language = bestLanguage.getLang();
226 cumulativeConfidence += bestLanguage.getConfidence();
227 doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
228 }
229
230
231 int totalLinesRead = numLoops * NUM_LINES + i; // not used
232 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
233
234
235 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
236 }
237
238
239 /**
240 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
241 * UNUSED.
242 */
243 public void predictedLanguages(String text) {
244 // Get an array with the most probable languages
245
246 Language[] languages = myCategorizer.predictLanguages(text);
247
248 if(languages == null || languages.length <= 0) {
249 doPrintErr("No languages predicted for the input text");
250 } else {
251 for(int i = 0; i < languages.length; i++) {
252 doPrint("Language prediction " + i + ": " + languages[i]);
253 }
254 }
255
256 }
257
258 public void doPrint(String msg) {
259 doPrint(this.silentMode, msg);
260 }
261 public void doPrintErr(String msg) {
262 doPrintErr(this.silentMode, msg);
263 }
264
265 /********** STATIC METHODS *************/
266
267 public static void doPrint(boolean runSilent, String msg) {
268 if(!runSilent) System.out.println(msg);
269 }
270 public static void doPrintErr(boolean runSilent, String msg) {
271 if(!runSilent) System.err.println(msg);
272 }
273
274 public static void printUsage() {
275 System.err.println("Run this program with:");
276 System.err.println("\t--help (-h)\tfor seeing this usage message again");
277 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
278 System.err.println("\t--file (-f)\tto provide an input file path");
279 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
280 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
281 System.err.println("\t\tof the predicted language that will be considered acceptable.");
282 System.err.println();
283 System.err.println("This program terminates with exit value:");
284 System.err.println("\t0 if the input text is in Maori");
285 System.err.println("\t1 if input text is not in Maori");
286 System.err.println();
287 System.err.println("\t-1 if the input arguments were wrong");
288 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
289 System.err.println("\t2 if the user asked to run this program with --help/-h.");
290 System.err.println();
291 }
292
293 /**
294 * The main program exits with:
295 * 0 if text is in Maori;
296 * 1 if text is not in Maori;
297 *
298 * -1 if the input arguments were wrong
299 * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
300 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
301 * 2 if the user asked to run this program with --help/-h.
302 */
303 public static void main(String args[]) {
304 int returnVal = -1;
305
306 // 1. Check input arguments
307 boolean printUsage = false;
308 boolean readFromStdIn = false;
309 File inFile = null;
310 boolean runSilent = false;
311 double minConfidence = -1;
312
313 for (int i = 0; !printUsage && i < args.length; i++) {
314
315 // check for help first and quit after printing usage
316 if(args[i].equals("--help") || args[i].equals("-h")) {
317 printUsage = true;
318 returnVal = 2;
319 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
320 runSilent = true;
321 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
322 i++;
323 if(i >= args.length) {
324 doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
325 printUsage = true;
326 returnVal = -1;
327 } else {
328 try {
329 minConfidence = Double.parseDouble(args[i]);
330 if(minConfidence < 0 || minConfidence > 1) {
331 throw new NumberFormatException("Number out of range, must be between 0-1");
332 }
333 } catch(NumberFormatException nfe) {
334 doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
335 printUsage = true;
336 returnVal = -1;
337 }
338 }
339 } else if(args[i].equals("-")) {
340 readFromStdIn = true;
341 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
342 } else if(args[i].equals("--file") || args[i].equals("-f")) {
343 i++;
344 if(i >= args.length) {
345 doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
346 printUsage = true;
347 returnVal = -1;
348 } else {
349 String filePath = args[i];
350 inFile = new File(filePath);
351 if(!inFile.isFile()) {
352 doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
353 printUsage = true;
354 returnVal = -1;
355 }
356 }
357 } else { // unrecognised input argument
358 doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
359 printUsage = true;
360 returnVal = -1;
361 }
362 }
363
364 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
365 doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
366 printUsage = true;
367 returnVal = -1;
368 }
369
370 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
371 doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
372 printUsage = true;
373 returnVal = -1;
374 }
375
376 if(printUsage) {
377 // If not running silent print usage.
378 // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
379 if(returnVal == 2 || !runSilent) {
380 printUsage();
381 }
382 System.exit(returnVal);
383 }
384
385
386 // 2. Finally, we can now do the actual language detection
387 try {
388 MaoriTextDetector maoriTextDetector = null;
389 if(minConfidence == -1) {
390 maoriTextDetector = new MaoriTextDetector(runSilent);
391 } else {
392 maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
393 }
394
395 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
396 boolean textIsInMaori = false;
397
398 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
399 if(inFile != null) {
400 doPrint(runSilent, "Reading text from file " + inFile.getPath());
401 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
402 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
403 } // let outer try deal with any file/reading exceptions
404 }
405 else if (readFromStdIn) {
406 doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
407 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
408 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
409 } // let outer try deal with any file/reading exceptions
410 }
411
412 if(textIsInMaori) {
413 returnVal = 0;
414 } else {
415 returnVal = 1;
416 }
417
418 } catch(Exception e) {
419 e.printStackTrace();
420
421 } finally {
422 doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
423 System.exit(returnVal);
424 }
425 }
426
427}
Note: See TracBrowser for help on using the repository browser.