source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java@ 33577

Last change on this file since 33577 was 33577, checked in by ak19, 5 years ago

Forgot to adjust usage statement to say that silent mode was already implemented and working in previous commits to MaoriTextDetector.java

File size: 18.3 KB
Line 
1/**
2 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
3 * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
4 * Internal functions can be used for detecting any of the 103 languages currently supported by
5 * the OpenNLP Language Detection Model.
6 *
7 * http://opennlp.apache.org/news/model-langdetect-183.html
8 * language detector model: http://opennlp.apache.org/models.html
9 * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
10 * Use of Apache OpenNLP in general:
11 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
12 * Use of OpenNLP for language detection:
13 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
14 *
15 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
16 * See also the accompanying README file.
17 *
18 * July 2019
19 */
20
21package org.greenstone.atea;
22
23import java.io.*;
24import opennlp.tools.langdetect.*;
25import opennlp.tools.util.*;
26
27/**
28 * Export the OPENNLP_HOME environment variable, set to the folder of your Apache OpenNLP installation.
29 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
30 * (which is the language detection model zipped up and renamed to .bin extension).
31 *
32 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
33 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
34 *
35 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
36 *
37 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
38 *
39 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
40 *
41 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
42 * Press enter. This variant of the program expects text to stream in from standard input.
43 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
44 *
45 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
46 * Also has information on how to run this class if it's in a Java package.
47 */
48public class MaoriTextDetector {
49 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
50 public static final String MAORI_3LETTER_CODE = "mri";
51 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
52
53 /** Configurable: cut off minimum confidence value,
54 greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */
55 public final double MINIMUM_CONFIDENCE;
56
57 /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */
58 public final boolean silentMode;
59
60 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
61 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
62
63 /**
64 * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
65 * Created once in the constructor, can be used as often as needed thereafter.
66 */
67 private LanguageDetector myCategorizer = null;
68
69 /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
70 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
71
72 /** test input string for a negative result */
73 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
74
75
/**
 * Creates a detector that uses DEFAULT_MINIMUM_CONFIDENCE as its confidence cut-off.
 * Simply delegates to the two-argument constructor.
 *
 * @param silentMode true to suppress the messages printed via doPrint()/doPrintErr()
 * @throws Exception whenever the two-argument constructor throws
 */
public MaoriTextDetector(boolean silentMode) throws Exception {
    this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
}
79
80 public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
81 this.silentMode = silentMode;
82 this.MINIMUM_CONFIDENCE = min_confidence;
83
84 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
85 String langDetectModelPath = System.getenv("OPENNLP_HOME");
86 if(System.getenv("OPENNLP_HOME") == null) {
87 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
88 }
89 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
90 File langDetectModelBinFile = new File(langDetectModelPath);
91 if(!langDetectModelBinFile.exists()) {
92 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
93 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
94 }
95
96
97 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
98 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
99 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
100 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
101
102 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
103
104 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
105 this.myCategorizer = new LanguageDetectorME(model);
106 }/*catch(Exception e) {
107 e.printStackTrace();
108 }*/
109
110 // instantiating function should handle critical exceptions. Constructors shouldn't.
111 }
112
113 /**
114 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
115 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
116 */
117 public boolean isTextInMaori(String text) {
118 return isTextInLanguage(MAORI_3LETTER_CODE, text);
119 }
120
121 /** @param langCode is 3 letter language code, ISO 639-2/3
122 * https://www.loc.gov/standards/iso639-2/php/code_list.php
123 * https://en.wikipedia.org/wiki/ISO_639-3
124 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
125 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
126 */
127 public boolean isTextInLanguage(String langCode, String text) {
128 // Get the most probable language
129 Language bestLanguage = myCategorizer.predictLanguage(text);
130 doPrint("Best language: " + bestLanguage.getLang());
131 doPrint("Best language confidence: " + bestLanguage.getConfidence());
132
133 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
134 }
135
136
137 /**
138 * Handle "smaller" textfiles/streams of text read in.
139 * Return value is the same as for isTextInMaori(String text);
140 */
141 public boolean isTextInMaori(BufferedReader reader) throws Exception {
142 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
143 }
144 /**
145 * Handle "smaller" textfiles/streams of text read in.
146 * Return value is the same as for isTextInLanguage(String langCode, String text);
147 */
148 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
149 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
150
151 StringBuilder text = new StringBuilder();
152 String line = null;
153
154
155 while((line = reader.readLine()) != null) { // readLine removes newline separator
156 text.append(line + "\n"); // add back (unix style) line ending
157 }
158 return isTextInLanguage(langCode, text.toString());
159 }
160
161 /*
162 * Need better handling of "larger" textfiles/streams of text read in:
163 * what if multiple languages with high confidence every NUM_LINES read in?
164 * Does this mean the file is multi-lingual with each section dominated by a different language?
165 * How best to convey such information to the user?
166 */
167 /**
168 * Rudimentary attempt to deal with very large files.
169 * Return value is the same as for isTextInMaori(String text);
170 */
171 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
172 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
173 }
174
175 /**
176 * Rudimentary attempt to deal with very large files.
177 * Return value is the same as for isTextInLanguage(String langCode, String text);
178 */
179 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
180 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
181
182 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
183
184 StringBuilder text = new StringBuilder();
185 String line = null;
186
187 double cumulativeConfidence = 0;
188 int numLoops = 0;
189
190 int i = 0;
191 String language = null;
192
193 while((line = reader.readLine()) != null) { // readLine removes newline separator
194 text.append(line + "\n"); // add back (unix style) line ending
195
196 i++; // read nth line of numLoop
197
198
199 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
200
201
202 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
203 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
204 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
205 }
206 language = bestLanguage.getLang();
207 cumulativeConfidence += bestLanguage.getConfidence();
208
209 doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
210
211 // finished analysing language of NUM_LINES of text
212 text = new StringBuilder();
213 i = 0;
214 numLoops++;
215 }
216 }
217
218 // process any (remaining) text that was less than n NUM_LINES
219 if(!text.toString().equals("")) {
220 text.append(line + "\n"); // add back (unix style) line ending
221 i++;
222
223 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
224
225 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
226 doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
227 }
228 language = bestLanguage.getLang();
229 cumulativeConfidence += bestLanguage.getConfidence();
230 doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
231 }
232
233
234 int totalLinesRead = numLoops * NUM_LINES + i; // not used
235 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
236
237
238 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
239 }
240
241
242 /**
243 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
244 * UNUSED.
245 */
246 public void predictedLanguages(String text) {
247 // Get an array with the most probable languages
248
249 Language[] languages = myCategorizer.predictLanguages(text);
250
251 if(languages == null || languages.length <= 0) {
252 doPrintErr("No languages predicted for the input text");
253 } else {
254 for(int i = 0; i < languages.length; i++) {
255 doPrint("Language prediction " + i + ": " + languages[i]);
256 }
257 }
258
259 }
260
261 public void doPrint(String msg) {
262 doPrint(this.silentMode, msg);
263 }
264 public void doPrintErr(String msg) {
265 doPrintErr(this.silentMode, msg);
266 }
267
268 /********** STATIC METHODS *************/
269
270 public static void doPrint(boolean runSilent, String msg) {
271 if(!runSilent) System.out.println(msg);
272 }
273 public static void doPrintErr(boolean runSilent, String msg) {
274 if(!runSilent) System.err.println(msg);
275 }
276
277 public static void printUsage() {
278 System.err.println("Run this program with:");
279 System.err.println("\t--help (-h)\tfor seeing this usage message again");
280 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
281 System.err.println("\t--file (-f)\tto provide an input file path");
282 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value.");
283 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
284 System.err.println("\t\tof the predicted language that will be considered acceptable.");
285 System.err.println();
286 System.err.println("This program terminates with exit value:");
287 System.err.println("\t0 if the input text is in Maori");
288 System.err.println("\t1 if input text is not in Maori");
289 System.err.println();
290 System.err.println("\t-1 if the input arguments were wrong");
291 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
292 System.err.println("\t2 if the user asked to run this program with --help/-h.");
293 System.err.println();
294 }
295
296 /**
297 * The main program exits with:
298 * 0 if text is in Maori;
299 * 1 if text is not in Maori;
300 *
301 * -1 if the input arguments were wrong
302 * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
303 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
304 * 2 if the user asked to run this program with --help/-h.
305 */
306 public static void main(String args[]) {
307 int returnVal = -1;
308
309 // 1. Check input arguments
310 boolean printUsage = false;
311 boolean readFromStdIn = false;
312 File inFile = null;
313 boolean runSilent = false;
314 double minConfidence = -1;
315
316 for (int i = 0; !printUsage && i < args.length; i++) {
317
318 // check for help first and quit after printing usage
319 if(args[i].equals("--help") || args[i].equals("-h")) {
320 printUsage = true;
321 returnVal = 2;
322 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
323 runSilent = true;
324 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
325 i++;
326 if(i >= args.length) {
327 doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
328 printUsage = true;
329 returnVal = -1;
330 } else {
331 try {
332 minConfidence = Double.parseDouble(args[i]);
333 if(minConfidence < 0 || minConfidence > 1) {
334 throw new NumberFormatException("Number out of range, must be between 0-1");
335 }
336 } catch(NumberFormatException nfe) {
337 doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
338 printUsage = true;
339 returnVal = -1;
340 }
341 }
342 } else if(args[i].equals("-")) {
343 readFromStdIn = true;
344 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
345 } else if(args[i].equals("--file") || args[i].equals("-f")) {
346 i++;
347 if(i >= args.length) {
348 doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
349 printUsage = true;
350 returnVal = -1;
351 } else {
352 String filePath = args[i];
353 inFile = new File(filePath);
354 if(!inFile.isFile()) {
355 doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
356 printUsage = true;
357 returnVal = -1;
358 }
359 }
360 } else { // unrecognised input argument
361 doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
362 printUsage = true;
363 returnVal = -1;
364 }
365 }
366
367 if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
368 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
369 doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
370 printUsage = true;
371 returnVal = -1;
372 }
373
374 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
375 doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
376 printUsage = true;
377 returnVal = -1;
378 }
379 }
380
381 if(printUsage) {
382 // If not running silent print usage.
383 // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
384 if(returnVal == 2 || !runSilent) {
385 printUsage();
386 }
387 System.exit(returnVal);
388 }
389
390
391 // 2. Finally, we can now do the actual language detection
392 try {
393 MaoriTextDetector maoriTextDetector = null;
394 if(minConfidence == -1) {
395 maoriTextDetector = new MaoriTextDetector(runSilent);
396 } else {
397 maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
398 }
399
400 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
401 boolean textIsInMaori = false;
402
403 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
404 if(inFile != null) {
405 doPrint(runSilent, "Reading text from file " + inFile.getPath());
406 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
407 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
408 } // let outer try deal with any file/reading exceptions
409 }
410 else if (readFromStdIn) {
411 doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
412 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
413 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
414 } // let outer try deal with any file/reading exceptions
415 }
416
417 if(textIsInMaori) {
418 returnVal = 0;
419 } else {
420 returnVal = 1;
421 }
422
423 } catch(Exception e) {
424 e.printStackTrace();
425
426 } finally {
427 doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
428 System.exit(returnVal);
429 }
430 }
431
432}
Note: See TracBrowser for help on using the repository browser.