source: gs3-extensions/maori-lang-detection/src/MaoriDetector.java@ 33336

Last change on this file since 33336 was 33336, checked in by ak19, 5 years ago

Major rewrite to make this class more useful to callers. MaoriDetector.java, soon to be renamed, can now be called with a filepath to a txt file or with the hyphen to indicate it should read text from std input (terminate with Ctrl-D as usual). Then it will predict whether the language of the text is in Maori or not depending on the min confidence cut-off value, which presently defaults to 0.5. Still need to support silent mode and update the README with instructions. Now the language detection model file needed by OpenNLP to do the language detection needs to live at OPENNLP_HOME/models/langdetect-183.bin

File size: 16.9 KB
Line 
1/**
2 * http://opennlp.apache.org/news/model-langdetect-183.html
3 * language detector model: http://opennlp.apache.org/models.html
4 * Use of Apache OpenNLP in general:
5 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
6 * Use of OpenNLP for language detection:
7 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
8 *
9 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
10 */
11
12import java.io.*;
13import opennlp.tools.langdetect.*;
14import opennlp.tools.util.*;
15
16/**
17 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
18 * Then, to compile this program:
19 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriDetector.java
20 * To run this program, one of:
21 *
22 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --help
23 *
24 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --file <full/path/to/textfile>
25 *
26 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector -
27 * which expects text to stream in from standard input.
28 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
29 *
30 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
31 * Also has information on how to run this class if it's in a Java package.
32 */
33public class MaoriDetector {
34 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
35 public static final String MAORI_3LETTER_CODE = "mri";
36 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
37
38 /** Configurable: cut off minimum confidence value,
39 greater or equal to which determines that the best predicted language is acceptable to user of MaoriDetector. */
40 public final double MINIMUM_CONFIDENCE;
41 /** silentMode set to false means MaoriDetector won't print helpful messages while running. Set to true to run silently. */
42 public final boolean silentMode;
43
44 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
45 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
46 private LanguageDetector myCategorizer = null;
47
48 /**
49 * String taken from our university website
50 * https://www.waikato.ac.nz/maori/
51 */
52 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
53
54 /** test input string for a negative result */
55 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
56
57
58 public MaoriDetector(boolean silentMode) throws Exception {
59 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
60 }
61
62 public MaoriDetector(boolean silentMode, double min_confidence) throws Exception {
63 this.silentMode = silentMode;
64 this.MINIMUM_CONFIDENCE = min_confidence;
65
66 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
67 String langDetectModelPath = System.getenv("OPENNLP_HOME");
68 if(System.getenv("OPENNLP_HOME") == null) {
69 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
70 }
71 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
72 File langDetectModelBinFile = new File(langDetectModelPath);
73 if(!langDetectModelBinFile.exists()) {
74 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
75 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
76 }
77
78
79 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
80 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
81 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
82 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
83
84 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
85
86 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
87 this.myCategorizer = new LanguageDetectorME(model);
88 }/*catch(Exception e) {
89 e.printStackTrace();
90 }*/
91
92 // instantiating function should handle critical exceptions. Constructors shouldn't.
93 }
94
95 /**
96 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
97 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
98 */
99 public boolean isTextInMaori(String text) {
100 return isTextInLanguage(MAORI_3LETTER_CODE, text);
101 }
102
103 /** @param langCode is 3 letter language code, ISO 639-2/3
104 * https://www.loc.gov/standards/iso639-2/php/code_list.php
105 * https://en.wikipedia.org/wiki/ISO_639-3
106 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
107 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
108 */
109 public boolean isTextInLanguage(String langCode, String text) {
110 // Get the most probable language
111 Language bestLanguage = myCategorizer.predictLanguage(text);
112 System.out.println("Best language: " + bestLanguage.getLang());
113 System.out.println("Best language confidence: " + bestLanguage.getConfidence());
114
115 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
116 }
117
118
119 /**
120 * Handle "smaller" textfiles/streams of text read in.
121 * Return value is the same as for isTextInMaori(String text);
122 */
123 public boolean isTextInMaori(BufferedReader reader) throws Exception {
124 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
125 }
126 /**
127 * Handle "smaller" textfiles/streams of text read in.
128 * Return value is the same as for isTextInLanguage(String langCode, String text);
129 */
130 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
131 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
132
133 StringBuilder text = new StringBuilder();
134 String line = null;
135
136
137 while((line = reader.readLine()) != null) { // readLine removes newline separator
138 text.append(line + "\n"); // add back (unix style) line ending
139 }
140 return isTextInLanguage(langCode, text.toString());
141 }
142
143 /*
144 * Need better handling of "larger" textfiles/streams of text read in:
145 * what if multiple languages with high confidence every NUM_LINES read in?
146 * Does this mean the file is multi-lingual with each section dominated by a different language?
147 * How best to convey such information to the user?
148 */
149 /**
150 * Rudimentary attempt to deal with very large files.
151 * Return value is the same as for isTextInMaori(String text);
152 */
153 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
154 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
155 }
156
157 /**
158 * Rudimentary attempt to deal with very large files.
159 * Return value is the same as for isTextInLanguage(String langCode, String text);
160 */
161 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
162 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
163
164 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
165
166 StringBuilder text = new StringBuilder();
167 String line = null;
168
169 double cumulativeConfidence = 0;
170 int numLoops = 0;
171
172 int i = 0;
173 String language = null;
174
175 while((line = reader.readLine()) != null) { // readLine removes newline separator
176 text.append(line + "\n"); // add back (unix style) line ending
177
178 i++; // read nth line of numLoop
179
180
181 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
182
183
184 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
185 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
186 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
187 }
188 language = bestLanguage.getLang();
189 cumulativeConfidence += bestLanguage.getConfidence();
190
191 System.err.println("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
192
193 // finished analysing language of NUM_LINES of text
194 text = new StringBuilder();
195 i = 0;
196 numLoops++;
197 }
198 }
199
200 // process any (remaining) text that was less than n NUM_LINES
201 if(!text.toString().equals("")) {
202 text.append(line + "\n"); // add back (unix style) line ending
203 i++;
204
205 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
206
207 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
208 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
209 }
210 language = bestLanguage.getLang();
211 cumulativeConfidence += bestLanguage.getConfidence();
212 System.err.println("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
213 }
214
215
216 int totalLinesRead = numLoops * NUM_LINES + i; // not used
217 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
218
219
220 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
221 }
222
223
224
225 /**
226 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
227 * Unused.
228 */
229 public void predictedLanguages(String text) {
230 // Get an array with the most probable languages
231
232 Language[] languages = myCategorizer.predictLanguages(text);
233
234 if(languages == null || languages.length <= 0) {
235 System.err.println("No languages predicted for the input text");
236 } else {
237 for(int i = 0; i < languages.length; i++) {
238 System.out.println("Language prediction " + i + ": " + languages[i]);
239 }
240 }
241
242 }
243
244 public static void printUsage() {
245 System.err.println("Run this program with:");
246 System.err.println("\t--help (-h)\tfor seeing this usage message again");
247 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
248 System.err.println("\t--file (-f)\tto provide an input file path");
249 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
250 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
251 System.err.println("\t\tof the predicted language that will be considered acceptable.");
252 System.err.println();
253 System.err.println("This program terminates with exit value:");
254 System.err.println("\t0 if the input text is in Maori");
255 System.err.println("\t1 if input text is not in Maori");
256 System.err.println();
257 System.err.println("\t-1 if the input arguments were wrong");
258 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language");
259 System.err.println("\t2 if the user asked to run this program with --help/-h.");
260 System.err.println();
261 }
262
263 /**
264 * The main program exits with:
265 * 0 if text is in Maori;
266 * 1 if text is not in Maori;
267 *
268 * -1 if the input arguments were wrong
269 * 255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language
270 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
271 * 2 if the user asked to run this program with --help/-h.
272 */
273 public static void main(String args[]) {
274 int returnVal = -1;
275
276 // 1. Check input arguments
277 boolean printUsage = false;
278 boolean readFromStdIn = false;
279 File inFile = null;
280 boolean runSilent = false;
281 double minConfidence = -1;
282
283 for (int i = 0; !printUsage && i < args.length; i++) {
284
285 // check for help first and quit after printing usage
286 if(args[i].equals("--help") || args[i].equals("-h")) {
287 printUsage = true;
288 returnVal = 2;
289 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
290 runSilent = true;
291 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
292 i++;
293 if(i >= args.length) {
294 System.err.println("ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
295 printUsage = true;
296 returnVal = -1;
297 } else {
298 try {
299 minConfidence = Double.parseDouble(args[i]);
300 if(minConfidence < 0 || minConfidence > 1) {
301 throw new NumberFormatException("Number out of range, must be between 0-1");
302 }
303 } catch(NumberFormatException nfe) {
304 System.err.println("ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
305 printUsage = true;
306 returnVal = -1;
307 }
308 }
309 } else if(args[i].equals("-")) {
310 readFromStdIn = true;
311 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
312 } else if(args[i].equals("--file") || args[i].equals("-f")) {
313 i++;
314 if(i >= args.length) {
315 System.err.println("ERROR: No input file provided with --file|-f flag.\n");
316 printUsage = true;
317 returnVal = -1;
318 } else {
319 String filePath = args[i];
320 inFile = new File(filePath);
321 if(!inFile.isFile()) {
322 System.err.println("ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
323 printUsage = true;
324 returnVal = -1;
325 }
326 }
327 } else { // unrecognised input argument
328 System.err.println("ERROR: Unrecognised " + i + "th argument to this program.\n");
329 printUsage = true;
330 returnVal = -1;
331 }
332 }
333
334 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
335 System.err.println("ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
336 printUsage = true;
337 returnVal = -1;
338 }
339
340 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
341 System.err.println("ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
342 printUsage = true;
343 returnVal = -1;
344 }
345
346 if(printUsage) {
347 if(!runSilent || returnVal == 2) { // if expressly asked for help or not running silent
348 printUsage();
349 }
350 System.exit(returnVal);
351 }
352
353 try {
354 MaoriDetector maoriTextDetector = null;
355 if(minConfidence == -1) {
356 maoriTextDetector = new MaoriDetector(runSilent);
357 } else {
358 maoriTextDetector = new MaoriDetector(runSilent, minConfidence);
359 }
360
361 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
362 boolean textIsInMaori = false;
363
364 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
365 if(inFile != null) {
366 System.err.println("Reading text from file " + inFile.getPath());
367 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
368 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
369 } // let outer try deal with any file/reading exceptions
370 }
371 else if (readFromStdIn) {
372 System.err.println("Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
373 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
374 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
375 } // let outer try deal with any file/reading exceptions
376 }
377
378 if(textIsInMaori) {
379 returnVal = 0;
380 } else {
381 returnVal = 1;
382 }
383
384 } catch(Exception e) {
385 e.printStackTrace();
386
387 } finally {
388 System.err.println("Exitting program with returnVal " + returnVal + "...\n");
389 System.exit(returnVal);
390 }
391 }
392
393 // test hardcoded string
394 public static void oldMain(String args[]) {
395 int returnVal = -1;
396 boolean silentMode = false;
397
398 try {
399 MaoriDetector maoriTextDetector = new MaoriDetector(silentMode);
400
401 boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
402 if(textIsInMaori) {
403 returnVal = 0;
404 } else {
405 returnVal = 1;
406 }
407
408 } catch(Exception e) {
409 e.printStackTrace();
410 } finally {
411 System.err.println("Exitting program with returnVal " + returnVal + "...\n");
412 System.exit(returnVal);
413 }
414 }
415
416
417}
Note: See TracBrowser for help on using the repository browser.