source: gs3-extensions/maori-lang-detection/src/MaoriTextDetector.java@ 33337

Last change on this file since 33337 was 33337, checked in by ak19, 5 years ago

Renaming the class to MaoriTextDetector, since it doesn't detect audio only text.

File size: 16.9 KB
Line 
1/**
2 * http://opennlp.apache.org/news/model-langdetect-183.html
3 * language detector model: http://opennlp.apache.org/models.html
4 * Use of Apache OpenNLP in general:
5 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
6 * Use of OpenNLP for language detection:
7 * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
8 *
9 * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
10 */
11
12import java.io.*;
13import opennlp.tools.langdetect.*;
14import opennlp.tools.util.*;
15
16/**
17 * Export the OPENNLP_HOME environment variable so that it points to your Apache OpenNLP installation folder.
18 * Then, to compile this program:
19 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriDetector.java
20 * To run this program, one of:
21 *
22 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --help
23 *
24 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --file <full/path/to/textfile>
25 *
26 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector -
27 * which expects text to stream in from standard input.
28 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
29 *
30 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
31 * Also has information on how to run this class if it's in a Java package.
32 */
33public class MaoriDetector {
34 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
35 public static final String MAORI_3LETTER_CODE = "mri";
36 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
37
38 /** Configurable: cut off minimum confidence value,
39 greater or equal to which determines that the best predicted language is acceptable to user of MaoriDetector. */
40 public final double MINIMUM_CONFIDENCE;
41 /** silentMode set to false means MaoriDetector won't print helpful messages while running. Set to true to run silently. */
42 public final boolean silentMode;
43
44 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
45 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
46 private LanguageDetector myCategorizer = null;
47
48 /**
49 * String taken from our university website
50 * https://www.waikato.ac.nz/maori/
51 */
52 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
53
54 /** test input string for a negative result */
55 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
56
57
58 public MaoriDetector(boolean silentMode) throws Exception {
59 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
60 }
61
62 public MaoriDetector(boolean silentMode, double min_confidence) throws Exception {
63 this.silentMode = silentMode;
64 this.MINIMUM_CONFIDENCE = min_confidence;
65
66 // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
67 String langDetectModelPath = System.getenv("OPENNLP_HOME");
68 if(System.getenv("OPENNLP_HOME") == null) {
69 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
70 }
71 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
72 File langDetectModelBinFile = new File(langDetectModelPath);
73 if(!langDetectModelBinFile.exists()) {
74 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
75 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
76 }
77
78
79 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
80 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
81 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
82 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
83
84 LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
85
86 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
87 this.myCategorizer = new LanguageDetectorME(model);
88 }/*catch(Exception e) {
89 e.printStackTrace();
90 }*/
91
92 // instantiating function should handle critical exceptions. Constructors shouldn't.
93 }
94
95 /**
96 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
97 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
98 */
99 public boolean isTextInMaori(String text) {
100 return isTextInLanguage(MAORI_3LETTER_CODE, text);
101 }
102
103 /** @param langCode is 3 letter language code, ISO 639-2/3
104 * https://www.loc.gov/standards/iso639-2/php/code_list.php
105 * https://en.wikipedia.org/wiki/ISO_639-3
106 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
107 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
108 */
109 public boolean isTextInLanguage(String langCode, String text) {
110 // Get the most probable language
111 Language bestLanguage = myCategorizer.predictLanguage(text);
112 System.out.println("Best language: " + bestLanguage.getLang());
113 System.out.println("Best language confidence: " + bestLanguage.getConfidence());
114
115 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
116 }
117
118
119 /**
120 * Handle "smaller" textfiles/streams of text read in.
121 * Return value is the same as for isTextInMaori(String text);
122 */
123 public boolean isTextInMaori(BufferedReader reader) throws Exception {
124 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
125 }
126 /**
127 * Handle "smaller" textfiles/streams of text read in.
128 * Return value is the same as for isTextInLanguage(String langCode, String text);
129 */
130 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
131 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
132
133 StringBuilder text = new StringBuilder();
134 String line = null;
135
136
137 while((line = reader.readLine()) != null) { // readLine removes newline separator
138 text.append(line + "\n"); // add back (unix style) line ending
139 }
140 return isTextInLanguage(langCode, text.toString());
141 }
142
143 /*
144 * Need better handling of "larger" textfiles/streams of text read in:
145 * what if multiple languages with high confidence every NUM_LINES read in?
146 * Does this mean the file is multi-lingual with each section dominated by a different language?
147 * How best to convey such information to the user?
148 */
149 /**
150 * Rudimentary attempt to deal with very large files.
151 * Return value is the same as for isTextInMaori(String text);
152 */
153 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
154 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
155 }
156
157 /**
158 * Rudimentary attempt to deal with very large files.
159 * Return value is the same as for isTextInLanguage(String langCode, String text);
160 */
161 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
162 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
163
164 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
165
166 StringBuilder text = new StringBuilder();
167 String line = null;
168
169 double cumulativeConfidence = 0;
170 int numLoops = 0;
171
172 int i = 0;
173 String language = null;
174
175 while((line = reader.readLine()) != null) { // readLine removes newline separator
176 text.append(line + "\n"); // add back (unix style) line ending
177
178 i++; // read nth line of numLoop
179
180
181 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
182
183
184 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
185 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
186 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
187 }
188 language = bestLanguage.getLang();
189 cumulativeConfidence += bestLanguage.getConfidence();
190
191 System.err.println("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
192
193 // finished analysing language of NUM_LINES of text
194 text = new StringBuilder();
195 i = 0;
196 numLoops++;
197 }
198 }
199
200 // process any (remaining) text that was less than n NUM_LINES
201 if(!text.toString().equals("")) {
202 text.append(line + "\n"); // add back (unix style) line ending
203 i++;
204
205 Language bestLanguage = myCategorizer.predictLanguage(text.toString());
206
207 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
208 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
209 }
210 language = bestLanguage.getLang();
211 cumulativeConfidence += bestLanguage.getConfidence();
212 System.err.println("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
213 }
214
215
216 int totalLinesRead = numLoops * NUM_LINES + i; // not used
217 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
218
219
220 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
221 }
222
223
224
225 /**
226 * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
227 * Unused.
228 */
229 public void predictedLanguages(String text) {
230 // Get an array with the most probable languages
231
232 Language[] languages = myCategorizer.predictLanguages(text);
233
234 if(languages == null || languages.length <= 0) {
235 System.err.println("No languages predicted for the input text");
236 } else {
237 for(int i = 0; i < languages.length; i++) {
238 System.out.println("Language prediction " + i + ": " + languages[i]);
239 }
240 }
241
242 }
243
244 public static void printUsage() {
245 System.err.println("Run this program with:");
246 System.err.println("\t--help (-h)\tfor seeing this usage message again");
247 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
248 System.err.println("\t--file (-f)\tto provide an input file path");
249 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
250 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
251 System.err.println("\t\tof the predicted language that will be considered acceptable.");
252 System.err.println();
253 System.err.println("This program terminates with exit value:");
254 System.err.println("\t0 if the input text is in Maori");
255 System.err.println("\t1 if input text is not in Maori");
256 System.err.println();
257 System.err.println("\t-1 if the input arguments were wrong");
258 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language");
259 System.err.println("\t2 if the user asked to run this program with --help/-h.");
260 System.err.println();
261 }
262
263 /**
264 * The main program exits with:
265 * 0 if text is in Maori;
266 * 1 if text is not in Maori;
267 *
268 * -1 if the input arguments were wrong
269 * 255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language
270 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
271 * 2 if the user asked to run this program with --help/-h.
272 */
273 public static void main(String args[]) {
274 int returnVal = -1;
275
276 // 1. Check input arguments
277 boolean printUsage = false;
278 boolean readFromStdIn = false;
279 File inFile = null;
280 boolean runSilent = false;
281 double minConfidence = -1;
282
283 for (int i = 0; !printUsage && i < args.length; i++) {
284
285 // check for help first and quit after printing usage
286 if(args[i].equals("--help") || args[i].equals("-h")) {
287 printUsage = true;
288 returnVal = 2;
289 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
290 runSilent = true;
291 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
292 i++;
293 if(i >= args.length) {
294 System.err.println("ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
295 printUsage = true;
296 returnVal = -1;
297 } else {
298 try {
299 minConfidence = Double.parseDouble(args[i]);
300 if(minConfidence < 0 || minConfidence > 1) {
301 throw new NumberFormatException("Number out of range, must be between 0-1");
302 }
303 } catch(NumberFormatException nfe) {
304 System.err.println("ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
305 printUsage = true;
306 returnVal = -1;
307 }
308 }
309 } else if(args[i].equals("-")) {
310 readFromStdIn = true;
311 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
312 } else if(args[i].equals("--file") || args[i].equals("-f")) {
313 i++;
314 if(i >= args.length) {
315 System.err.println("ERROR: No input file provided with --file|-f flag.\n");
316 printUsage = true;
317 returnVal = -1;
318 } else {
319 String filePath = args[i];
320 inFile = new File(filePath);
321 if(!inFile.isFile()) {
322 System.err.println("ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
323 printUsage = true;
324 returnVal = -1;
325 }
326 }
327 } else { // unrecognised input argument
328 System.err.println("ERROR: Unrecognised " + i + "th argument to this program.\n");
329 printUsage = true;
330 returnVal = -1;
331 }
332 }
333
334 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
335 System.err.println("ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
336 printUsage = true;
337 returnVal = -1;
338 }
339
340 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
341 System.err.println("ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
342 printUsage = true;
343 returnVal = -1;
344 }
345
346 if(printUsage) {
347 if(!runSilent || returnVal == 2) { // if expressly asked for help or not running silent
348 printUsage();
349 }
350 System.exit(returnVal);
351 }
352
353 try {
354 MaoriDetector maoriTextDetector = null;
355 if(minConfidence == -1) {
356 maoriTextDetector = new MaoriDetector(runSilent);
357 } else {
358 maoriTextDetector = new MaoriDetector(runSilent, minConfidence);
359 }
360
361 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
362 boolean textIsInMaori = false;
363
364 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
365 if(inFile != null) {
366 System.err.println("Reading text from file " + inFile.getPath());
367 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
368 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
369 } // let outer try deal with any file/reading exceptions
370 }
371 else if (readFromStdIn) {
372 System.err.println("Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
373 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
374 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
375 } // let outer try deal with any file/reading exceptions
376 }
377
378 if(textIsInMaori) {
379 returnVal = 0;
380 } else {
381 returnVal = 1;
382 }
383
384 } catch(Exception e) {
385 e.printStackTrace();
386
387 } finally {
388 System.err.println("Exitting program with returnVal " + returnVal + "...\n");
389 System.exit(returnVal);
390 }
391 }
392
393 // test hardcoded string
394 public static void oldMain(String args[]) {
395 int returnVal = -1;
396 boolean silentMode = false;
397
398 try {
399 MaoriDetector maoriTextDetector = new MaoriDetector(silentMode);
400
401 boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
402 if(textIsInMaori) {
403 returnVal = 0;
404 } else {
405 returnVal = 1;
406 }
407
408 } catch(Exception e) {
409 e.printStackTrace();
410 } finally {
411 System.err.println("Exitting program with returnVal " + returnVal + "...\n");
412 System.exit(returnVal);
413 }
414 }
415
416
417}
Note: See TracBrowser for help on using the repository browser.