1 | /**
|
---|
2 | * Class that uses OpenNLP with the Language Detection Model to determine, with a default
|
---|
3 | * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
|
---|
4 | * Internal functions can be used for detecting any of the 103 languages currently supported by
|
---|
5 | * the OpenNLP Language Detection Model.
|
---|
6 | *
|
---|
7 | * http://opennlp.apache.org/news/model-langdetect-183.html
|
---|
8 | * language detector model: http://opennlp.apache.org/models.html
|
---|
9 | * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
|
---|
10 | * Use of Apache OpenNLP in general:
|
---|
11 | * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
|
---|
12 | * Use of OpenNLP for language detection:
|
---|
13 | * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
|
---|
14 | *
|
---|
15 | * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
|
---|
16 | * See also the accompanying README file.
|
---|
17 | *
|
---|
18 | * July 2019
|
---|
19 | */
|
---|
20 |
|
---|
21 | package org.greenstone.atea;
|
---|
22 |
|
---|
23 | import java.io.*;
|
---|
24 | import opennlp.tools.langdetect.*;
|
---|
25 | import opennlp.tools.sentdetect.*;
|
---|
26 | import opennlp.tools.util.*;
|
---|
27 |
|
---|
28 | import java.util.ArrayList;
|
---|
29 |
|
---|
30 | /**
|
---|
31 | * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
|
---|
32 | * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
|
---|
33 | * (which is the language detection model zipped up and renamed to .bin extension).
|
---|
34 | *
|
---|
35 | * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
|
---|
36 | * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
|
---|
37 | *
|
---|
38 | * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
|
---|
39 | *
|
---|
40 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
|
---|
41 | *
|
---|
42 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
|
---|
43 | *
|
---|
44 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
|
---|
45 | * Press enter. This variant of the program expects text to stream in from standard input.
|
---|
46 | * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
|
---|
47 | *
|
---|
48 | * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
|
---|
49 | * Also has information on how to run this class if it's in a Java package.
|
---|
50 | */
|
---|
51 | public class MaoriTextDetector {
|
---|
52 | /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
|
---|
53 | public static final String MAORI_3LETTER_CODE = "mri";
|
---|
54 | public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
|
---|
55 |
|
---|
56 | /** Configurable: cut off minimum confidence value,
|
---|
57 | greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */
|
---|
58 | public final double MINIMUM_CONFIDENCE;
|
---|
59 |
|
---|
60 | /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */
|
---|
61 | public final boolean silentMode;
|
---|
62 |
|
---|
63 | private final String OPENNLP_MODELS_RELATIVE_PATH = "models" + File.separator;
|
---|
64 |
|
---|
65 | /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
|
---|
66 | private final String LANG_DETECT_MODEL_RELATIVE_PATH = OPENNLP_MODELS_RELATIVE_PATH + "langdetect-183.bin";
|
---|
67 |
|
---|
68 | /**
|
---|
69 | * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
|
---|
70 | * Created once in the constructor, can be used as often as needed thereafter.
|
---|
71 | */
|
---|
72 | private LanguageDetector myCategorizer = null;
|
---|
73 |
|
---|
74 | /**
|
---|
75 | * The Sentence Detection object that does the sentence splitting for the language
|
---|
76 | * the sentece model was trained for.
|
---|
77 | */
|
---|
78 | private SentenceDetectorME sentenceDetector = null;
|
---|
79 |
|
---|
80 | /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
|
---|
81 | public static final String TEST_MRI_INPUT_TEXT = "Ko tÄnei te Whare WÄnanga o Waikato e whakatau nei i ngÄ iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngÄ maunga whakaruru e tau awhi nei.";
|
---|
82 |
|
---|
83 | /** test input string for a negative result */
|
---|
84 | public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
|
---|
85 |
|
---|
86 |
|
---|
/**
 * Convenience constructor: delegates to the two-argument constructor, passing
 * DEFAULT_MINIMUM_CONFIDENCE as the minimum confidence.
 *
 * @param silentMode passed through unchanged to the two-argument constructor
 * @throws Exception whatever the delegated-to constructor throws
 */
public MaoriTextDetector(boolean silentMode) throws Exception {
    this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
}
|
---|
90 |
|
---|
/**
 * Constructor that uses the sentence model we trained for Maori: delegates to the
 * general constructor with the sentence model file name "mri-sent_trained.bin".
 *
 * @param silentMode passed through unchanged to the general constructor
 * @param min_confidence passed through unchanged to the general constructor
 * @throws Exception whatever the delegated-to constructor throws
 */
public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
    this(silentMode, min_confidence, "mri-sent_trained.bin");
}
|
---|
95 |
|
---|
96 | /** More general constructor that can use sentence detector models for other languages */
|
---|
97 | public MaoriTextDetector(boolean silentMode, double min_confidence,
|
---|
98 | String sentenceModelFileName) throws Exception
|
---|
99 | {
|
---|
100 | this.silentMode = silentMode;
|
---|
101 | this.MINIMUM_CONFIDENCE = min_confidence;
|
---|
102 |
|
---|
103 | // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
|
---|
104 | String langDetectModelPath = System.getenv("OPENNLP_HOME");
|
---|
105 | if(System.getenv("OPENNLP_HOME") == null) {
|
---|
106 | throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
|
---|
107 | }
|
---|
108 | langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
|
---|
109 | File langDetectModelBinFile = new File(langDetectModelPath);
|
---|
110 | if(!langDetectModelBinFile.exists()) {
|
---|
111 | throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
|
---|
112 | + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
|
---|
113 | + "\n\t*** with the model file 'langdetect-183.bin' in it.");
|
---|
114 | }
|
---|
115 |
|
---|
116 |
|
---|
117 | // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
|
---|
118 | // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
|
---|
119 | // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
|
---|
120 | try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
|
---|
121 |
|
---|
122 | LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
|
---|
123 |
|
---|
124 | // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
|
---|
125 | this.myCategorizer = new LanguageDetectorME(model);
|
---|
126 | }/*catch(Exception e) {
|
---|
127 | e.printStackTrace();
|
---|
128 | }*/
|
---|
129 |
|
---|
130 | // instantiating function should handle critical exceptions. Constructors shouldn't.
|
---|
131 |
|
---|
132 |
|
---|
133 |
|
---|
134 | // 3. Set up our sentence model and SentenceDetector object
|
---|
135 | String sentenceModelPath = System.getenv("OPENNLP_HOME") + File.separator
|
---|
136 | + OPENNLP_MODELS_RELATIVE_PATH + sentenceModelFileName; // "mri-sent_trained.bin" default
|
---|
137 | File sentenceModelBinFile = new File(sentenceModelPath);
|
---|
138 | if(!sentenceModelBinFile.exists()) {
|
---|
139 | throw new Exception("\n\t*** " + sentenceModelBinFile.getPath() + " doesn't exist."
|
---|
140 | + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder"
|
---|
141 | + "\n\t*** with the model file "+sentenceModelFileName+" in it.");
|
---|
142 | }
|
---|
143 | try (InputStream modelIn = new FileInputStream(sentenceModelPath)) {
|
---|
144 | // https://www.tutorialspoint.com/opennlp/opennlp_sentence_detection.htm
|
---|
145 | SentenceModel sentenceModel = new SentenceModel(modelIn);
|
---|
146 | this.sentenceDetector = new SentenceDetectorME(sentenceModel);
|
---|
147 |
|
---|
148 | } // instantiating function should handle this critical exception
|
---|
149 | }
|
---|
150 |
|
---|
151 | /**
|
---|
152 | * In this class' constructor, need to have set up the Sentence Detection Model
|
---|
153 | * for the langCode passed in to this function in order for the output to make
|
---|
154 | * sense for that language.
|
---|
155 | */
|
---|
156 | public ArrayList<String> getAllSentencesInLanguage(String langCode, String text, double confidenceCutoff)
|
---|
157 | {
|
---|
158 |
|
---|
159 | // we'll be storing just those sentences in text that are in the denoted language code
|
---|
160 | ArrayList<String> mriSentences = new ArrayList<String>();
|
---|
161 | // OpenNLP language detection works best with a minimum of 2 sentences
|
---|
162 | // See https://opennlp.apache.org/news/model-langdetect-183.html
|
---|
163 | // "It is important to note that this model is trained for and works well with
|
---|
164 | // longer texts that have at least 2 sentences or more from the same language."
|
---|
165 |
|
---|
166 | // For evaluating single languages, I used a very small data set and found that
|
---|
167 | // if the primary language detected is MRI AND if the confidence is >= 0.1, the
|
---|
168 | // results appear reasonably to be in te reo MÄori.
|
---|
169 |
|
---|
170 | String[] sentences = sentenceDetector.sentDetect(text);
|
---|
171 |
|
---|
172 | for(int i = 0; i < sentences.length; i++) {
|
---|
173 | String sentence = sentences[i];
|
---|
174 |
|
---|
175 | //System.err.println(sentence);
|
---|
176 |
|
---|
177 | Language bestLanguage = myCategorizer.predictLanguage(sentence);
|
---|
178 | double confidence = bestLanguage.getConfidence();
|
---|
179 |
|
---|
180 | if(bestLanguage.getLang().equals(langCode) && confidence >= confidenceCutoff) {
|
---|
181 | System.err.println("Adding sentence: " + sentence + "\n");
|
---|
182 | mriSentences.add(sentence);
|
---|
183 | } else {
|
---|
184 | System.err.println("SKIPPING sentence: " + sentence + "\n");
|
---|
185 | }
|
---|
186 | }
|
---|
187 | return mriSentences;
|
---|
188 | }
|
---|
189 |
|
---|
190 |
|
---|
191 | public ArrayList<String> getAllSentencesInMaori(String text) throws Exception {
|
---|
192 | // big assumption here: that we can split incoming text into sentences
|
---|
193 | // for any language (using the MÄori language trained sentence model),
|
---|
194 | // despite not knowing what language those sentences are in
|
---|
195 | // Hinges on MRI sentences detection being similar to at least ENG equivalent
|
---|
196 |
|
---|
197 |
|
---|
198 | // we'll be storing just those sentences in text that are in MÄori.
|
---|
199 |
|
---|
200 | // OpenNLP language detection works best with a minimum of 2 sentences
|
---|
201 | // See https://opennlp.apache.org/news/model-langdetect-183.html
|
---|
202 | // "It is important to note that this model is trained for and works well with
|
---|
203 | // longer texts that have at least 2 sentences or more from the same language."
|
---|
204 |
|
---|
205 | // For evaluating single languages, I used a very small data set and found that
|
---|
206 | // if the primary language detected is MRI AND if the confidence is >= 0.1, the
|
---|
207 | // results appear reasonably to be in te reo MÄori.
|
---|
208 |
|
---|
209 | final double confidenceCutoff = 0.1;
|
---|
210 | return getAllSentencesInLanguage(MAORI_3LETTER_CODE, text, confidenceCutoff);
|
---|
211 | }
|
---|
212 |
|
---|
213 |
|
---|
214 | /**
|
---|
215 | * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
|
---|
216 | * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
|
---|
217 | */
|
---|
218 | public boolean isTextInMaori(String text) {
|
---|
219 | return isTextInLanguage(MAORI_3LETTER_CODE, text);
|
---|
220 | }
|
---|
221 |
|
---|
222 | /** @param langCode is 3 letter language code, ISO 639-2/3
|
---|
223 | * https://www.loc.gov/standards/iso639-2/php/code_list.php
|
---|
224 | * https://en.wikipedia.org/wiki/ISO_639-3
|
---|
225 | * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
|
---|
226 | * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
|
---|
227 | */
|
---|
228 | public boolean isTextInLanguage(String langCode, String text) {
|
---|
229 | // Get the most probable language
|
---|
230 | Language bestLanguage = myCategorizer.predictLanguage(text);
|
---|
231 | doPrint("Best language: " + bestLanguage.getLang());
|
---|
232 | doPrint("Best language confidence: " + bestLanguage.getConfidence());
|
---|
233 |
|
---|
234 | return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
|
---|
235 | }
|
---|
236 |
|
---|
237 |
|
---|
238 | /**
|
---|
239 | * Handle "smaller" textfiles/streams of text read in.
|
---|
240 | * Return value is the same as for isTextInMaori(String text);
|
---|
241 | */
|
---|
242 | public boolean isTextInMaori(BufferedReader reader) throws Exception {
|
---|
243 | return isTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
244 | }
|
---|
245 | /**
|
---|
246 | * Handle "smaller" textfiles/streams of text read in.
|
---|
247 | * Return value is the same as for isTextInLanguage(String langCode, String text);
|
---|
248 | */
|
---|
249 | public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
|
---|
250 | // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
|
---|
251 |
|
---|
252 | StringBuilder text = new StringBuilder();
|
---|
253 | String line = null;
|
---|
254 |
|
---|
255 |
|
---|
256 | while((line = reader.readLine()) != null) { // readLine removes newline separator
|
---|
257 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
258 | }
|
---|
259 | return isTextInLanguage(langCode, text.toString());
|
---|
260 | }
|
---|
261 |
|
---|
262 | /*
|
---|
263 | * Need better handling of "larger" textfiles/streams of text read in:
|
---|
264 | * what if multiple languages with high confidence every NUM_LINES read in?
|
---|
265 | * Does this mean the file is multi-lingual with each section dominated by a different language?
|
---|
266 | * How best to convey such information to the user?
|
---|
267 | */
|
---|
268 | /**
|
---|
269 | * Rudimentary attempt to deal with very large files.
|
---|
270 | * Return value is the same as for isTextInMaori(String text);
|
---|
271 | */
|
---|
272 | public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
|
---|
273 | return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
274 | }
|
---|
275 |
|
---|
276 | /**
|
---|
277 | * Rudimentary attempt to deal with very large files.
|
---|
278 | * Return value is the same as for isTextInLanguage(String langCode, String text);
|
---|
279 | */
|
---|
280 | public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
|
---|
281 | // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
|
---|
282 |
|
---|
283 | final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
|
---|
284 |
|
---|
285 | StringBuilder text = new StringBuilder();
|
---|
286 | String line = null;
|
---|
287 |
|
---|
288 | double cumulativeConfidence = 0;
|
---|
289 | int numLoops = 0;
|
---|
290 |
|
---|
291 | int i = 0;
|
---|
292 | String language = null;
|
---|
293 |
|
---|
294 | while((line = reader.readLine()) != null) { // readLine removes newline separator
|
---|
295 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
296 |
|
---|
297 | i++; // read nth line of numLoop
|
---|
298 |
|
---|
299 |
|
---|
300 | if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
|
---|
301 |
|
---|
302 |
|
---|
303 | Language bestLanguage = myCategorizer.predictLanguage(text.toString());
|
---|
304 | if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
|
---|
305 | doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
|
---|
306 | }
|
---|
307 | language = bestLanguage.getLang();
|
---|
308 | cumulativeConfidence += bestLanguage.getConfidence();
|
---|
309 |
|
---|
310 | doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
|
---|
311 |
|
---|
312 | // finished analysing language of NUM_LINES of text
|
---|
313 | text = new StringBuilder();
|
---|
314 | i = 0;
|
---|
315 | numLoops++;
|
---|
316 | }
|
---|
317 | }
|
---|
318 |
|
---|
319 | // process any (remaining) text that was less than n NUM_LINES
|
---|
320 | if(!text.toString().equals("")) {
|
---|
321 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
322 | i++;
|
---|
323 |
|
---|
324 | Language bestLanguage = myCategorizer.predictLanguage(text.toString());
|
---|
325 |
|
---|
326 | if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
|
---|
327 | doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
|
---|
328 | }
|
---|
329 | language = bestLanguage.getLang();
|
---|
330 | cumulativeConfidence += bestLanguage.getConfidence();
|
---|
331 | doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
|
---|
332 | }
|
---|
333 |
|
---|
334 |
|
---|
335 | int totalLinesRead = numLoops * NUM_LINES + i; // not used
|
---|
336 | double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
|
---|
337 |
|
---|
338 |
|
---|
339 | return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
|
---|
340 | }
|
---|
341 |
|
---|
342 |
|
---|
343 | /**
|
---|
344 | * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
|
---|
345 | * UNUSED.
|
---|
346 | */
|
---|
347 | public void predictedLanguages(String text) {
|
---|
348 | // Get an array with the most probable languages
|
---|
349 |
|
---|
350 | Language[] languages = myCategorizer.predictLanguages(text);
|
---|
351 |
|
---|
352 | if(languages == null || languages.length <= 0) {
|
---|
353 | doPrintErr("No languages predicted for the input text");
|
---|
354 | } else {
|
---|
355 | for(int i = 0; i < languages.length; i++) {
|
---|
356 | doPrint("Language prediction " + i + ": " + languages[i]);
|
---|
357 | }
|
---|
358 | }
|
---|
359 |
|
---|
360 | }
|
---|
361 |
|
---|
362 | public void doPrint(String msg) {
|
---|
363 | doPrint(this.silentMode, msg);
|
---|
364 | }
|
---|
365 | public void doPrintErr(String msg) {
|
---|
366 | doPrintErr(this.silentMode, msg);
|
---|
367 | }
|
---|
368 |
|
---|
369 | /********** STATIC METHODS *************/
|
---|
370 |
|
---|
371 | public static void doPrint(boolean runSilent, String msg) {
|
---|
372 | if(!runSilent) System.out.println(msg);
|
---|
373 | }
|
---|
374 | public static void doPrintErr(boolean runSilent, String msg) {
|
---|
375 | if(!runSilent) System.err.println(msg);
|
---|
376 | }
|
---|
377 |
|
---|
378 | public static void printUsage() {
|
---|
379 | System.err.println("Run this program with:");
|
---|
380 | System.err.println("\t--help (-h)\tfor seeing this usage message again");
|
---|
381 | System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
|
---|
382 | System.err.println("\t--file (-f)\tto provide an input file path");
|
---|
383 | System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value.");
|
---|
384 | System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
|
---|
385 | System.err.println("\t\tof the predicted language that will be considered acceptable.");
|
---|
386 | System.err.println();
|
---|
387 | System.err.println("This program terminates with exit value:");
|
---|
388 | System.err.println("\t0 if the input text is in Maori");
|
---|
389 | System.err.println("\t1 if input text is not in Maori");
|
---|
390 | System.err.println();
|
---|
391 | System.err.println("\t-1 if the input arguments were wrong");
|
---|
392 | System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
|
---|
393 | System.err.println("\t2 if the user asked to run this program with --help/-h.");
|
---|
394 | System.err.println();
|
---|
395 | }
|
---|
396 |
|
---|
397 | /**
|
---|
398 | * The main program exits with:
|
---|
399 | * 0 if text is in Maori;
|
---|
400 | * 1 if text is not in Maori;
|
---|
401 | *
|
---|
402 | * -1 if the input arguments were wrong
|
---|
403 | * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
|
---|
404 | * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
|
---|
405 | * 2 if the user asked to run this program with --help/-h.
|
---|
406 | */
|
---|
407 | public static void main(String args[]) {
|
---|
408 | int returnVal = -1;
|
---|
409 |
|
---|
410 | // 1. Check input arguments
|
---|
411 | boolean printUsage = false;
|
---|
412 | boolean readFromStdIn = false;
|
---|
413 | File inFile = null;
|
---|
414 | boolean runSilent = false;
|
---|
415 | double minConfidence = -1;
|
---|
416 |
|
---|
417 | for (int i = 0; !printUsage && i < args.length; i++) {
|
---|
418 |
|
---|
419 | // check for help first and quit after printing usage
|
---|
420 | if(args[i].equals("--help") || args[i].equals("-h")) {
|
---|
421 | printUsage = true;
|
---|
422 | returnVal = 2;
|
---|
423 | } else if(args[i].equals("--silent") || args[i].equals("-s")) {
|
---|
424 | runSilent = true;
|
---|
425 | } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
|
---|
426 | i++;
|
---|
427 | if(i >= args.length) {
|
---|
428 | doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
|
---|
429 | printUsage = true;
|
---|
430 | returnVal = -1;
|
---|
431 | } else {
|
---|
432 | try {
|
---|
433 | minConfidence = Double.parseDouble(args[i]);
|
---|
434 | if(minConfidence < 0 || minConfidence > 1) {
|
---|
435 | throw new NumberFormatException("Number out of range, must be between 0-1");
|
---|
436 | }
|
---|
437 | } catch(NumberFormatException nfe) {
|
---|
438 | doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
|
---|
439 | printUsage = true;
|
---|
440 | returnVal = -1;
|
---|
441 | }
|
---|
442 | }
|
---|
443 | } else if(args[i].equals("-")) {
|
---|
444 | readFromStdIn = true;
|
---|
445 | //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
|
---|
446 | } else if(args[i].equals("--file") || args[i].equals("-f")) {
|
---|
447 | i++;
|
---|
448 | if(i >= args.length) {
|
---|
449 | doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
|
---|
450 | printUsage = true;
|
---|
451 | returnVal = -1;
|
---|
452 | } else {
|
---|
453 | String filePath = args[i];
|
---|
454 | inFile = new File(filePath);
|
---|
455 | if(!inFile.isFile()) {
|
---|
456 | doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
|
---|
457 | printUsage = true;
|
---|
458 | returnVal = -1;
|
---|
459 | }
|
---|
460 | }
|
---|
461 | } else { // unrecognised input argument
|
---|
462 | doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
|
---|
463 | printUsage = true;
|
---|
464 | returnVal = -1;
|
---|
465 | }
|
---|
466 | }
|
---|
467 |
|
---|
468 | if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
|
---|
469 | if(!readFromStdIn && inFile == null) { // at least one input source must be provided
|
---|
470 | doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
|
---|
471 | printUsage = true;
|
---|
472 | returnVal = -1;
|
---|
473 | }
|
---|
474 |
|
---|
475 | if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
|
---|
476 | doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
|
---|
477 | printUsage = true;
|
---|
478 | returnVal = -1;
|
---|
479 | }
|
---|
480 | }
|
---|
481 |
|
---|
482 | if(printUsage) {
|
---|
483 | // If not running silent print usage.
|
---|
484 | // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
|
---|
485 | if(returnVal == 2 || !runSilent) {
|
---|
486 | printUsage();
|
---|
487 | }
|
---|
488 | System.exit(returnVal);
|
---|
489 | }
|
---|
490 |
|
---|
491 |
|
---|
492 | // 2. Finally, we can now do the actual language detection
|
---|
493 | try {
|
---|
494 | MaoriTextDetector maoriTextDetector = null;
|
---|
495 | if(minConfidence == -1) {
|
---|
496 | maoriTextDetector = new MaoriTextDetector(runSilent);
|
---|
497 | } else {
|
---|
498 | maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
|
---|
499 | }
|
---|
500 |
|
---|
501 | // TODO
|
---|
502 | maoriTextDetector.getAllSentencesInMaori(
|
---|
503 | "Primary sources ~ Published Maramataka Mo Te Tau 1885, Nepia: Te Haaringi, Kai-ta Pukapuka, kei Hehitingi Tiriti, 1884. Maramataka Mo Te Tau 1886, Nepia: Na te Haaringi i ta ki tona Whare Perehi Pukapuka, 1885. Maramataka Mo Te Tau 1887, Nepia: Na te Haaringi i ta ki tona Whare Perehi Pukapuka, 1886. Maramataka Mo Te Tau 1888, Nepia: Na te Haaringi i ta ki tona Whare Perehi Pukapuka, 1887. Maramataka Mo Te Tau 1889, Nepia: Na te Haaringi i ta ki tona Whare Perehi Pukapuka, 1888. Maramataka Mo Te Tau 1890, Nepia: Na te Haaringi i ta ki tona Whare Perehi Pukapuka, 1889. Maramataka Mo Te Tau 1891, Kihipane: Na te Muri i ta ki tona Whare Perehi Pukapuka, 1890. Maramataka Mo Te Tau 1892, Nepia: Na te Haaringi, i ta ki tona Whare Perehi Pukapuka, 1891. Maramataka Mo Te Tau 1893, Kihipane: Na te Muri i ta ki tona Whare Perehi Pukapuka, 1892. Maramataka Mo Te Tau 1894, Kihipane: Na te Muri i ta ki tona Whare Perehi Pukapuka, 1893. Maramataka Me Te Tau 1895, Kihipane: Na te Muri i Ta ki tona whare perehi pukapuka, 1894. Maramataka Mo Te Tau 1896, Kihipane: Na te Muri i ta ki tona Whare Perehi Pukapuka, 1895. Maramataka Mo Te Tau 1897, Kihipane: Na te Muri i ta ki tona Whare Perehi Pukapuka 1896. Maramataka Mo Te Tau 1898, Turanga: Na te Wiremu Hapata i ta ki Te Rau Kahikatea, 1897. Ko Te Paipera Tapu Ara, Ko Te Kawenata Tawhito Me Te Kawenata Hou, He Mea Whakamaori Mai No Nga Reo I Oroko-Tuhituhia Ai, Ranana: He mea ta ki te perehi a W.M.Watts ma te Komiti Ta Paipera mo Ingarangi mo Te Ao Katoa, 1868. Ko Te Pukapuka O Nga Inoinga, Me Era Atu Tikanga, I Whakaritea E Te Hahi O Ingarani, Mo Te Minitatanga O Nga Hakarameta, O Era Atu Ritenga a Te Hahi: Me Nga Waiata Ano Hoki a Rawiri, Me Te Tikanga Mo Te Whiriwhiringa, Mo Te Whakaturanga, Me Te Whakatapunga O Nga Pihopa, O Nga Piriti, Me Nga Rikona, Me Nga Himene, Ranana: I taia tenei ki te perehi o te Komiti mo te whakapuaki i to mohiotanga ki a te Karaiti, 1858. 
Ko Te Pukapuka O Nga Inoinga, Me Era Atu Tikanga, I Whakaritea E Te Hahi O Ingarani, Mo Te Minitatanga O Nga Hakarameta, O Era Atu Ritenga a Te Hahi: Me Nga Waiata Ano Hoki a Rawiri, Me Te Tikanga Mo Te Whiriwhiringa, Mo Te Whakaturanga, Me Te Whakatapunga O Nga Pihopa, O Nga Piriti, Me Nga Rikona. 1883. The Book of Common Prayer, and Administration of the Sacraments, and Other Rites and Ceremonies of the Church, According to the Use of the United Church of England and Ireland: Together with the Proper Lessons for Sundays and Other Holy-Days, and a New Version of the Psalms of David, Oxford: Printed at 134 the University Press, 1852. The Book of Common Prayer and Administration of the Sacraments, and Other Rites and Ceremonies of the Church, According to the Church of England: Together with the Psalter or Psalms of David, Printed as They Are to Be Sung or Said in Churches: And the Form and Manner of Making, Ordaining, and Consecrating of Bishops, Priests, and Deacons, London: G.E. Eyre and W. Spottiswoode, after 1871 but before 1877. Brown, A.N., The Journals of A.N. Brown C.M.S. Missionary Tauranga Covering the Years 1840 to 1842, Tauranga: The Elms Trust, 1990 (Commemorative Edition). ______________, Select Sermons of A.N. Brown, Tauranga: The Elms Trust, 1997. Fitzgerald, Caroline (ed.), Te Wiremu Henry Williams: Early Years in the North, Wellington: Huia Publishers, 2011. The Hawke's Bay Almanac, Napier: James Wood, Hawke's Bay Herald, 1862, 1863, 1867.");
|
---|
504 |
|
---|
505 |
|
---|
506 | //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
|
---|
507 | boolean textIsInMaori = false;
|
---|
508 |
|
---|
509 | // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
|
---|
510 | if(inFile != null) {
|
---|
511 | doPrint(runSilent, "Reading text from file " + inFile.getPath());
|
---|
512 | try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
|
---|
513 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
514 | } // let outer try deal with any file/reading exceptions
|
---|
515 | }
|
---|
516 | else if (readFromStdIn) {
|
---|
517 | doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
|
---|
518 | try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
|
---|
519 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
520 | } // let outer try deal with any file/reading exceptions
|
---|
521 | }
|
---|
522 |
|
---|
523 | if(textIsInMaori) {
|
---|
524 | returnVal = 0;
|
---|
525 | } else {
|
---|
526 | returnVal = 1;
|
---|
527 | }
|
---|
528 |
|
---|
529 | } catch(Exception e) {
|
---|
530 | e.printStackTrace();
|
---|
531 |
|
---|
532 | } finally {
|
---|
533 | doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
|
---|
534 | System.exit(returnVal);
|
---|
535 | }
|
---|
536 | }
|
---|
537 |
|
---|
538 | }
|
---|