[33335] | 1 | /**
|
---|
[33350] | 2 | * Class that uses OpenNLP with the Language Detection Model to determine, with a default
|
---|
| 3 | * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
|
---|
| 4 | * Internal functions can be used for detecting any of the 103 languages currently supported by
|
---|
| 5 | * the OpenNLP Language Detection Model.
|
---|
| 6 | *
|
---|
[33335] | 7 | * http://opennlp.apache.org/news/model-langdetect-183.html
|
---|
| 8 | * language detector model: http://opennlp.apache.org/models.html
|
---|
[33355] | 9 | * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/
|
---|
[33335] | 10 | * Use of Apache OpenNLP in general:
|
---|
| 11 | * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
|
---|
| 12 | * Use of OpenNLP for language detection:
|
---|
| 13 | * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
|
---|
| 14 | *
|
---|
| 15 | * This code was based on the information and sample code at the above links and the links dispersed throughout this file.
|
---|
[33350] | 16 | * See also the accompanying README file.
|
---|
| 17 | *
|
---|
| 18 | * July 2019
|
---|
[33335] | 19 | */
|
---|
| 20 |
|
---|
[33397] | 21 | package org.greenstone.atea;
|
---|
| 22 |
|
---|
[33335] | 23 | import java.io.*;
|
---|
| 24 | import opennlp.tools.langdetect.*;
|
---|
| 25 | import opennlp.tools.util.*;
|
---|
| 26 |
|
---|
| 27 | /**
|
---|
[33336] | 28 | * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
|
---|
[33350] | 29 | * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
|
---|
| 30 | * (which is the language detection model zipped up and renamed to .bin extension).
|
---|
| 31 | *
|
---|
| 32 | * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
|
---|
[33397] | 33 | * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
|
---|
[33335] | 34 | *
|
---|
[33350] | 35 | * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
|
---|
| 36 | *
|
---|
[33397] | 37 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
|
---|
[33336] | 38 | *
|
---|
[33397] | 39 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
|
---|
[33336] | 40 | *
|
---|
[33397] | 41 | * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
|
---|
[33350] | 42 | * Press enter. This variant of the program expects text to stream in from standard input.
|
---|
[33336] | 43 | * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
|
---|
| 44 | *
|
---|
[33335] | 45 | * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
|
---|
| 46 | * Also has information on how to run this class if it's in a Java package.
|
---|
| 47 | */
|
---|
[33338] | 48 | public class MaoriTextDetector {
|
---|
[33336] | 49 | /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
|
---|
| 50 | public static final String MAORI_3LETTER_CODE = "mri";
|
---|
| 51 | public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
|
---|
[33335] | 52 |
|
---|
[33336] | 53 | /** Configurable: cut off minimum confidence value,
|
---|
[33338] | 54 | greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */
|
---|
[33336] | 55 | public final double MINIMUM_CONFIDENCE;
|
---|
[33350] | 56 |
|
---|
[33338] | 57 | /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */
|
---|
[33336] | 58 | public final boolean silentMode;
|
---|
| 59 |
|
---|
| 60 | /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
|
---|
| 61 | private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
|
---|
[33350] | 62 |
|
---|
| 63 | /**
|
---|
| 64 | * The LanguageDetectorModel object that will do the actual language detection/prediction for us.
|
---|
| 65 | * Created once in the constructor, can be used as often as needed thereafter.
|
---|
| 66 | */
|
---|
[33336] | 67 | private LanguageDetector myCategorizer = null;
|
---|
| 68 |
|
---|
[33350] | 69 | /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
|
---|
[33336] | 70 | public static final String TEST_MRI_INPUT_TEXT = "Ko tÄnei te Whare WÄnanga o Waikato e whakatau nei i ngÄ iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngÄ maunga whakaruru e tau awhi nei.";
|
---|
| 71 |
|
---|
| 72 | /** test input string for a negative result */
|
---|
| 73 | public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
|
---|
[33335] | 74 |
|
---|
[33336] | 75 |
|
---|
[33338] | 76 | public MaoriTextDetector(boolean silentMode) throws Exception {
|
---|
[33336] | 77 | this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
|
---|
| 78 | }
|
---|
| 79 |
|
---|
[33338] | 80 | public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
|
---|
[33336] | 81 | this.silentMode = silentMode;
|
---|
| 82 | this.MINIMUM_CONFIDENCE = min_confidence;
|
---|
| 83 |
|
---|
| 84 | // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
|
---|
| 85 | String langDetectModelPath = System.getenv("OPENNLP_HOME");
|
---|
| 86 | if(System.getenv("OPENNLP_HOME") == null) {
|
---|
| 87 | throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
|
---|
| 88 | }
|
---|
| 89 | langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
|
---|
| 90 | File langDetectModelBinFile = new File(langDetectModelPath);
|
---|
| 91 | if(!langDetectModelBinFile.exists()) {
|
---|
| 92 | throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
|
---|
| 93 | + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
|
---|
| 94 | }
|
---|
| 95 |
|
---|
| 96 |
|
---|
| 97 | // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
|
---|
[33335] | 98 | // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
|
---|
| 99 | // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
|
---|
[33336] | 100 | try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
|
---|
[33335] | 101 |
|
---|
| 102 | LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
|
---|
| 103 |
|
---|
| 104 | // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
|
---|
[33336] | 105 | this.myCategorizer = new LanguageDetectorME(model);
|
---|
| 106 | }/*catch(Exception e) {
|
---|
| 107 | e.printStackTrace();
|
---|
| 108 | }*/
|
---|
| 109 |
|
---|
| 110 | // instantiating function should handle critical exceptions. Constructors shouldn't.
|
---|
| 111 | }
|
---|
[33335] | 112 |
|
---|
[33336] | 113 | /**
|
---|
| 114 | * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
|
---|
| 115 | * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
|
---|
| 116 | */
|
---|
| 117 | public boolean isTextInMaori(String text) {
|
---|
| 118 | return isTextInLanguage(MAORI_3LETTER_CODE, text);
|
---|
| 119 | }
|
---|
[33335] | 120 |
|
---|
[33336] | 121 | /** @param langCode is 3 letter language code, ISO 639-2/3
|
---|
| 122 | * https://www.loc.gov/standards/iso639-2/php/code_list.php
|
---|
| 123 | * https://en.wikipedia.org/wiki/ISO_639-3
|
---|
| 124 | * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
|
---|
| 125 | * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
|
---|
| 126 | */
|
---|
| 127 | public boolean isTextInLanguage(String langCode, String text) {
|
---|
| 128 | // Get the most probable language
|
---|
| 129 | Language bestLanguage = myCategorizer.predictLanguage(text);
|
---|
[33338] | 130 | doPrint("Best language: " + bestLanguage.getLang());
|
---|
| 131 | doPrint("Best language confidence: " + bestLanguage.getConfidence());
|
---|
[33336] | 132 |
|
---|
| 133 | return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
|
---|
| 134 | }
|
---|
| 135 |
|
---|
| 136 |
|
---|
| 137 | /**
|
---|
| 138 | * Handle "smaller" textfiles/streams of text read in.
|
---|
| 139 | * Return value is the same as for isTextInMaori(String text);
|
---|
| 140 | */
|
---|
| 141 | public boolean isTextInMaori(BufferedReader reader) throws Exception {
|
---|
| 142 | return isTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
| 143 | }
|
---|
| 144 | /**
|
---|
| 145 | * Handle "smaller" textfiles/streams of text read in.
|
---|
| 146 | * Return value is the same as for isTextInLanguage(String langCode, String text);
|
---|
| 147 | */
|
---|
| 148 | public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
|
---|
| 149 | // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
|
---|
| 150 |
|
---|
| 151 | StringBuilder text = new StringBuilder();
|
---|
| 152 | String line = null;
|
---|
| 153 |
|
---|
| 154 |
|
---|
| 155 | while((line = reader.readLine()) != null) { // readLine removes newline separator
|
---|
| 156 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
| 157 | }
|
---|
| 158 | return isTextInLanguage(langCode, text.toString());
|
---|
| 159 | }
|
---|
| 160 |
|
---|
| 161 | /*
|
---|
| 162 | * Need better handling of "larger" textfiles/streams of text read in:
|
---|
| 163 | * what if multiple languages with high confidence every NUM_LINES read in?
|
---|
| 164 | * Does this mean the file is multi-lingual with each section dominated by a different language?
|
---|
| 165 | * How best to convey such information to the user?
|
---|
| 166 | */
|
---|
| 167 | /**
|
---|
| 168 | * Rudimentary attempt to deal with very large files.
|
---|
| 169 | * Return value is the same as for isTextInMaori(String text);
|
---|
| 170 | */
|
---|
| 171 | public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
|
---|
| 172 | return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
| 173 | }
|
---|
| 174 |
|
---|
| 175 | /**
|
---|
| 176 | * Rudimentary attempt to deal with very large files.
|
---|
| 177 | * Return value is the same as for isTextInLanguage(String langCode, String text);
|
---|
| 178 | */
|
---|
| 179 | public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
|
---|
| 180 | // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
|
---|
| 181 |
|
---|
| 182 | final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
|
---|
| 183 |
|
---|
| 184 | StringBuilder text = new StringBuilder();
|
---|
| 185 | String line = null;
|
---|
| 186 |
|
---|
| 187 | double cumulativeConfidence = 0;
|
---|
| 188 | int numLoops = 0;
|
---|
| 189 |
|
---|
| 190 | int i = 0;
|
---|
| 191 | String language = null;
|
---|
| 192 |
|
---|
| 193 | while((line = reader.readLine()) != null) { // readLine removes newline separator
|
---|
| 194 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
[33335] | 195 |
|
---|
[33336] | 196 | i++; // read nth line of numLoop
|
---|
| 197 |
|
---|
| 198 |
|
---|
| 199 | if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
|
---|
| 200 |
|
---|
| 201 |
|
---|
| 202 | Language bestLanguage = myCategorizer.predictLanguage(text.toString());
|
---|
| 203 | if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
|
---|
[33338] | 204 | doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
|
---|
[33335] | 205 | }
|
---|
[33336] | 206 | language = bestLanguage.getLang();
|
---|
| 207 | cumulativeConfidence += bestLanguage.getConfidence();
|
---|
| 208 |
|
---|
[33338] | 209 | doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
|
---|
[33336] | 210 |
|
---|
| 211 | // finished analysing language of NUM_LINES of text
|
---|
| 212 | text = new StringBuilder();
|
---|
| 213 | i = 0;
|
---|
| 214 | numLoops++;
|
---|
| 215 | }
|
---|
| 216 | }
|
---|
| 217 |
|
---|
| 218 | // process any (remaining) text that was less than n NUM_LINES
|
---|
| 219 | if(!text.toString().equals("")) {
|
---|
| 220 | text.append(line + "\n"); // add back (unix style) line ending
|
---|
| 221 | i++;
|
---|
[33335] | 222 |
|
---|
[33336] | 223 | Language bestLanguage = myCategorizer.predictLanguage(text.toString());
|
---|
| 224 |
|
---|
| 225 | if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
|
---|
[33338] | 226 | doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");
|
---|
[33336] | 227 | }
|
---|
| 228 | language = bestLanguage.getLang();
|
---|
| 229 | cumulativeConfidence += bestLanguage.getConfidence();
|
---|
[33338] | 230 | doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
|
---|
[33336] | 231 | }
|
---|
| 232 |
|
---|
| 233 |
|
---|
| 234 | int totalLinesRead = numLoops * NUM_LINES + i; // not used
|
---|
| 235 | double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
|
---|
| 236 |
|
---|
| 237 |
|
---|
| 238 | return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
|
---|
| 239 | }
|
---|
| 240 |
|
---|
| 241 |
|
---|
| 242 | /**
|
---|
| 243 | * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
|
---|
[33350] | 244 | * UNUSED.
|
---|
[33336] | 245 | */
|
---|
| 246 | public void predictedLanguages(String text) {
|
---|
| 247 | // Get an array with the most probable languages
|
---|
| 248 |
|
---|
| 249 | Language[] languages = myCategorizer.predictLanguages(text);
|
---|
| 250 |
|
---|
| 251 | if(languages == null || languages.length <= 0) {
|
---|
[33338] | 252 | doPrintErr("No languages predicted for the input text");
|
---|
[33336] | 253 | } else {
|
---|
| 254 | for(int i = 0; i < languages.length; i++) {
|
---|
[33338] | 255 | doPrint("Language prediction " + i + ": " + languages[i]);
|
---|
[33336] | 256 | }
|
---|
| 257 | }
|
---|
| 258 |
|
---|
| 259 | }
|
---|
| 260 |
|
---|
[33338] | 261 | public void doPrint(String msg) {
|
---|
| 262 | doPrint(this.silentMode, msg);
|
---|
| 263 | }
|
---|
| 264 | public void doPrintErr(String msg) {
|
---|
| 265 | doPrintErr(this.silentMode, msg);
|
---|
| 266 | }
|
---|
| 267 |
|
---|
| 268 | /********** STATIC METHODS *************/
|
---|
| 269 |
|
---|
| 270 | public static void doPrint(boolean runSilent, String msg) {
|
---|
| 271 | if(!runSilent) System.out.println(msg);
|
---|
| 272 | }
|
---|
| 273 | public static void doPrintErr(boolean runSilent, String msg) {
|
---|
| 274 | if(!runSilent) System.err.println(msg);
|
---|
| 275 | }
|
---|
| 276 |
|
---|
[33336] | 277 | public static void printUsage() {
|
---|
| 278 | System.err.println("Run this program with:");
|
---|
| 279 | System.err.println("\t--help (-h)\tfor seeing this usage message again");
|
---|
| 280 | System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
|
---|
| 281 | System.err.println("\t--file (-f)\tto provide an input file path");
|
---|
| 282 | System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
|
---|
| 283 | System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
|
---|
| 284 | System.err.println("\t\tof the predicted language that will be considered acceptable.");
|
---|
| 285 | System.err.println();
|
---|
| 286 | System.err.println("This program terminates with exit value:");
|
---|
| 287 | System.err.println("\t0 if the input text is in Maori");
|
---|
| 288 | System.err.println("\t1 if input text is not in Maori");
|
---|
| 289 | System.err.println();
|
---|
| 290 | System.err.println("\t-1 if the input arguments were wrong");
|
---|
[33338] | 291 | System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
|
---|
[33336] | 292 | System.err.println("\t2 if the user asked to run this program with --help/-h.");
|
---|
| 293 | System.err.println();
|
---|
| 294 | }
|
---|
| 295 |
|
---|
| 296 | /**
|
---|
| 297 | * The main program exits with:
|
---|
| 298 | * 0 if text is in Maori;
|
---|
| 299 | * 1 if text is not in Maori;
|
---|
| 300 | *
|
---|
| 301 | * -1 if the input arguments were wrong
|
---|
[33338] | 302 | * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
|
---|
[33336] | 303 | * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
|
---|
| 304 | * 2 if the user asked to run this program with --help/-h.
|
---|
| 305 | */
|
---|
| 306 | public static void main(String args[]) {
|
---|
| 307 | int returnVal = -1;
|
---|
| 308 |
|
---|
| 309 | // 1. Check input arguments
|
---|
| 310 | boolean printUsage = false;
|
---|
| 311 | boolean readFromStdIn = false;
|
---|
| 312 | File inFile = null;
|
---|
| 313 | boolean runSilent = false;
|
---|
| 314 | double minConfidence = -1;
|
---|
| 315 |
|
---|
| 316 | for (int i = 0; !printUsage && i < args.length; i++) {
|
---|
| 317 |
|
---|
| 318 | // check for help first and quit after printing usage
|
---|
| 319 | if(args[i].equals("--help") || args[i].equals("-h")) {
|
---|
| 320 | printUsage = true;
|
---|
| 321 | returnVal = 2;
|
---|
| 322 | } else if(args[i].equals("--silent") || args[i].equals("-s")) {
|
---|
| 323 | runSilent = true;
|
---|
| 324 | } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
|
---|
| 325 | i++;
|
---|
| 326 | if(i >= args.length) {
|
---|
[33338] | 327 | doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
|
---|
[33336] | 328 | printUsage = true;
|
---|
| 329 | returnVal = -1;
|
---|
| 330 | } else {
|
---|
| 331 | try {
|
---|
| 332 | minConfidence = Double.parseDouble(args[i]);
|
---|
| 333 | if(minConfidence < 0 || minConfidence > 1) {
|
---|
| 334 | throw new NumberFormatException("Number out of range, must be between 0-1");
|
---|
| 335 | }
|
---|
| 336 | } catch(NumberFormatException nfe) {
|
---|
[33338] | 337 | doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
|
---|
[33336] | 338 | printUsage = true;
|
---|
| 339 | returnVal = -1;
|
---|
| 340 | }
|
---|
| 341 | }
|
---|
| 342 | } else if(args[i].equals("-")) {
|
---|
| 343 | readFromStdIn = true;
|
---|
| 344 | //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
|
---|
| 345 | } else if(args[i].equals("--file") || args[i].equals("-f")) {
|
---|
| 346 | i++;
|
---|
| 347 | if(i >= args.length) {
|
---|
[33338] | 348 | doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
|
---|
[33336] | 349 | printUsage = true;
|
---|
| 350 | returnVal = -1;
|
---|
| 351 | } else {
|
---|
| 352 | String filePath = args[i];
|
---|
| 353 | inFile = new File(filePath);
|
---|
| 354 | if(!inFile.isFile()) {
|
---|
[33338] | 355 | doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
|
---|
[33336] | 356 | printUsage = true;
|
---|
| 357 | returnVal = -1;
|
---|
| 358 | }
|
---|
| 359 | }
|
---|
| 360 | } else { // unrecognised input argument
|
---|
[33338] | 361 | doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
|
---|
[33336] | 362 | printUsage = true;
|
---|
| 363 | returnVal = -1;
|
---|
| 364 | }
|
---|
| 365 | }
|
---|
| 366 |
|
---|
[33397] | 367 | if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
|
---|
| 368 | if(!readFromStdIn && inFile == null) { // at least one input source must be provided
|
---|
| 369 | doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
|
---|
| 370 | printUsage = true;
|
---|
| 371 | returnVal = -1;
|
---|
| 372 | }
|
---|
| 373 |
|
---|
| 374 | if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
|
---|
| 375 | doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
|
---|
| 376 | printUsage = true;
|
---|
| 377 | returnVal = -1;
|
---|
| 378 | }
|
---|
[33336] | 379 | }
|
---|
| 380 |
|
---|
| 381 | if(printUsage) {
|
---|
[33338] | 382 | // If not running silent print usage.
|
---|
| 383 | // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
|
---|
| 384 | if(returnVal == 2 || !runSilent) {
|
---|
[33336] | 385 | printUsage();
|
---|
| 386 | }
|
---|
| 387 | System.exit(returnVal);
|
---|
| 388 | }
|
---|
[33350] | 389 |
|
---|
| 390 |
|
---|
| 391 | // 2. Finally, we can now do the actual language detection
|
---|
[33336] | 392 | try {
|
---|
[33338] | 393 | MaoriTextDetector maoriTextDetector = null;
|
---|
[33336] | 394 | if(minConfidence == -1) {
|
---|
[33338] | 395 | maoriTextDetector = new MaoriTextDetector(runSilent);
|
---|
[33336] | 396 | } else {
|
---|
[33338] | 397 | maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
|
---|
[33336] | 398 | }
|
---|
| 399 |
|
---|
[33338] | 400 | //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
|
---|
[33336] | 401 | boolean textIsInMaori = false;
|
---|
| 402 |
|
---|
| 403 | // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
|
---|
| 404 | if(inFile != null) {
|
---|
[33338] | 405 | doPrint(runSilent, "Reading text from file " + inFile.getPath());
|
---|
[33336] | 406 | try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
|
---|
| 407 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
| 408 | } // let outer try deal with any file/reading exceptions
|
---|
| 409 | }
|
---|
| 410 | else if (readFromStdIn) {
|
---|
[33338] | 411 | doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
|
---|
[33336] | 412 | try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
|
---|
| 413 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
| 414 | } // let outer try deal with any file/reading exceptions
|
---|
| 415 | }
|
---|
| 416 |
|
---|
| 417 | if(textIsInMaori) {
|
---|
| 418 | returnVal = 0;
|
---|
| 419 | } else {
|
---|
| 420 | returnVal = 1;
|
---|
| 421 | }
|
---|
| 422 |
|
---|
[33335] | 423 | } catch(Exception e) {
|
---|
| 424 | e.printStackTrace();
|
---|
[33336] | 425 |
|
---|
| 426 | } finally {
|
---|
[33338] | 427 | doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
|
---|
[33336] | 428 | System.exit(returnVal);
|
---|
[33335] | 429 | }
|
---|
[33336] | 430 | }
|
---|
| 431 |
|
---|
[33335] | 432 | }
|
---|