Changeset 33336 for gs3-extensions
- Timestamp:
- 2019-07-20T22:58:17+12:00 (5 years ago)
- Location:
- gs3-extensions/maori-lang-detection/src
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/src/MaoriDetector.java
r33335 r33336 15 15 16 16 /** 17 * Run as: 18 * wharariki:[115]/Scratch/ak19/openNLP-lang-detect/src>javac -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector.java 19 * wharariki:[116]/Scratch/ak19/openNLP-lang-detect/src>java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector 17 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation. 18 * Then, to compile this program: 19 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriDetector.java 20 * To run this program, one of: 21 * 22 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --help 23 * 24 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --file <full/path/to/textfile> 25 * 26 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector - 27 * which expects text to stream in from standard input. 28 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. 20 29 * 21 30 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath … … 23 32 */ 24 33 public class MaoriDetector { 25 34 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */ 35 public static final String MAORI_3LETTER_CODE = "mri"; 36 public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50; 37 38 /** Configurable: cut off minimum confidence value, 39 greater or equal to which determines that the best predicted language is acceptable to user of MaoriDetector. */ 40 public final double MINIMUM_CONFIDENCE; 41 /** silentMode set to false means MaoriDetector won't print helpful messages while running. Set to true to run silently. 
*/ 42 public final boolean silentMode; 43 44 /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */ 45 private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin"; 46 private LanguageDetector myCategorizer = null; 47 26 48 /** 27 * Taken from our university website 49 * String taken from our university website 28 50 * https://www.waikato.ac.nz/maori/ 29 51 */ 30 public static final String TEST_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; 31 32 public static void main(String args[]) { 52 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; 53 54 /** test input string for a negative result */ 55 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language"; 56 57 58 public MaoriDetector(boolean silentMode) throws Exception { 59 this(silentMode, DEFAULT_MINIMUM_CONFIDENCE); 60 } 61 62 public MaoriDetector(boolean silentMode, double min_confidence) throws Exception { 63 this.silentMode = silentMode; 64 this.MINIMUM_CONFIDENCE = min_confidence; 65 66 // 1.
Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists); 67 String langDetectModelPath = System.getenv("OPENNLP_HOME"); 68 if(System.getenv("OPENNLP_HOME") == null) { 69 throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder."); 70 } 71 langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH; 72 File langDetectModelBinFile = new File(langDetectModelPath); 73 if(!langDetectModelBinFile.exists()) { 74 throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist." 75 + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it."); 76 } 77 78 79 // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model. 33 80 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api 34 81 // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 35 try (InputStream modelIn = new FileInputStream( "/Scratch/ak19/openNLP-lang-detect/langdetect-183.bin")) {82 try (InputStream modelIn = new FileInputStream(langDetectModelPath)) { 36 83 37 84 LanguageDetectorModel model = new LanguageDetectorModel(modelIn); 38 85 39 86 // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect 40 LanguageDetector myCategorizer = new LanguageDetectorME(model); 41 42 // Get the most probable language 43 Language bestLanguage = myCategorizer.predictLanguage(TEST_INPUT_TEXT); 44 System.out.println("Best language: " + bestLanguage.getLang()); 45 System.out.println("Best language confidence: " + bestLanguage.getConfidence()); 46 47 48 // Get an array with the most probable languages 49 Language[] languages = myCategorizer.predictLanguages(TEST_INPUT_TEXT); 50 /* 51 if(languages == null || languages.length <= 0) { 52 System.err.println("No languages predicted 
for the input text"); 87 this.myCategorizer = new LanguageDetectorME(model); 88 }/*catch(Exception e) { 89 e.printStackTrace(); 90 }*/ 91 92 // instantiating function should handle critical exceptions. Constructors shouldn't. 93 } 94 95 /** 96 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, 97 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). 98 */ 99 public boolean isTextInMaori(String text) { 100 return isTextInLanguage(MAORI_3LETTER_CODE, text); 101 } 102 103 /** @param langCode is 3 letter language code, ISO 639-2/3 104 * https://www.loc.gov/standards/iso639-2/php/code_list.php 105 * https://en.wikipedia.org/wiki/ISO_639-3 106 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, 107 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). 108 */ 109 public boolean isTextInLanguage(String langCode, String text) { 110 // Get the most probable language 111 Language bestLanguage = myCategorizer.predictLanguage(text); 112 System.out.println("Best language: " + bestLanguage.getLang()); 113 System.out.println("Best language confidence: " + bestLanguage.getConfidence()); 114 115 return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE); 116 } 117 118 119 /** 120 * Handle "smaller" textfiles/streams of text read in. 121 * Return value is the same as for isTextInMaori(String text); 122 */ 123 public boolean isTextInMaori(BufferedReader reader) throws Exception { 124 return isTextInLanguage(MAORI_3LETTER_CODE, reader); 125 } 126 /** 127 * Handle "smaller" textfiles/streams of text read in. 
128 * Return value is the same as for isTextInLanguage(String langCode, String text); 129 */ 130 public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception { 131 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file 132 133 StringBuilder text = new StringBuilder(); 134 String line = null; 135 136 137 while((line = reader.readLine()) != null) { // readLine removes newline separator 138 text.append(line + "\n"); // add back (unix style) line ending 139 } 140 return isTextInLanguage(langCode, text.toString()); 141 } 142 143 /* 144 * Need better handling of "larger" textfiles/streams of text read in: 145 * what if multiple languages with high confidence every NUM_LINES read in? 146 * Does this mean the file is multi-lingual with each section dominated by a different language? 147 * How best to convey such information to the user? 148 */ 149 /** 150 * Rudimentary attempt to deal with very large files. 151 * Return value is the same as for isTextInMaori(String text); 152 */ 153 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception { 154 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader); 155 } 156 157 /** 158 * Rudimentary attempt to deal with very large files. 
159 * Return value is the same as for isTextInLanguage(String langCode, String text); 160 */ 161 public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception { 162 // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file 163 164 final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence 165 166 StringBuilder text = new StringBuilder(); 167 String line = null; 168 169 double cumulativeConfidence = 0; 170 int numLoops = 0; 171 172 int i = 0; 173 String language = null; 174 175 while((line = reader.readLine()) != null) { // readLine removes newline separator 176 text.append(line + "\n"); // add back (unix style) line ending 177 178 i++; // read nth line of numLoop 179 180 181 if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence 182 183 184 Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 185 if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines 186 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language."); 187 } 188 language = bestLanguage.getLang(); 189 cumulativeConfidence += bestLanguage.getConfidence(); 190 191 System.err.println("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")"); 192 193 // finished analysing language of NUM_LINES of text 194 text = new StringBuilder(); 195 i = 0; 196 numLoops++; 197 } 198 } 199 200 // process any (remaining) text that was less than n NUM_LINES 201 if(!text.toString().equals("")) { 202 text.append(line + "\n"); // add back (unix style) line ending 203 i++; 204 205 Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 206 207 if(language != null && 
!bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines 208 System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language."); 209 } 210 language = bestLanguage.getLang(); 211 cumulativeConfidence += bestLanguage.getConfidence(); 212 System.err.println("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")"); 213 } 214 215 216 int totalLinesRead = numLoops * NUM_LINES + i; // not used 217 double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES 218 219 220 return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE); 221 } 222 223 224 225 /** 226 * Prints to STDOUT the predicted languages of the input text in order of descending confidence. 227 * Unused. 228 */ 229 public void predictedLanguages(String text) { 230 // Get an array with the most probable languages 231 232 Language[] languages = myCategorizer.predictLanguages(text); 233 234 if(languages == null || languages.length <= 0) { 235 System.err.println("No languages predicted for the input text"); 236 } else { 237 for(int i = 0; i < languages.length; i++) { 238 System.out.println("Language prediction " + i + ": " + languages[i]); 239 } 240 } 241 242 } 243 244 public static void printUsage() { 245 System.err.println("Run this program with:"); 246 System.err.println("\t--help (-h)\tfor seeing this usage message again"); 247 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)"); 248 System.err.println("\t--file (-f)\tto provide an input file path"); 249 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. 
[not yet implemented]"); 250 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")"); 251 System.err.println("\t\tof the predicted language that will be considered acceptable."); 252 System.err.println(); 253 System.err.println("This program terminates with exit value:"); 254 System.err.println("\t0 if the input text is in Maori"); 255 System.err.println("\t1 if input text is not in Maori"); 256 System.err.println(); 257 System.err.println("\t-1 if the input arguments were wrong"); 258 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language"); 259 System.err.println("\t2 if the user asked to run this program with --help/-h."); 260 System.err.println(); 261 } 262 263 /** 264 * The main program exits with: 265 * 0 if text is in Maori; 266 * 1 if text is not in Maori; 267 * 268 * -1 if the input arguments were wrong 269 * 255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language 270 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception? 271 * 2 if the user asked to run this program with --help/-h. 272 */ 273 public static void main(String args[]) { 274 int returnVal = -1; 275 276 // 1. 
Check input arguments 277 boolean printUsage = false; 278 boolean readFromStdIn = false; 279 File inFile = null; 280 boolean runSilent = false; 281 double minConfidence = -1; 282 283 for (int i = 0; !printUsage && i < args.length; i++) { 284 285 // check for help first and quit after printing usage 286 if(args[i].equals("--help") || args[i].equals("-h")) { 287 printUsage = true; 288 returnVal = 2; 289 } else if(args[i].equals("--silent") || args[i].equals("-s")) { 290 runSilent = true; 291 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) { 292 i++; 293 if(i >= args.length) { 294 System.err.println("ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n"); 295 printUsage = true; 296 returnVal = -1; 297 } else { 298 try { 299 minConfidence = Double.parseDouble(args[i]); 300 if(minConfidence < 0 || minConfidence > 1) { 301 throw new NumberFormatException("Number out of range, must be between 0-1"); 302 } 303 } catch(NumberFormatException nfe) { 304 System.err.println("ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n"); 305 printUsage = true; 306 returnVal = -1; 307 } 308 } 309 } else if(args[i].equals("-")) { 310 readFromStdIn = true; 311 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin 312 } else if(args[i].equals("--file") || args[i].equals("-f")) { 313 i++; 314 if(i >= args.length) { 315 System.err.println("ERROR: No input file provided with --file|-f flag.\n"); 316 printUsage = true; 317 returnVal = -1; 318 } else { 319 String filePath = args[i]; 320 inFile = new File(filePath); 321 if(!inFile.isFile()) { 322 System.err.println("ERROR: Can't read text. 
Input file argument provided does not exist or is not a file.\n"); 323 printUsage = true; 324 returnVal = -1; 325 } 326 } 327 } else { // unrecognised input argument 328 System.err.println("ERROR: Unrecognised " + i + "th argument to this program.\n"); 329 printUsage = true; 330 returnVal = -1; 331 } 332 } 333 334 if(!readFromStdIn && inFile == null) { // at least one input source must be provided 335 System.err.println("ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n"); 336 printUsage = true; 337 returnVal = -1; 338 } 339 340 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file 341 System.err.println("ERROR: instructed to read from both STDIN and from an input file. Not possible.\n"); 342 printUsage = true; 343 returnVal = -1; 344 } 345 346 if(printUsage) { 347 if(!runSilent || returnVal == 2) { // if expressly asked for help or not running silent 348 printUsage(); 349 } 350 System.exit(returnVal); 351 } 352 353 try { 354 MaoriDetector maoriTextDetector = null; 355 if(minConfidence == -1) { 356 maoriTextDetector = new MaoriDetector(runSilent); 53 357 } else { 54 for(int i = 0; i < languages.length; i++) { 55 System.out.println("Language prediction " + i + ": " + languages[i]); 56 } 57 }*/ 358 maoriTextDetector = new MaoriDetector(runSilent, minConfidence); 359 } 360 361 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); 362 boolean textIsInMaori = false; 363 364 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 365 if(inFile != null) { 366 System.err.println("Reading text from file " + inFile.getPath()); 367 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 368 textIsInMaori = maoriTextDetector.isTextInMaori(reader); 369 } // let outer try deal with any file/reading exceptions 370 } 371 else if (readFromStdIn) { 372 System.err.println("Waiting to 
read text from STDIN... (press Ctrl-D when done entering text)>"); 373 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) { 374 textIsInMaori = maoriTextDetector.isTextInMaori(reader); 375 } // let outer try deal with any file/reading exceptions 376 } 377 378 if(textIsInMaori) { 379 returnVal = 0; 380 } else { 381 returnVal = 1; 382 } 58 383 59 384 } catch(Exception e) { 60 385 e.printStackTrace(); 61 } 62 63 System.err.println("Exitting program...\n"); 64 System.exit(0); 65 } 386 387 } finally { 388 System.err.println("Exitting program with returnVal " + returnVal + "...\n"); 389 System.exit(returnVal); 390 } 391 } 392 393 // test hardcoded string 394 public static void oldMain(String args[]) { 395 int returnVal = -1; 396 boolean silentMode = false; 397 398 try { 399 MaoriDetector maoriTextDetector = new MaoriDetector(silentMode); 400 401 boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); 402 if(textIsInMaori) { 403 returnVal = 0; 404 } else { 405 returnVal = 1; 406 } 407 408 } catch(Exception e) { 409 e.printStackTrace(); 410 } finally { 411 System.err.println("Exitting program with returnVal " + returnVal + "...\n"); 412 System.exit(returnVal); 413 } 414 } 415 416 66 417 }
Note:
See TracChangeset
for help on using the changeset viewer.