/** * Class that uses OpenNLP with the Language Detection Model to determine, with a default * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not. * Internal functions can be used for detecting any of the 103 languages currently supported by * the OpenNLP Language Detection Model. * * http://opennlp.apache.org/news/model-langdetect-183.html * language detector model: http://opennlp.apache.org/models.html * Pre-trained models for OpenNLP 1.5: http://opennlp.sourceforge.net/models-1.5/ * Use of Apache OpenNLP in general: * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api * Use of OpenNLP for language detection: * http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect * * This code was based on the information and sample code at the above links and the links dispersed throughout this file. * See also the accompanying README file. * * July 2019 */ package org.greenstone.atea; import java.io.*; import opennlp.tools.langdetect.*; import opennlp.tools.util.*; /** * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation. * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there * (which is the language detection model zipped up and renamed to .bin extension). * * Then, to compile this program, do the following from the "src" folder (the folder containing this java file): * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java * * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file): * * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help * * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile> * * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector - * Press enter. This variant of the program expects text to stream in from standard input. * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. * * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath * Also has information on how to run this class if it's in a Java package. */ public class MaoriTextDetector { /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */ public static final String MAORI_3LETTER_CODE = "mri"; public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50; /** Configurable: cut off minimum confidence value, greater or equal to which determines that the best predicted language is acceptable to user of MaoriTextDetector. */ public final double MINIMUM_CONFIDENCE; /** silentMode set to false means MaoriTextDetector won't print helpful messages while running. Set to true to run silently. */ public final boolean silentMode; /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */ private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin"; /** * The LanguageDetectorModel object that will do the actual language detection/prediction for us. * Created once in the constructor, can be used as often as needed thereafter. */ private LanguageDetector myCategorizer = null; /** String taken from our university website, https://www.waikato.ac.nz/maori/ */ public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; /** test input string for a negative result */ public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language"; public MaoriTextDetector(boolean silentMode) throws Exception { this(silentMode, DEFAULT_MINIMUM_CONFIDENCE); } public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception { this.silentMode = silentMode; this.MINIMUM_CONFIDENCE = min_confidence; // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists); String langDetectModelPath = System.getenv("OPENNLP_HOME"); if(System.getenv("OPENNLP_HOME") == null) { throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder."); } langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH; File langDetectModelBinFile = new File(langDetectModelPath); if(!langDetectModelBinFile.exists()) { throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist." + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it."); } // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model. // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html try (InputStream modelIn = new FileInputStream(langDetectModelPath)) { LanguageDetectorModel model = new LanguageDetectorModel(modelIn); // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect this.myCategorizer = new LanguageDetectorME(model); }/*catch(Exception e) { e.printStackTrace(); }*/ // instantiating function should handle critical exceptions. Constructors shouldn't. } /** * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). */ public boolean isTextInMaori(String text) { return isTextInLanguage(MAORI_3LETTER_CODE, text); } /** @param langCode is 3 letter language code, ISO 639-2/3 * https://www.loc.gov/standards/iso639-2/php/code_list.php * https://en.wikipedia.org/wiki/ISO_639-3 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). */ public boolean isTextInLanguage(String langCode, String text) { // Get the most probable language Language bestLanguage = myCategorizer.predictLanguage(text); doPrint("Best language: " + bestLanguage.getLang()); doPrint("Best language confidence: " + bestLanguage.getConfidence()); return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE); } /** * Handle "smaller" textfiles/streams of text read in. * Return value is the same as for isTextInMaori(String text); */ public boolean isTextInMaori(BufferedReader reader) throws Exception { return isTextInLanguage(MAORI_3LETTER_CODE, reader); } /** * Handle "smaller" textfiles/streams of text read in. * Return value is the same as for isTextInLanguage(String langCode, String text); */ public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception { // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file StringBuilder text = new StringBuilder(); String line = null; while((line = reader.readLine()) != null) { // readLine removes newline separator text.append(line + "\n"); // add back (unix style) line ending } return isTextInLanguage(langCode, text.toString()); } /* * Need better handling of "larger" textfiles/streams of text read in: * what if multiple languages with high confidence every NUM_LINES read in? * Does this mean the file is multi-lingual with each section dominated by a different language? * How best to convey such information to the user? */ /** * Rudimentary attempt to deal with very large files. * Return value is the same as for isTextInMaori(String text); */ public boolean isLargeTextInMaori(BufferedReader reader) throws Exception { return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader); } /** * Rudimentary attempt to deal with very large files. * Return value is the same as for isTextInLanguage(String langCode, String text); */ public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception { // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence StringBuilder text = new StringBuilder(); String line = null; double cumulativeConfidence = 0; int numLoops = 0; int i = 0; String language = null; while((line = reader.readLine()) != null) { // readLine removes newline separator text.append(line + "\n"); // add back (unix style) line ending i++; // read nth line of numLoop if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence Language bestLanguage = myCategorizer.predictLanguage(text.toString()); if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language."); } language = bestLanguage.getLang(); cumulativeConfidence += bestLanguage.getConfidence(); doPrintErr("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")"); // finished analysing language of NUM_LINES of text text = new StringBuilder(); i = 0; numLoops++; } } // process any (remaining) text that was less than n NUM_LINES if(!text.toString().equals("")) { text.append(line + "\n"); // add back (unix style) line ending i++; Language bestLanguage = myCategorizer.predictLanguage(text.toString()); if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines doPrintErr("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language."); } language = bestLanguage.getLang(); cumulativeConfidence += bestLanguage.getConfidence(); doPrintErr("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")"); } int totalLinesRead = numLoops * NUM_LINES + i; // not used double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE); } /** * Prints to STDOUT the predicted languages of the input text in order of descending confidence. * UNUSED. */ public void predictedLanguages(String text) { // Get an array with the most probable languages Language[] languages = myCategorizer.predictLanguages(text); if(languages == null || languages.length <= 0) { doPrintErr("No languages predicted for the input text"); } else { for(int i = 0; i < languages.length; i++) { doPrint("Language prediction " + i + ": " + languages[i]); } } } public void doPrint(String msg) { doPrint(this.silentMode, msg); } public void doPrintErr(String msg) { doPrintErr(this.silentMode, msg); } /********** STATIC METHODS *************/ public static void doPrint(boolean runSilent, String msg) { if(!runSilent) System.out.println(msg); } public static void doPrintErr(boolean runSilent, String msg) { if(!runSilent) System.err.println(msg); } public static void printUsage() { System.err.println("Run this program with:"); System.err.println("\t--help (-h)\tfor seeing this usage message again"); System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)"); System.err.println("\t--file (-f)\tto provide an input file path"); System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]"); System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")"); System.err.println("\t\tof the predicted language that will be considered acceptable."); System.err.println(); System.err.println("This program terminates with exit value:"); System.err.println("\t0 if the input text is in Maori"); System.err.println("\t1 if input text is not in Maori"); System.err.println(); System.err.println("\t-1 if the input arguments were wrong"); System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language"); System.err.println("\t2 if the user asked to run this program with --help/-h."); System.err.println(); } /** * The main program exits with: * 0 if text is in Maori; * 1 if text is not in Maori; * * -1 if the input arguments were wrong * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception? * 2 if the user asked to run this program with --help/-h. */ public static void main(String args[]) { int returnVal = -1; // 1. Check input arguments boolean printUsage = false; boolean readFromStdIn = false; File inFile = null; boolean runSilent = false; double minConfidence = -1; for (int i = 0; !printUsage && i < args.length; i++) { // check for help first and quit after printing usage if(args[i].equals("--help") || args[i].equals("-h")) { printUsage = true; returnVal = 2; } else if(args[i].equals("--silent") || args[i].equals("-s")) { runSilent = true; } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) { i++; if(i >= args.length) { doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n"); printUsage = true; returnVal = -1; } else { try { minConfidence = Double.parseDouble(args[i]); if(minConfidence < 0 || minConfidence > 1) { throw new NumberFormatException("Number out of range, must be between 0-1"); } } catch(NumberFormatException nfe) { doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n"); printUsage = true; returnVal = -1; } } } else if(args[i].equals("-")) { readFromStdIn = true; //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin } else if(args[i].equals("--file") || args[i].equals("-f")) { i++; if(i >= args.length) { doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n"); printUsage = true; returnVal = -1; } else { String filePath = args[i]; inFile = new File(filePath); if(!inFile.isFile()) { doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n"); printUsage = true; returnVal = -1; } } } else { // unrecognised input argument doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n"); printUsage = true; returnVal = -1; } } if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane if(!readFromStdIn && inFile == null) { // at least one input source must be provided doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file ).\n"); printUsage = true; returnVal = -1; } if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n"); printUsage = true; returnVal = -1; } } if(printUsage) { // If not running silent print usage. // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr. if(returnVal == 2 || !runSilent) { printUsage(); } System.exit(returnVal); } // 2. Finally, we can now do the actual language detection try { MaoriTextDetector maoriTextDetector = null; if(minConfidence == -1) { maoriTextDetector = new MaoriTextDetector(runSilent); } else { maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence); } //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string boolean textIsInMaori = false; // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html if(inFile != null) { doPrint(runSilent, "Reading text from file " + inFile.getPath()); try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { textIsInMaori = maoriTextDetector.isTextInMaori(reader); } // let outer try deal with any file/reading exceptions } else if (readFromStdIn) { doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>"); try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) { textIsInMaori = maoriTextDetector.isTextInMaori(reader); } // let outer try deal with any file/reading exceptions } if(textIsInMaori) { returnVal = 0; } else { returnVal = 1; } } catch(Exception e) { e.printStackTrace(); } finally { doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n"); System.exit(returnVal); } } }