/** * Class that uses OpenNLP with the Language Detection Model to determine, with a default * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not. * July 2019. * * Oct 2019: * - Uses a Sentence Model that we trained for Māori (see bin/script/gen_SentenceDetection_model.sh) * for being able to split Māori language text into sentences. * - Refactored into TextLanguageDetector as base class with this class now inheriting from it. */ package org.greenstone.atea; import opennlp.tools.langdetect.*; import opennlp.tools.sentdetect.*; import opennlp.tools.util.*; import java.io.*; import java.util.ArrayList; //import org.apache.log4j.Logger; /** * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation. * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there * (which is the language detection model zipped up and renamed to .bin extension). * Ensure that the mri-sent_trained.bin sentence model for Māori that we trained also lives * in the "models" folder. * * Then, to compile this program, do the following from the "src" folder (the folder containing this java file): * maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java * * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file): * * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help * * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile> * * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector - * Press enter. This variant of the program expects text to stream in from standard input. * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. * * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath * Also has information on how to run this class if it's in a Java package. */ public class MaoriTextDetector extends TextLanguageDetector { //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName()); /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */ public static final String MAORI_3LETTER_CODE = "mri"; /** String taken from our university website, https://www.waikato.ac.nz/maori/ */ public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; /** test input string for a negative result */ public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language"; /** Constructor with default confidence for language detection. * Uses the trained Maori sentence model. */ public MaoriTextDetector(boolean silentMode) throws Exception { super(silentMode, DEFAULT_MINIMUM_CONFIDENCE, "mri-sent_trained.bin"); } /** Constructor with configurable confidence level in language detection * that uses the sentence Model we trained for Māori */ public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception { super(silentMode, min_confidence, "mri-sent_trained.bin"); } /** * Function that takes a text and returns those sentences in Māori. * @param text: the string of text from which sentences in the requested * language are to be identified and returned. * @return an ArrayList where: * - the first element is the total number of sentences in the text parameter * - remaining elements are the sentences in the text parameter that were in the * requested language. */ public ArrayList getAllSentencesInMaori(String text) { // big assumption here: that we can split incoming text into sentences // for any language (using the Māori language trained sentence model), // despite not knowing what language those sentences are in // Hinges on MRI sentences detection being similar to at least ENG equivalent // we'll be storing just those sentences in the text that are in Māori. // OpenNLP language detection works best with a minimum of 2 sentences // See https://opennlp.apache.org/news/model-langdetect-183.html // "It is important to note that this model is trained for and works well with // longer texts that have at least 2 sentences or more from the same language." // For evaluating single languages, I used a very small data set and found that // if the primary language detected is MRI AND if the confidence is >= 0.1, the // results appear reasonably to be in te reo Māori. final double confidenceCutoff = 0.1; return getAllSentencesInLanguage(MAORI_3LETTER_CODE, text, confidenceCutoff); } /** * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). */ public boolean isTextInMaori(String text) { return isTextInLanguage(MAORI_3LETTER_CODE, text); } /** * Handle "smaller" textfiles/streams of text read in. * Return value is the same as for isTextInMaori(String text); */ public boolean isTextInMaori(BufferedReader reader) throws Exception { return isTextInLanguage(MAORI_3LETTER_CODE, reader); } /* * Need better handling of "larger" textfiles/streams of text read in: * what if multiple languages with high confidence every NUM_LINES read in? * Does this mean the file is multi-lingual with each section dominated by a different language? * How best to convey such information to the user? */ /** * Rudimentary attempt to deal with very large files. * Return value is the same as for isTextInMaori(String text); */ public boolean isLargeTextInMaori(BufferedReader reader) throws Exception { return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader); } /********** STATIC METHODS *************/ public static void doPrint(boolean runSilent, String msg) { if(!runSilent) System.out.println(msg); } public static void doPrintErr(boolean runSilent, String msg) { if(!runSilent) System.err.println(msg); } public static void printUsage() { System.err.println("Run this program with:"); System.err.println("\t--help (-h)\tfor seeing this usage message again"); System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)"); System.err.println("\t--file (-f)\tto provide an input file path"); System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value."); System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")"); System.err.println("\t\tof the predicted language that will be considered acceptable."); System.err.println(); System.err.println("This program terminates with exit value:"); System.err.println("\t0 if the input text is in Maori"); System.err.println("\t1 if input text is not in Maori"); System.err.println(); System.err.println("\t-1 if the input arguments were wrong"); System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language"); System.err.println("\t2 if the user asked to run this program with --help/-h."); System.err.println(); } /** * The main program exits with: * 0 if text is in Maori; * 1 if text is not in Maori; * * -1 if the input arguments were wrong * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception? * 2 if the user asked to run this program with --help/-h. */ public static void main(String args[]) { int returnVal = -1; // 1. Check input arguments boolean printUsage = false; boolean readFromStdIn = false; File inFile = null; boolean runSilent = false; double minConfidence = -1; for (int i = 0; !printUsage && i < args.length; i++) { // check for help first and quit after printing usage if(args[i].equals("--help") || args[i].equals("-h")) { printUsage = true; returnVal = 2; } else if(args[i].equals("--silent") || args[i].equals("-s")) { runSilent = true; } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) { i++; if(i >= args.length) { doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n"); printUsage = true; returnVal = -1; } else { try { minConfidence = Double.parseDouble(args[i]); if(minConfidence < 0 || minConfidence > 1) { throw new NumberFormatException("Number out of range, must be between 0-1"); } } catch(NumberFormatException nfe) { doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n"); printUsage = true; returnVal = -1; } } } else if(args[i].equals("-")) { readFromStdIn = true; //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin } else if(args[i].equals("--file") || args[i].equals("-f")) { i++; if(i >= args.length) { doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n"); printUsage = true; returnVal = -1; } else { String filePath = args[i]; inFile = new File(filePath); if(!inFile.isFile()) { doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n"); printUsage = true; returnVal = -1; } } } else { // unrecognised input argument doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n"); printUsage = true; returnVal = -1; } } if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane if(!readFromStdIn && inFile == null) { // at least one input source must be provided doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file ).\n"); printUsage = true; returnVal = -1; } if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n"); printUsage = true; returnVal = -1; } } if(printUsage) { // If not running silent print usage. // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr. if(returnVal == 2 || !runSilent) { printUsage(); } System.exit(returnVal); } // 2. Finally, we can now do the actual language detection try { MaoriTextDetector maoriTextDetector = null; if(minConfidence == -1) { maoriTextDetector = new MaoriTextDetector(runSilent); } else { maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence); } //maoriTextDetector.getAllSentencesInMaori(); //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string boolean textIsInMaori = false; // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html if(inFile != null) { doPrint(runSilent, "Reading text from file " + inFile.getPath()); try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { textIsInMaori = maoriTextDetector.isTextInMaori(reader); } // let outer try deal with any file/reading exceptions } else if (readFromStdIn) { doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>"); try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) { textIsInMaori = maoriTextDetector.isTextInMaori(reader); } // let outer try deal with any file/reading exceptions } if(textIsInMaori) { returnVal = 0; } else { returnVal = 1; } } catch(Exception e) { e.printStackTrace(); } finally { doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n"); System.exit(returnVal); } } }