1 | /**
|
---|
2 | * Class that uses OpenNLP with the Language Detection Model to determine, with a default
|
---|
3 | * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
|
---|
4 | * July 2019.
|
---|
5 | *
|
---|
6 | * Oct 2019:
|
---|
7 | * - Uses a Sentence Model that we trained for Māori (see bin/script/gen_SentenceDetection_model.sh)
|
---|
8 | * for being able to split Māori language text into sentences.
|
---|
9 | * - Refactored into TextLanguageDetector as base class with this class now inheriting from it.
|
---|
10 | */
|
---|
11 |
|
---|
12 | package org.greenstone.atea;
|
---|
13 |
|
---|
14 |
|
---|
15 | import opennlp.tools.langdetect.*;
|
---|
16 | import opennlp.tools.sentdetect.*;
|
---|
17 | import opennlp.tools.util.*;
|
---|
18 |
|
---|
19 | import java.io.*;
|
---|
20 | import java.util.ArrayList;
|
---|
21 |
|
---|
22 | //import org.apache.log4j.Logger;
|
---|
23 |
|
---|
24 |
|
---|
25 | /**
|
---|
26 | * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
|
---|
27 | * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
|
---|
28 | * (which is the language detection model zipped up and renamed to .bin extension).
|
---|
29 | * Ensure that the mri-sent_trained.bin sentence model for MÄori that we trained also lives
|
---|
30 | * in the "models" folder.
|
---|
31 | *
|
---|
32 | * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
|
---|
33 | * maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
|
---|
34 | *
|
---|
35 | * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
|
---|
36 | *
|
---|
37 | * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
|
---|
38 | *
|
---|
39 | * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
|
---|
40 | *
|
---|
41 | * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
|
---|
42 | * Press enter. This variant of the program expects text to stream in from standard input.
|
---|
43 | * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
|
---|
44 | *
|
---|
45 | * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
|
---|
46 | * Also has information on how to run this class if it's in a Java package.
|
---|
47 | */
|
---|
48 | public class MaoriTextDetector extends TextLanguageDetector {
|
---|
49 | //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName());
|
---|
50 |
|
---|
51 | /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
|
---|
52 | public static final String MAORI_3LETTER_CODE = "mri";
|
---|
53 |
|
---|
54 | /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
|
---|
55 | public static final String TEST_MRI_INPUT_TEXT = "Ko tÄnei te Whare WÄnanga o Waikato e whakatau nei i ngÄ iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngÄ maunga whakaruru e tau awhi nei.";
|
---|
56 |
|
---|
57 | /** test input string for a negative result */
|
---|
58 | public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
|
---|
59 |
|
---|
60 | /** Constructor with default confidence for language detection.
|
---|
61 | * Uses the trained Maori sentence model.
|
---|
62 | */
|
---|
63 | public MaoriTextDetector(boolean silentMode) throws Exception {
|
---|
64 | super(silentMode, DEFAULT_MINIMUM_CONFIDENCE, "mri-sent_trained.bin");
|
---|
65 | }
|
---|
66 |
|
---|
67 | /** Constructor with configurable confidence level in language detection
|
---|
68 | * that uses the sentence Model we trained for MÄori */
|
---|
69 | public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
|
---|
70 | super(silentMode, min_confidence, "mri-sent_trained.bin");
|
---|
71 | }
|
---|
72 |
|
---|
73 | /**
|
---|
74 | * Function that takes a text and returns those sentences in MÄori.
|
---|
75 | * @param text: the string of text from which sentences in the requested
|
---|
76 | * language are to be identified and returned.
|
---|
77 | * @return an ArrayList where:
|
---|
78 | * - the first element is the total number of sentences in the text parameter
|
---|
79 | * - remaining elements are the sentences in the text parameter that were in the
|
---|
80 | * requested language.
|
---|
81 | */
|
---|
82 | public ArrayList<String> getAllSentencesInMaori(String text) {
|
---|
83 | // big assumption here: that we can split incoming text into sentences
|
---|
84 | // for any language (using the MÄori language trained sentence model),
|
---|
85 | // despite not knowing what language those sentences are in
|
---|
86 | // Hinges on MRI sentences detection being similar to at least ENG equivalent
|
---|
87 |
|
---|
88 |
|
---|
89 | // we'll be storing just those sentences in the text that are in MÄori.
|
---|
90 |
|
---|
91 | // OpenNLP language detection works best with a minimum of 2 sentences
|
---|
92 | // See https://opennlp.apache.org/news/model-langdetect-183.html
|
---|
93 | // "It is important to note that this model is trained for and works well with
|
---|
94 | // longer texts that have at least 2 sentences or more from the same language."
|
---|
95 |
|
---|
96 | // For evaluating single languages, I used a very small data set and found that
|
---|
97 | // if the primary language detected is MRI AND if the confidence is >= 0.1, the
|
---|
98 | // results appear reasonably to be in te reo MÄori.
|
---|
99 |
|
---|
100 | final double confidenceCutoff = 0.1;
|
---|
101 | return getAllSentencesInLanguage(MAORI_3LETTER_CODE, text, confidenceCutoff);
|
---|
102 | }
|
---|
103 |
|
---|
104 |
|
---|
105 | /**
|
---|
106 | * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
|
---|
107 | * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
|
---|
108 | */
|
---|
109 | public boolean isTextInMaori(String text) {
|
---|
110 | return isTextInLanguage(MAORI_3LETTER_CODE, text);
|
---|
111 | }
|
---|
112 |
|
---|
113 |
|
---|
114 | /**
|
---|
115 | * Handle "smaller" textfiles/streams of text read in.
|
---|
116 | * Return value is the same as for isTextInMaori(String text);
|
---|
117 | */
|
---|
118 | public boolean isTextInMaori(BufferedReader reader) throws Exception {
|
---|
119 | return isTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
120 | }
|
---|
121 |
|
---|
122 | /*
|
---|
123 | * Need better handling of "larger" textfiles/streams of text read in:
|
---|
124 | * what if multiple languages with high confidence every NUM_LINES read in?
|
---|
125 | * Does this mean the file is multi-lingual with each section dominated by a different language?
|
---|
126 | * How best to convey such information to the user?
|
---|
127 | */
|
---|
128 | /**
|
---|
129 | * Rudimentary attempt to deal with very large files.
|
---|
130 | * Return value is the same as for isTextInMaori(String text);
|
---|
131 | */
|
---|
132 | public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
|
---|
133 | return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
|
---|
134 | }
|
---|
135 |
|
---|
136 |
|
---|
137 | /********** STATIC METHODS *************/
|
---|
138 |
|
---|
139 | public static void doPrint(boolean runSilent, String msg) {
|
---|
140 | if(!runSilent) System.out.println(msg);
|
---|
141 | }
|
---|
142 | public static void doPrintErr(boolean runSilent, String msg) {
|
---|
143 | if(!runSilent) System.err.println(msg);
|
---|
144 | }
|
---|
145 |
|
---|
146 | public static void printUsage() {
|
---|
147 | System.err.println("Run this program with:");
|
---|
148 | System.err.println("\t--help (-h)\tfor seeing this usage message again");
|
---|
149 | System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
|
---|
150 | System.err.println("\t--file (-f)\tto provide an input file path");
|
---|
151 | System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value.");
|
---|
152 | System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
|
---|
153 | System.err.println("\t\tof the predicted language that will be considered acceptable.");
|
---|
154 | System.err.println();
|
---|
155 | System.err.println("This program terminates with exit value:");
|
---|
156 | System.err.println("\t0 if the input text is in Maori");
|
---|
157 | System.err.println("\t1 if input text is not in Maori");
|
---|
158 | System.err.println();
|
---|
159 | System.err.println("\t-1 if the input arguments were wrong");
|
---|
160 | System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
|
---|
161 | System.err.println("\t2 if the user asked to run this program with --help/-h.");
|
---|
162 | System.err.println();
|
---|
163 | }
|
---|
164 |
|
---|
165 | /**
|
---|
166 | * The main program exits with:
|
---|
167 | * 0 if text is in Maori;
|
---|
168 | * 1 if text is not in Maori;
|
---|
169 | *
|
---|
170 | * -1 if the input arguments were wrong
|
---|
171 | * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
|
---|
172 | * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
|
---|
173 | * 2 if the user asked to run this program with --help/-h.
|
---|
174 | */
|
---|
175 | public static void main(String args[]) {
|
---|
176 | int returnVal = -1;
|
---|
177 |
|
---|
178 | // 1. Check input arguments
|
---|
179 | boolean printUsage = false;
|
---|
180 | boolean readFromStdIn = false;
|
---|
181 | File inFile = null;
|
---|
182 | boolean runSilent = false;
|
---|
183 | double minConfidence = -1;
|
---|
184 |
|
---|
185 | for (int i = 0; !printUsage && i < args.length; i++) {
|
---|
186 |
|
---|
187 | // check for help first and quit after printing usage
|
---|
188 | if(args[i].equals("--help") || args[i].equals("-h")) {
|
---|
189 | printUsage = true;
|
---|
190 | returnVal = 2;
|
---|
191 | } else if(args[i].equals("--silent") || args[i].equals("-s")) {
|
---|
192 | runSilent = true;
|
---|
193 | } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
|
---|
194 | i++;
|
---|
195 | if(i >= args.length) {
|
---|
196 | doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
|
---|
197 | printUsage = true;
|
---|
198 | returnVal = -1;
|
---|
199 | } else {
|
---|
200 | try {
|
---|
201 | minConfidence = Double.parseDouble(args[i]);
|
---|
202 | if(minConfidence < 0 || minConfidence > 1) {
|
---|
203 | throw new NumberFormatException("Number out of range, must be between 0-1");
|
---|
204 | }
|
---|
205 | } catch(NumberFormatException nfe) {
|
---|
206 | doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
|
---|
207 | printUsage = true;
|
---|
208 | returnVal = -1;
|
---|
209 | }
|
---|
210 | }
|
---|
211 | } else if(args[i].equals("-")) {
|
---|
212 | readFromStdIn = true;
|
---|
213 | //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
|
---|
214 | } else if(args[i].equals("--file") || args[i].equals("-f")) {
|
---|
215 | i++;
|
---|
216 | if(i >= args.length) {
|
---|
217 | doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
|
---|
218 | printUsage = true;
|
---|
219 | returnVal = -1;
|
---|
220 | } else {
|
---|
221 | String filePath = args[i];
|
---|
222 | inFile = new File(filePath);
|
---|
223 | if(!inFile.isFile()) {
|
---|
224 | doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
|
---|
225 | printUsage = true;
|
---|
226 | returnVal = -1;
|
---|
227 | }
|
---|
228 | }
|
---|
229 | } else { // unrecognised input argument
|
---|
230 | doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
|
---|
231 | printUsage = true;
|
---|
232 | returnVal = -1;
|
---|
233 | }
|
---|
234 | }
|
---|
235 |
|
---|
236 | if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
|
---|
237 | if(!readFromStdIn && inFile == null) { // at least one input source must be provided
|
---|
238 | doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
|
---|
239 | printUsage = true;
|
---|
240 | returnVal = -1;
|
---|
241 | }
|
---|
242 |
|
---|
243 | if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
|
---|
244 | doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
|
---|
245 | printUsage = true;
|
---|
246 | returnVal = -1;
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | if(printUsage) {
|
---|
251 | // If not running silent print usage.
|
---|
252 | // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
|
---|
253 | if(returnVal == 2 || !runSilent) {
|
---|
254 | printUsage();
|
---|
255 | }
|
---|
256 | System.exit(returnVal);
|
---|
257 | }
|
---|
258 |
|
---|
259 |
|
---|
260 | // 2. Finally, we can now do the actual language detection
|
---|
261 | try {
|
---|
262 | MaoriTextDetector maoriTextDetector = null;
|
---|
263 | if(minConfidence == -1) {
|
---|
264 | maoriTextDetector = new MaoriTextDetector(runSilent);
|
---|
265 | } else {
|
---|
266 | maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
|
---|
267 | }
|
---|
268 |
|
---|
269 |
|
---|
270 | //maoriTextDetector.getAllSentencesInMaori();
|
---|
271 |
|
---|
272 |
|
---|
273 | //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
|
---|
274 | boolean textIsInMaori = false;
|
---|
275 |
|
---|
276 | // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
|
---|
277 | if(inFile != null) {
|
---|
278 | doPrint(runSilent, "Reading text from file " + inFile.getPath());
|
---|
279 | try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
|
---|
280 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
281 | } // let outer try deal with any file/reading exceptions
|
---|
282 | }
|
---|
283 | else if (readFromStdIn) {
|
---|
284 | doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
|
---|
285 | try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
|
---|
286 | textIsInMaori = maoriTextDetector.isTextInMaori(reader);
|
---|
287 | } // let outer try deal with any file/reading exceptions
|
---|
288 | }
|
---|
289 |
|
---|
290 | if(textIsInMaori) {
|
---|
291 | returnVal = 0;
|
---|
292 | } else {
|
---|
293 | returnVal = 1;
|
---|
294 | }
|
---|
295 |
|
---|
296 | } catch(Exception e) {
|
---|
297 | e.printStackTrace();
|
---|
298 |
|
---|
299 | } finally {
|
---|
300 | doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
|
---|
301 | System.exit(returnVal);
|
---|
302 | }
|
---|
303 | }
|
---|
304 |
|
---|
305 | }
|
---|