source: other-projects/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java@ 33652

Last change on this file since 33652 was 33615, checked in by ak19, 5 years ago
  1. Worked out how to configure log4j to log both to console and logfile, so modified the existing laboured code to use this better way. 2. Added some Mongodb links under MoreReading.
File size: 12.9 KB
Line 
1/**
2 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
3 * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
4 * July 2019.
5 *
6 * Oct 2019:
7 * - Uses a Sentence Model that we trained for Māori (see bin/script/gen_SentenceDetection_model.sh)
8 * for being able to split Māori language text into sentences.
9 * - Refactored into TextLanguageDetector as base class with this class now inheriting from it.
10 */
11
12package org.greenstone.atea;
13
14
15import opennlp.tools.langdetect.*;
16import opennlp.tools.sentdetect.*;
17import opennlp.tools.util.*;
18
19import java.io.*;
20import java.util.ArrayList;
21
22//import org.apache.log4j.Logger;
23
24
25/**
26 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
27 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
28 * (which is the language detection model zipped up and renamed to .bin extension).
29 * Ensure that the mri-sent_trained.bin sentence model for Māori that we trained also lives
30 * in the "models" folder.
31 *
32 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
33 * maori-lang-detection/src$ javac -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
34 *
35 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
36 *
37 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
38 *
39 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
40 *
41 * maori-lang-detection/src$ java -cp ".:../conf:../lib/*:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
42 * Press enter. This variant of the program expects text to stream in from standard input.
43 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
44 *
45 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
46 * Also has information on how to run this class if it's in a Java package.
47 */
48public class MaoriTextDetector extends TextLanguageDetector {
49 //static Logger logger = Logger.getLogger(org.greenstone.atea.MaoriTextDetector.class.getName());
50
51 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
52 public static final String MAORI_3LETTER_CODE = "mri";
53
54 /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
55 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
56
57 /** test input string for a negative result */
58 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
59
60 /** Constructor with default confidence for language detection.
61 * Uses the trained Maori sentence model.
62 */
63 public MaoriTextDetector(boolean silentMode) throws Exception {
64 super(silentMode, DEFAULT_MINIMUM_CONFIDENCE, "mri-sent_trained.bin");
65 }
66
67 /** Constructor with configurable confidence level in language detection
68 * that uses the sentence Model we trained for Māori */
69 public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
70 super(silentMode, min_confidence, "mri-sent_trained.bin");
71 }
72
73 /**
74 * Function that takes a text and returns those sentences in Māori.
75 * @param text: the string of text from which sentences in the requested
76 * language are to be identified and returned.
77 * @return an ArrayList where:
78 * - the first element is the total number of sentences in the text parameter
79 * - remaining elements are the sentences in the text parameter that were in the
80 * requested language.
81 */
82 public ArrayList<String> getAllSentencesInMaori(String text) {
83 // big assumption here: that we can split incoming text into sentences
84 // for any language (using the Māori language trained sentence model),
85 // despite not knowing what language those sentences are in
86 // Hinges on MRI sentences detection being similar to at least ENG equivalent
87
88
89 // we'll be storing just those sentences in the text that are in Māori.
90
91 // OpenNLP language detection works best with a minimum of 2 sentences
92 // See https://opennlp.apache.org/news/model-langdetect-183.html
93 // "It is important to note that this model is trained for and works well with
94 // longer texts that have at least 2 sentences or more from the same language."
95
96 // For evaluating single languages, I used a very small data set and found that
97 // if the primary language detected is MRI AND if the confidence is >= 0.1, the
98 // results appear reasonably to be in te reo Māori.
99
100 final double confidenceCutoff = 0.1;
101 return getAllSentencesInLanguage(MAORI_3LETTER_CODE, text, confidenceCutoff);
102 }
103
104
105 /**
106 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
107 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
108 */
109 public boolean isTextInMaori(String text) {
110 return isTextInLanguage(MAORI_3LETTER_CODE, text);
111 }
112
113
114 /**
115 * Handle "smaller" textfiles/streams of text read in.
116 * Return value is the same as for isTextInMaori(String text);
117 */
118 public boolean isTextInMaori(BufferedReader reader) throws Exception {
119 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
120 }
121
122 /*
123 * Need better handling of "larger" textfiles/streams of text read in:
124 * what if multiple languages with high confidence every NUM_LINES read in?
125 * Does this mean the file is multi-lingual with each section dominated by a different language?
126 * How best to convey such information to the user?
127 */
128 /**
129 * Rudimentary attempt to deal with very large files.
130 * Return value is the same as for isTextInMaori(String text);
131 */
132 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
133 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
134 }
135
136
137 /********** STATIC METHODS *************/
138
139 public static void doPrint(boolean runSilent, String msg) {
140 if(!runSilent) System.out.println(msg);
141 }
142 public static void doPrintErr(boolean runSilent, String msg) {
143 if(!runSilent) System.err.println(msg);
144 }
145
146 public static void printUsage() {
147 System.err.println("Run this program with:");
148 System.err.println("\t--help (-h)\tfor seeing this usage message again");
149 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
150 System.err.println("\t--file (-f)\tto provide an input file path");
151 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value.");
152 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
153 System.err.println("\t\tof the predicted language that will be considered acceptable.");
154 System.err.println();
155 System.err.println("This program terminates with exit value:");
156 System.err.println("\t0 if the input text is in Maori");
157 System.err.println("\t1 if input text is not in Maori");
158 System.err.println();
159 System.err.println("\t-1 if the input arguments were wrong");
160 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
161 System.err.println("\t2 if the user asked to run this program with --help/-h.");
162 System.err.println();
163 }
164
165 /**
166 * The main program exits with:
167 * 0 if text is in Maori;
168 * 1 if text is not in Maori;
169 *
170 * -1 if the input arguments were wrong
171 * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
172 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
173 * 2 if the user asked to run this program with --help/-h.
174 */
175 public static void main(String args[]) {
176 int returnVal = -1;
177
178 // 1. Check input arguments
179 boolean printUsage = false;
180 boolean readFromStdIn = false;
181 File inFile = null;
182 boolean runSilent = false;
183 double minConfidence = -1;
184
185 for (int i = 0; !printUsage && i < args.length; i++) {
186
187 // check for help first and quit after printing usage
188 if(args[i].equals("--help") || args[i].equals("-h")) {
189 printUsage = true;
190 returnVal = 2;
191 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
192 runSilent = true;
193 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
194 i++;
195 if(i >= args.length) {
196 doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
197 printUsage = true;
198 returnVal = -1;
199 } else {
200 try {
201 minConfidence = Double.parseDouble(args[i]);
202 if(minConfidence < 0 || minConfidence > 1) {
203 throw new NumberFormatException("Number out of range, must be between 0-1");
204 }
205 } catch(NumberFormatException nfe) {
206 doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
207 printUsage = true;
208 returnVal = -1;
209 }
210 }
211 } else if(args[i].equals("-")) {
212 readFromStdIn = true;
213 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
214 } else if(args[i].equals("--file") || args[i].equals("-f")) {
215 i++;
216 if(i >= args.length) {
217 doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
218 printUsage = true;
219 returnVal = -1;
220 } else {
221 String filePath = args[i];
222 inFile = new File(filePath);
223 if(!inFile.isFile()) {
224 doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
225 printUsage = true;
226 returnVal = -1;
227 }
228 }
229 } else { // unrecognised input argument
230 doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
231 printUsage = true;
232 returnVal = -1;
233 }
234 }
235
236 if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
237 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
238 doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
239 printUsage = true;
240 returnVal = -1;
241 }
242
243 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
244 doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
245 printUsage = true;
246 returnVal = -1;
247 }
248 }
249
250 if(printUsage) {
251 // If not running silent print usage.
252 // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
253 if(returnVal == 2 || !runSilent) {
254 printUsage();
255 }
256 System.exit(returnVal);
257 }
258
259
260 // 2. Finally, we can now do the actual language detection
261 try {
262 MaoriTextDetector maoriTextDetector = null;
263 if(minConfidence == -1) {
264 maoriTextDetector = new MaoriTextDetector(runSilent);
265 } else {
266 maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
267 }
268
269
270 //maoriTextDetector.getAllSentencesInMaori();
271
272
273 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
274 boolean textIsInMaori = false;
275
276 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
277 if(inFile != null) {
278 doPrint(runSilent, "Reading text from file " + inFile.getPath());
279 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
280 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
281 } // let outer try deal with any file/reading exceptions
282 }
283 else if (readFromStdIn) {
284 doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
285 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
286 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
287 } // let outer try deal with any file/reading exceptions
288 }
289
290 if(textIsInMaori) {
291 returnVal = 0;
292 } else {
293 returnVal = 1;
294 }
295
296 } catch(Exception e) {
297 e.printStackTrace();
298
299 } finally {
300 doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
301 System.exit(returnVal);
302 }
303 }
304
305}
Note: See TracBrowser for help on using the repository browser.