source: gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java@ 33587

Last change on this file since 33587 was 33587, checked in by ak19, 5 years ago
  1. Better stats reporting on crawled sites: not just if a page was in MRI or not, but for those that contained any text, there's also reporting on how many sentences were detected as MRI (even if the overall text body of the page was not detected as being primarily MRI). This can be useful later when or if we want to store MRI language sentences/paragraphs. Currently only useful if I've implemented it sensibly. 2. MaoriTextDetector.java::getAllSentencesInMaori() and TextLanguageDetector.java::getAllSentencesInLanguage() now store the total number of sentences in the text parameter as the first element in the ArrayList returned.
File size: 12.7 KB
Line 
1/**
2 * Class that uses OpenNLP with the Language Detection Model to determine, with a default
3 * or configurable level of confidence, whether text (from a file or stdin) is in Māori or not.
4 * July 2019.
5 *
6 * Oct 2019:
7 * - Uses a Sentence Model that we trained for Māori (see bin/script/gen_SentenceDetection_model.sh)
8 * for being able to split Māori language text into sentences.
9 * - Refactored into TextLanguageDetector as base class with this class now inheriting from it.
10 */
11
12package org.greenstone.atea;
13
14import java.io.*;
15import opennlp.tools.langdetect.*;
16import opennlp.tools.sentdetect.*;
17import opennlp.tools.util.*;
18
19import java.util.ArrayList;
20
21/**
22 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation.
23 * Create a folder called "models" within the $OPENNLP_HOME folder, and put the file "langdetect-183.bin" in there
24 * (which is the language detection model zipped up and renamed to .bin extension).
25 * Ensure that the mri-sent_trained.bin sentence model for Māori that we trained also lives
26 * in the "models" folder.
27 *
28 * Then, to compile this program, do the following from the "src" folder (the folder containing this java file):
29 * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org/greenstone/atea/MaoriTextDetector.java
30 *
31 * To run this program, issue one of the following commands from the "src" folder (the folder containing this java file):
32 *
33 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --help
34 *
35 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector --file <full/path/to/textfile>
36 *
37 * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" org.greenstone.atea.MaoriTextDetector -
38 * Press enter. This variant of the program expects text to stream in from standard input.
39 * If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
40 *
41 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
42 * Also has information on how to run this class if it's in a Java package.
43 */
44public class MaoriTextDetector extends TextLanguageDetector {
45 /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
46 public static final String MAORI_3LETTER_CODE = "mri";
47
48 /** String taken from our university website, https://www.waikato.ac.nz/maori/ */
49 public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
50
51 /** test input string for a negative result */
52 public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
53
54 /** Constructor with default confidence for language detection.
55 * Uses the trained Maori sentence model.
56 */
57 public MaoriTextDetector(boolean silentMode) throws Exception {
58 super(silentMode, DEFAULT_MINIMUM_CONFIDENCE, "mri-sent_trained.bin");
59 }
60
61 /** Constructor with configurable confidence level in language detection
62 * that uses the sentence Model we trained for Māori */
63 public MaoriTextDetector(boolean silentMode, double min_confidence) throws Exception {
64 super(silentMode, min_confidence, "mri-sent_trained.bin");
65 }
66
67 /**
68 * Function that takes a text and returns those sentences in Māori.
69 * @param text: the string of text from which sentences in the requested
70 * language are to be identified and returned.
71 * @return an ArrayList where:
72 * - the first element is the total number of sentences in the text parameter
73 * - remaining elements are the sentences in the text parameter that were in the
74 * requested language.
75 */
76 public ArrayList<String> getAllSentencesInMaori(String text) {
77 // big assumption here: that we can split incoming text into sentences
78 // for any language (using the Māori language trained sentence model),
79 // despite not knowing what language those sentences are in
80 // Hinges on MRI sentences detection being similar to at least ENG equivalent
81
82
83 // we'll be storing just those sentences in text that are in Māori.
84
85 // OpenNLP language detection works best with a minimum of 2 sentences
86 // See https://opennlp.apache.org/news/model-langdetect-183.html
87 // "It is important to note that this model is trained for and works well with
88 // longer texts that have at least 2 sentences or more from the same language."
89
90 // For evaluating single languages, I used a very small data set and found that
91 // if the primary language detected is MRI AND if the confidence is >= 0.1, the
92 // results appear reasonably to be in te reo Māori.
93
94 final double confidenceCutoff = 0.1;
95 return getAllSentencesInLanguage(MAORI_3LETTER_CODE, text, confidenceCutoff);
96 }
97
98
99 /**
100 * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
101 * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
102 */
103 public boolean isTextInMaori(String text) {
104 return isTextInLanguage(MAORI_3LETTER_CODE, text);
105 }
106
107
108 /**
109 * Handle "smaller" textfiles/streams of text read in.
110 * Return value is the same as for isTextInMaori(String text);
111 */
112 public boolean isTextInMaori(BufferedReader reader) throws Exception {
113 return isTextInLanguage(MAORI_3LETTER_CODE, reader);
114 }
115
116 /*
117 * Need better handling of "larger" textfiles/streams of text read in:
118 * what if multiple languages with high confidence every NUM_LINES read in?
119 * Does this mean the file is multi-lingual with each section dominated by a different language?
120 * How best to convey such information to the user?
121 */
122 /**
123 * Rudimentary attempt to deal with very large files.
124 * Return value is the same as for isTextInMaori(String text);
125 */
126 public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
127 return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
128 }
129
130
131 /********** STATIC METHODS *************/
132
133 public static void doPrint(boolean runSilent, String msg) {
134 if(!runSilent) System.out.println(msg);
135 }
136 public static void doPrintErr(boolean runSilent, String msg) {
137 if(!runSilent) System.err.println(msg);
138 }
139
140 public static void printUsage() {
141 System.err.println("Run this program with:");
142 System.err.println("\t--help (-h)\tfor seeing this usage message again");
143 System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
144 System.err.println("\t--file (-f)\tto provide an input file path");
145 System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value.");
146 System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
147 System.err.println("\t\tof the predicted language that will be considered acceptable.");
148 System.err.println();
149 System.err.println("This program terminates with exit value:");
150 System.err.println("\t0 if the input text is in Maori");
151 System.err.println("\t1 if input text is not in Maori");
152 System.err.println();
153 System.err.println("\t-1 if the input arguments were wrong");
154 System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language");
155 System.err.println("\t2 if the user asked to run this program with --help/-h.");
156 System.err.println();
157 }
158
159 /**
160 * The main program exits with:
161 * 0 if text is in Maori;
162 * 1 if text is not in Maori;
163 *
164 * -1 if the input arguments were wrong
165 * 255(!) if an Exception occurred in instantiating the MaoriTextDetector when attempting to detect the text's language
166 * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception?
167 * 2 if the user asked to run this program with --help/-h.
168 */
169 public static void main(String args[]) {
170 int returnVal = -1;
171
172 // 1. Check input arguments
173 boolean printUsage = false;
174 boolean readFromStdIn = false;
175 File inFile = null;
176 boolean runSilent = false;
177 double minConfidence = -1;
178
179 for (int i = 0; !printUsage && i < args.length; i++) {
180
181 // check for help first and quit after printing usage
182 if(args[i].equals("--help") || args[i].equals("-h")) {
183 printUsage = true;
184 returnVal = 2;
185 } else if(args[i].equals("--silent") || args[i].equals("-s")) {
186 runSilent = true;
187 } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
188 i++;
189 if(i >= args.length) {
190 doPrintErr(runSilent, "ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
191 printUsage = true;
192 returnVal = -1;
193 } else {
194 try {
195 minConfidence = Double.parseDouble(args[i]);
196 if(minConfidence < 0 || minConfidence > 1) {
197 throw new NumberFormatException("Number out of range, must be between 0-1");
198 }
199 } catch(NumberFormatException nfe) {
200 doPrintErr(runSilent, "ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
201 printUsage = true;
202 returnVal = -1;
203 }
204 }
205 } else if(args[i].equals("-")) {
206 readFromStdIn = true;
207 //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
208 } else if(args[i].equals("--file") || args[i].equals("-f")) {
209 i++;
210 if(i >= args.length) {
211 doPrintErr(runSilent, "ERROR: No input file provided with --file|-f flag.\n");
212 printUsage = true;
213 returnVal = -1;
214 } else {
215 String filePath = args[i];
216 inFile = new File(filePath);
217 if(!inFile.isFile()) {
218 doPrintErr(runSilent, "ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
219 printUsage = true;
220 returnVal = -1;
221 }
222 }
223 } else { // unrecognised input argument
224 doPrintErr(runSilent, "ERROR: Unrecognised " + i + "th argument to this program.\n");
225 printUsage = true;
226 returnVal = -1;
227 }
228 }
229
230 if(returnVal != 2) { // returnVal == 2 for help. Only if the user did not request --help/-h, do we continue to make sure the arguments provided are sane
231 if(!readFromStdIn && inFile == null) { // at least one input source must be provided
232 doPrintErr(runSilent, "ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
233 printUsage = true;
234 returnVal = -1;
235 }
236
237 if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
238 doPrintErr(runSilent, "ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
239 printUsage = true;
240 returnVal = -1;
241 }
242 }
243
244 if(printUsage) {
245 // If not running silent print usage.
246 // OR if expressly asked for help, then it doesn't matter if we're running silent: still print usage to stderr.
247 if(returnVal == 2 || !runSilent) {
248 printUsage();
249 }
250 System.exit(returnVal);
251 }
252
253
254 // 2. Finally, we can now do the actual language detection
255 try {
256 MaoriTextDetector maoriTextDetector = null;
257 if(minConfidence == -1) {
258 maoriTextDetector = new MaoriTextDetector(runSilent);
259 } else {
260 maoriTextDetector = new MaoriTextDetector(runSilent, minConfidence);
261 }
262
263
264 //maoriTextDetector.getAllSentencesInMaori();
265
266
267 //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); // test hardcoded string
268 boolean textIsInMaori = false;
269
270 // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
271 if(inFile != null) {
272 doPrint(runSilent, "Reading text from file " + inFile.getPath());
273 try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
274 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
275 } // let outer try deal with any file/reading exceptions
276 }
277 else if (readFromStdIn) {
278 doPrint(runSilent, "Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
279 try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
280 textIsInMaori = maoriTextDetector.isTextInMaori(reader);
281 } // let outer try deal with any file/reading exceptions
282 }
283
284 if(textIsInMaori) {
285 returnVal = 0;
286 } else {
287 returnVal = 1;
288 }
289
290 } catch(Exception e) {
291 e.printStackTrace();
292
293 } finally {
294 doPrint(runSilent, "Exitting program with returnVal " + returnVal + "...\n");
295 System.exit(returnVal);
296 }
297 }
298
299}
Note: See TracBrowser for help on using the repository browser.