Changeset 33336 for gs3-extensions


Ignore:
Timestamp:
2019-07-20T22:58:17+12:00 (5 years ago)
Author:
ak19
Message:

Major rewrite to make this class more useful to callers. MaoriDetector.java, soon to be renamed, can now be called with a file path to a text file, or with a hyphen to indicate that it should read text from standard input (terminate with Ctrl-D as usual). It then predicts whether or not the text is in Maori, based on the minimum confidence cut-off value, which presently defaults to 0.5. Still need to support silent mode and update the README with instructions. The language detection model file that OpenNLP needs for language detection must now live at $OPENNLP_HOME/models/langdetect-183.bin

Location:
gs3-extensions/maori-lang-detection/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/MaoriDetector.java

    r33335 r33336  
    1515
    1616/**
    17  * Run as:
    18  *    wharariki:[115]/Scratch/ak19/openNLP-lang-detect/src>javac -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector.java
    19  *    wharariki:[116]/Scratch/ak19/openNLP-lang-detect/src>java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector
     17 * Export the OPENNLP_HOME environment variable, set to your Apache OpenNLP installation folder.
     18 * Then, to compile this program:
     19 *    maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriDetector.java
     20 * To run this program, one of:
     21 *
     22 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --help
     23 *
     24 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --file <full/path/to/textfile>
     25 *
     26 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector -
     27 *       which expects text to stream in from standard input.
     28 *       If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn.
    2029 *
    2130 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath
     
    2332 */
    2433public class MaoriDetector {
    25 
     34    /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */
     35    public static final String MAORI_3LETTER_CODE = "mri";
     36    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50;
     37
     38    /** Configurable minimum confidence cut-off value: the best predicted language is acceptable
     39    to the user of MaoriDetector only if its confidence is greater than or equal to this value. */
     40    public final double MINIMUM_CONFIDENCE;
     41    /** silentMode set to false means MaoriDetector will print helpful messages while running. Set to true to run silently (not yet implemented). */
     42    public final boolean silentMode;
     43
     44    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */
     45    private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin";
     46    private LanguageDetector myCategorizer = null;
     47   
    2648    /**
    27      * Taken from our university website
     49     * String taken from our university website
    2850     * https://www.waikato.ac.nz/maori/
    2951     */
    30     public static final String TEST_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
    31    
    32     public static void main(String args[]) {
     52    public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei.";
     53
     54    /** test input string for a negative result */
     55    public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language";
     56   
     57   
     58    public MaoriDetector(boolean silentMode) throws Exception {
     59    this(silentMode, DEFAULT_MINIMUM_CONFIDENCE);
     60    }
     61   
     62    public MaoriDetector(boolean silentMode, double min_confidence) throws Exception {
     63    this.silentMode = silentMode;
     64    this.MINIMUM_CONFIDENCE = min_confidence;
     65
     66    // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists);
     67    String langDetectModelPath = System.getenv("OPENNLP_HOME");
     68    if(System.getenv("OPENNLP_HOME") == null) {
     69        throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder.");
     70    }   
     71    langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH;
     72    File langDetectModelBinFile = new File(langDetectModelPath);
     73    if(!langDetectModelBinFile.exists()) {
     74        throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist."
     75                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it.");
     76    }
     77
     78
     79    // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model.
    3380    // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api
    3481    // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
    35     try (InputStream modelIn = new FileInputStream("/Scratch/ak19/openNLP-lang-detect/langdetect-183.bin")) {
     82    try (InputStream modelIn = new FileInputStream(langDetectModelPath)) {
    3683
    3784        LanguageDetectorModel model = new LanguageDetectorModel(modelIn);
    3885
    3986        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect
    40         LanguageDetector myCategorizer = new LanguageDetectorME(model);
    41 
    42         // Get the most probable language
    43         Language bestLanguage = myCategorizer.predictLanguage(TEST_INPUT_TEXT);
    44         System.out.println("Best language: " + bestLanguage.getLang());
    45         System.out.println("Best language confidence: " + bestLanguage.getConfidence());
    46 
    47        
    48         // Get an array with the most probable languages
    49         Language[] languages = myCategorizer.predictLanguages(TEST_INPUT_TEXT);
    50         /*
    51         if(languages == null || languages.length <= 0) {
    52         System.err.println("No languages predicted for the input text");
     87        this.myCategorizer = new LanguageDetectorME(model);
     88    }/*catch(Exception e) {
     89        e.printStackTrace();
     90        }*/
     91   
     92    // instantiating function should handle critical exceptions. Constructors shouldn't.
     93    }   
     94
     95    /**
     96     * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set,
     97     * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
     98     */
     99    public boolean isTextInMaori(String text) {
     100    return isTextInLanguage(MAORI_3LETTER_CODE, text);
     101    }
     102
     103    /** @param langCode is 3 letter language code, ISO 639-2/3
     104     * https://www.loc.gov/standards/iso639-2/php/code_list.php
     105     * https://en.wikipedia.org/wiki/ISO_639-3
     106     * @return true if the best predicted language of the input text is langCode with MINIMUM_CONFIDENCE levels of confidence (if set,
     107     * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence).
     108     */
     109    public boolean isTextInLanguage(String langCode, String text) {
     110    // Get the most probable language
     111    Language bestLanguage = myCategorizer.predictLanguage(text);
     112    System.out.println("Best language: " + bestLanguage.getLang());
     113    System.out.println("Best language confidence: " + bestLanguage.getConfidence());
     114
     115    return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE);
     116    }
     117   
     118   
     119    /**
     120     * Handle "smaller" textfiles/streams of text read in.
     121     * Return value is the same as for isTextInMaori(String text);
     122     */
     123    public boolean isTextInMaori(BufferedReader reader) throws Exception {
     124    return isTextInLanguage(MAORI_3LETTER_CODE, reader);
     125    }
     126    /**
     127     * Handle "smaller" textfiles/streams of text read in.
     128     * Return value is the same as for isTextInLanguage(String langCode, String text);
     129     */
     130    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception {
     131    // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
     132   
     133    StringBuilder text = new StringBuilder();
     134    String line = null;
     135
     136   
     137    while((line = reader.readLine()) != null) { // readLine removes newline separator
     138        text.append(line + "\n"); // add back (unix style) line ending
     139    }
     140    return isTextInLanguage(langCode, text.toString());
     141    }
     142   
     143    /*
     144     * Need better handling of "larger" textfiles/streams of text read in:
     145     * what if multiple languages with high confidence every NUM_LINES read in?
     146     * Does this mean the file is multi-lingual with each section dominated by a different language?
     147     * How best to convey such information to the user?
     148     */
     149    /**
     150     * Rudimentary attempt to deal with very large files.
     151     * Return value is the same as for isTextInMaori(String text);
     152     */
     153    public boolean isLargeTextInMaori(BufferedReader reader) throws Exception {
     154    return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader);
     155    }
     156
     157    /**
     158     * Rudimentary attempt to deal with very large files.
     159     * Return value is the same as for isTextInLanguage(String langCode, String text);
     160     */   
     161    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception {
     162    // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file
     163   
     164    final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence
     165
     166    StringBuilder text = new StringBuilder();
     167    String line = null;
     168   
     169    double cumulativeConfidence = 0;
     170    int numLoops = 0;
     171   
     172    int i = 0;
     173    String language = null;
     174   
     175    while((line = reader.readLine()) != null) { // readLine removes newline separator
     176        text.append(line + "\n"); // add back (unix style) line ending
     177       
     178        i++; // read nth line of numLoop
     179       
     180       
     181        if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence
     182       
     183       
     184        Language bestLanguage = myCategorizer.predictLanguage(text.toString());
     185        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
     186            System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");           
     187        }
     188        language = bestLanguage.getLang();
     189        cumulativeConfidence += bestLanguage.getConfidence();
     190       
     191        System.err.println("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
     192       
     193        // finished analysing language of NUM_LINES of text
     194        text = new StringBuilder();
     195        i = 0;
     196        numLoops++;
     197        }       
     198    }
     199   
     200    // process any (remaining) text that was less than n NUM_LINES
     201    if(!text.toString().equals("")) {
     202        text.append(line + "\n"); // add back (unix style) line ending     
     203        i++;
     204       
     205        Language bestLanguage = myCategorizer.predictLanguage(text.toString());
     206       
     207        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines
     208        System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");           
     209        }
     210        language = bestLanguage.getLang();
     211        cumulativeConfidence += bestLanguage.getConfidence();
     212        System.err.println("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");
     213    }
     214   
     215   
     216    int totalLinesRead = numLoops * NUM_LINES + i; // not used
     217    double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES
     218   
     219   
     220    return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE);
     221    }
     222
     223   
     224
     225    /**
     226     * Prints to STDOUT the predicted languages of the input text in order of descending confidence.
     227     * Unused.
     228     */
     229    public void predictedLanguages(String text) {
     230    // Get an array with the most probable languages
     231   
     232    Language[] languages = myCategorizer.predictLanguages(text);
     233   
     234    if(languages == null || languages.length <= 0) {
     235        System.err.println("No languages predicted for the input text");
     236    } else {
     237        for(int i = 0; i < languages.length; i++) {
     238        System.out.println("Language prediction " + i + ": " + languages[i]);
     239        }
     240    }
     241   
     242    }
     243
     244    public static void printUsage() {
     245    System.err.println("Run this program with:");
     246    System.err.println("\t--help (-h)\tfor seeing this usage message again");
     247    System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)");
     248    System.err.println("\t--file (-f)\tto provide an input file path");
     249    System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]");
     250    System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")");
     251    System.err.println("\t\tof the predicted language that will be considered acceptable.");
     252    System.err.println();
     253    System.err.println("This program terminates with exit value:");
     254    System.err.println("\t0 if the input text is in Maori");
     255    System.err.println("\t1 if input text is not in Maori");
     256    System.err.println();
     257    System.err.println("\t-1 if the input arguments were wrong");
     258    System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language");
     259    System.err.println("\t2 if the user asked to run this program with --help/-h.");
     260    System.err.println();
     261    }
     262
     263    /**
     264     * The main program exits with:
     265     *    0 if text is in Maori;
     266     *    1 if text is not in Maori;
     267     *
     268     *    -1 if the input arguments were wrong
     269     *    255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language
     270     * Note: when returnVal remains at -1 on Exception, the exit value reported by a Unix shell is 255 and not -1, since only the low 8 bits of the exit status are reported (-1 & 0xFF == 255).
     271     *    2 if the user asked to run this program with --help/-h.
     272     */
     273    public static void main(String args[]) {
     274    int returnVal = -1;
     275
     276    // 1. Check input arguments
     277    boolean printUsage = false;
     278    boolean readFromStdIn = false;
     279    File inFile = null;
     280    boolean runSilent = false;
     281    double minConfidence = -1;
     282
     283    for (int i = 0; !printUsage && i < args.length; i++) {
     284       
     285        // check for help first and quit after printing usage
     286        if(args[i].equals("--help") || args[i].equals("-h")) {
     287        printUsage = true;
     288        returnVal = 2;     
     289        } else if(args[i].equals("--silent") || args[i].equals("-s")) {
     290        runSilent = true;
     291        } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) {
     292        i++;
     293        if(i >= args.length) {
     294            System.err.println("ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n");
     295            printUsage = true;
     296            returnVal = -1;
     297        } else {
     298            try {
     299            minConfidence = Double.parseDouble(args[i]);
     300            if(minConfidence < 0 || minConfidence > 1) {
     301                throw new NumberFormatException("Number out of range, must be between 0-1");
     302            }
     303            } catch(NumberFormatException nfe) {
     304            System.err.println("ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n");
     305            printUsage = true;
     306            returnVal = -1;
     307            }
     308        }
     309        } else if(args[i].equals("-")) {
     310        readFromStdIn = true;
     311        //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin
     312        } else if(args[i].equals("--file") || args[i].equals("-f")) {
     313        i++;
     314        if(i >= args.length) {
     315            System.err.println("ERROR: No input file provided with --file|-f flag.\n");
     316            printUsage = true;
     317            returnVal = -1;
     318        } else {
     319            String filePath = args[i];
     320            inFile = new File(filePath);
     321            if(!inFile.isFile()) {
     322            System.err.println("ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n");
     323            printUsage = true;
     324            returnVal = -1;
     325            }
     326        }
     327        } else { // unrecognised input argument
     328        System.err.println("ERROR: Unrecognised " + i + "th argument to this program.\n");
     329        printUsage = true;
     330        returnVal = -1;
     331        }
     332    }
     333
     334    if(!readFromStdIn && inFile == null) { // at least one input source must be provided
     335        System.err.println("ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n");
     336        printUsage = true;
     337        returnVal = -1;
     338    }
     339   
     340    if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file
     341        System.err.println("ERROR: instructed to read from both STDIN and from an input file. Not possible.\n");
     342        printUsage = true;
     343        returnVal = -1;
     344    }
     345   
     346    if(printUsage) {
     347        if(!runSilent || returnVal == 2) { // if expressly asked for help or not running silent
     348        printUsage();
     349        }
     350        System.exit(returnVal);
     351    }   
     352   
     353    try {
     354        MaoriDetector maoriTextDetector = null;
     355        if(minConfidence == -1) {
     356        maoriTextDetector = new MaoriDetector(runSilent);
    53357        } else {
    54         for(int i = 0; i < languages.length; i++) {
    55             System.out.println("Language prediction " + i + ": " + languages[i]);
    56         }
    57         }*/
     358        maoriTextDetector = new MaoriDetector(runSilent, minConfidence);
     359        }
     360       
     361        //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
     362        boolean textIsInMaori = false;
     363       
     364        // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html
     365        if(inFile != null) {
     366        System.err.println("Reading text from file " + inFile.getPath());
     367        try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
     368            textIsInMaori = maoriTextDetector.isTextInMaori(reader);
     369        } // let outer try deal with any file/reading exceptions
     370        }
     371        else if (readFromStdIn) {
     372        System.err.println("Waiting to read text from STDIN... (press Ctrl-D when done entering text)>");
     373        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) {
     374            textIsInMaori = maoriTextDetector.isTextInMaori(reader);           
     375        } // let outer try deal with any file/reading exceptions
     376        }
     377   
     378        if(textIsInMaori) {
     379        returnVal = 0;
     380        } else {
     381        returnVal = 1;
     382        }
    58383       
    59384    } catch(Exception e) {
    60385        e.printStackTrace();
    61     }
    62    
    63     System.err.println("Exitting program...\n");
    64     System.exit(0);
    65     }
     386       
     387    } finally {
     388        System.err.println("Exitting program with returnVal " + returnVal + "...\n");
     389        System.exit(returnVal);
     390    }
     391    }
     392
     393    // test hardcoded string
     394    public static void oldMain(String args[]) {
     395    int returnVal = -1;
     396    boolean silentMode = false;
     397   
     398    try {
     399        MaoriDetector maoriTextDetector = new MaoriDetector(silentMode);
     400       
     401        boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT);
     402        if(textIsInMaori) {
     403        returnVal = 0;
     404        } else {
     405        returnVal = 1;
     406        }
     407       
     408    } catch(Exception e) {
     409        e.printStackTrace();
     410    } finally {
     411        System.err.println("Exitting program with returnVal " + returnVal + "...\n");
     412        System.exit(returnVal);
     413    }
     414    }
     415
     416   
    66417}
Note: See TracChangeset for help on using the changeset viewer.