Changeset 33336

Show
Ignore:
Timestamp:
20.07.2019 22:58:17 (5 weeks ago)
Author:
ak19
Message:

Major rewrite to make this class more useful to callers. MaoriDetector.java, soon to be renamed, can now be called with a filepath to a txt file or with a hyphen to indicate it should read text from standard input (terminate with Ctrl-D as usual). It will then predict whether or not the text is in Maori, depending on the minimum confidence cut-off value, which presently defaults to 0.5. Still need to support silent mode and update the README with instructions. The language detection model file that OpenNLP needs in order to do the language detection must now live at OPENNLP_HOME/models/langdetect-183.bin

Location:
gs3-extensions/maori-lang-detection/src
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/MaoriDetector.java

    r33335 r33336  
    1515 
    1616/** 
    17  * Run as: 
    18  *    wharariki:[115]/Scratch/ak19/openNLP-lang-detect/src>javac -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector.java 
    19  *    wharariki:[116]/Scratch/ak19/openNLP-lang-detect/src>java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector 
     17 * EXPORT OPENNLP_HOME environment variable to be your apache OpenNLP installation. 
     18 * Then, to compile this program: 
     19 *    maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar" MaoriDetector.java 
     20 * To run this program, one of: 
     21 * 
     22 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --help 
     23 * 
     24 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector --file <full/path/to/textfile> 
     25 * 
     26 *    maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/*" MaoriDetector - 
     27 *       which expects text to stream in from standard input. 
     28 *       If entering text manually, then remember to press Ctrl-D to indicate the usual end of StdIn. 
    2029 * 
    2130 * https://stackoverflow.com/questions/219585/including-all-the-jars-in-a-directory-within-the-java-classpath 
     
    2332 */ 
    2433public class MaoriDetector { 
    25  
     34    /** The 3 letter language code for Maori in ISO 639-2 or ISO 639-3 */  
     35    public static final String MAORI_3LETTER_CODE = "mri"; 
     36    public static final double DEFAULT_MINIMUM_CONFIDENCE = 0.50; 
     37 
     38    /** Configurable: cut off minimum confidence value, 
     39    greater or equal to which determines that the best predicted language is acceptable to user of MaoriDetector. */ 
     40    public final double MINIMUM_CONFIDENCE; 
     41    /** silentMode set to false means MaoriDetector will print helpful messages while running. Set to true to run silently. */ 
     42    public final boolean silentMode; 
     43 
     44    /** Language Detection Model file for OpenNLP is expected to be at $OPENNLP_HOME/models/langdetect-183.bin */ 
     45    private final String LANG_DETECT_MODEL_RELATIVE_PATH = "models" + File.separator + "langdetect-183.bin"; 
     46    private LanguageDetector myCategorizer = null; 
     47     
    2648    /**  
    27      * Taken from our university website 
     49     * String taken from our university website 
    2850     * https://www.waikato.ac.nz/maori/ 
    2951     */ 
    30     public static final String TEST_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; 
    31      
    32     public static void main(String args[]) { 
     52    public static final String TEST_MRI_INPUT_TEXT = "Ko tēnei te Whare Wānanga o Waikato e whakatau nei i ngā iwi o te ao, ki roto i te riu o te awa e rere nei, ki runga i te whenua e hora nei, ki raro i te taumaru o ngā maunga whakaruru e tau awhi nei."; 
     53 
     54    /** test input string for a negative result */ 
     55    public static final String TEST_ENG_INPUT_TEXT = "The main program exits with -1 if an Exception occurred when attempting to detect the text's language"; 
     56     
     57     
     58    public MaoriDetector(boolean silentMode) throws Exception { 
     59    this(silentMode, DEFAULT_MINIMUM_CONFIDENCE); 
     60    } 
     61     
     62    public MaoriDetector(boolean silentMode, double min_confidence) throws Exception { 
     63    this.silentMode = silentMode; 
     64    this.MINIMUM_CONFIDENCE = min_confidence; 
     65 
     66    // 1. Check we can find the Language Detect Model file in the correct location (check that $OPENNLP_HOME/models/langdetect-183.bin exists); 
     67    String langDetectModelPath = System.getenv("OPENNLP_HOME"); 
     68    if(System.getenv("OPENNLP_HOME") == null) { 
     69        throw new Exception("\n\t*** Environment variable OPENNLP_HOME must be set to your Apache OpenNLP installation folder."); 
     70    }    
     71    langDetectModelPath = langDetectModelPath + File.separator + LANG_DETECT_MODEL_RELATIVE_PATH; 
     72    File langDetectModelBinFile = new File(langDetectModelPath); 
     73    if(!langDetectModelBinFile.exists()) { 
     74        throw new Exception("\n\t*** " + langDetectModelBinFile.getPath() + " doesn't exist." 
     75                + "\n\t*** Ensure the $OPENNLP_HOME folder contains a 'models' folder with the model file 'langdetect-183.bin' in it."); 
     76    } 
     77 
     78 
     79    // 2. Set up our language detector Model and the Categorizer for language predictions based on the Model. 
    3380    // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#intro.api 
    3481    // https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 
    35     try (InputStream modelIn = new FileInputStream("/Scratch/ak19/openNLP-lang-detect/langdetect-183.bin")) { 
     82    try (InputStream modelIn = new FileInputStream(langDetectModelPath)) { 
    3683 
    3784        LanguageDetectorModel model = new LanguageDetectorModel(modelIn); 
    3885 
    3986        // http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.langdetect 
    40         LanguageDetector myCategorizer = new LanguageDetectorME(model); 
    41  
    42         // Get the most probable language 
    43         Language bestLanguage = myCategorizer.predictLanguage(TEST_INPUT_TEXT); 
    44         System.out.println("Best language: " + bestLanguage.getLang()); 
    45         System.out.println("Best language confidence: " + bestLanguage.getConfidence()); 
    46  
    47          
    48         // Get an array with the most probable languages 
    49         Language[] languages = myCategorizer.predictLanguages(TEST_INPUT_TEXT); 
    50         /* 
    51         if(languages == null || languages.length <= 0) { 
    52         System.err.println("No languages predicted for the input text"); 
     87        this.myCategorizer = new LanguageDetectorME(model); 
     88    }/*catch(Exception e) { 
     89        e.printStackTrace(); 
     90        }*/ 
     91     
     92    // instantiating function should handle critical exceptions. Constructors shouldn't. 
     93    }     
     94 
     95    /** 
     96     * @return true if the input text is Maori (mri) with MINIMUM_CONFIDENCE levels of confidence (if set, 
     97     * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). 
     98     */ 
     99    public boolean isTextInMaori(String text) {  
     100    return isTextInLanguage(MAORI_3LETTER_CODE, text); 
     101    } 
     102 
     103    /** @param langCode is 3 letter language code, ISO 639-2/3  
     104     * https://www.loc.gov/standards/iso639-2/php/code_list.php 
     105     * https://en.wikipedia.org/wiki/ISO_639-3 
     106     * @return true if the input text is in the language denoted by langCode with MINIMUM_CONFIDENCE levels of confidence (if set, 
     107     * else DEFAULT_MINIMUM_CONFIDENCE levels of confidence). 
     108     */ 
     109    public boolean isTextInLanguage(String langCode, String text) { 
     110    // Get the most probable language 
     111    Language bestLanguage = myCategorizer.predictLanguage(text); 
     112    System.out.println("Best language: " + bestLanguage.getLang()); 
     113    System.out.println("Best language confidence: " + bestLanguage.getConfidence()); 
     114 
     115    return (bestLanguage.getLang().equals(langCode) && bestLanguage.getConfidence() >= this.MINIMUM_CONFIDENCE); 
     116    } 
     117     
     118     
     119    /** 
     120     * Handle "smaller" textfiles/streams of text read in. 
     121     * Return value is the same as for isTextInMaori(String text); 
     122     */ 
     123    public boolean isTextInMaori(BufferedReader reader) throws Exception { 
     124    return isTextInLanguage(MAORI_3LETTER_CODE, reader); 
     125    } 
     126    /** 
     127     * Handle "smaller" textfiles/streams of text read in. 
     128     * Return value is the same as for isTextInLanguage(String langCode, String text); 
     129     */ 
     130    public boolean isTextInLanguage(String langCode, BufferedReader reader) throws Exception { 
     131    // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file 
     132     
     133    StringBuilder text = new StringBuilder(); 
     134    String line = null; 
     135 
     136     
     137    while((line = reader.readLine()) != null) { // readLine removes newline separator 
     138        text.append(line + "\n"); // add back (unix style) line ending 
     139    } 
     140    return isTextInLanguage(langCode, text.toString()); 
     141    } 
     142     
     143    /* 
     144     * Need better handling of "larger" textfiles/streams of text read in: 
     145     * what if multiple languages with high confidence every NUM_LINES read in? 
     146     * Does this mean the file is multi-lingual with each section dominated by a different language? 
     147     * How best to convey such information to the user? 
     148     */ 
     149    /** 
     150     * Rudimentary attempt to deal with very large files. 
     151     * Return value is the same as for isTextInMaori(String text); 
     152     */ 
     153    public boolean isLargeTextInMaori(BufferedReader reader) throws Exception { 
     154    return isLargeTextInLanguage(MAORI_3LETTER_CODE, reader); 
     155    } 
     156 
     157    /** 
     158     * Rudimentary attempt to deal with very large files. 
     159     * Return value is the same as for isTextInLanguage(String langCode, String text); 
     160     */     
     161    public boolean isLargeTextInLanguage(String langCode, BufferedReader reader) throws Exception { 
     162    // https://stackoverflow.com/questions/326390/how-do-i-create-a-java-string-from-the-contents-of-a-file 
     163     
     164    final int NUM_LINES = 100; // arbitrary 100 lines read, predict language, calculate confidence 
     165 
     166    StringBuilder text = new StringBuilder(); 
     167    String line = null; 
     168     
     169    double cumulativeConfidence = 0; 
     170    int numLoops = 0; 
     171     
     172    int i = 0; 
     173    String language = null; 
     174     
     175    while((line = reader.readLine()) != null) { // readLine removes newline separator 
     176        text.append(line + "\n"); // add back (unix style) line ending 
     177         
     178        i++; // read nth line of numLoop 
     179         
     180         
     181        if(i == NUM_LINES) { // arbitrary 100 lines read, predict language, calculate confidence 
     182         
     183         
     184        Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 
     185        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines 
     186            System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");            
     187        } 
     188        language = bestLanguage.getLang(); 
     189        cumulativeConfidence += bestLanguage.getConfidence(); 
     190         
     191        System.err.println("Best predicted language for last " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")");  
     192         
     193        // finished analysing language of NUM_LINES of text 
     194        text = new StringBuilder(); 
     195        i = 0; 
     196        numLoops++; 
     197        }        
     198    } 
     199     
     200    // process any (remaining) text that was less than n NUM_LINES 
     201    if(!text.toString().equals("")) { 
     202        text.append(line + "\n"); // add back (unix style) line ending       
     203        i++; 
     204         
     205        Language bestLanguage = myCategorizer.predictLanguage(text.toString()); 
     206         
     207        if(language != null && !bestLanguage.getLang().equals(language)) { // predicted lang of current n lines not the same as predicted lang for prev n lines 
     208        System.err.println("**** WARNING: text seems to contain content in multiple languages or unable to consistently predict the same language.");            
     209        } 
     210        language = bestLanguage.getLang(); 
     211        cumulativeConfidence += bestLanguage.getConfidence(); 
     212        System.err.println("Best predicted language for final " + NUM_LINES + " lines: " + language + "(confidence: " + bestLanguage.getConfidence() + ")"); 
     213    } 
     214     
     215     
     216    int totalLinesRead = numLoops * NUM_LINES + i; // not used 
     217    double avgConfidence = cumulativeConfidence/(numLoops + 1); // not quite the average as the text processed outside the loop may have fewer lines than NUM_LINES 
     218     
     219     
     220    return (language.equals(langCode) && avgConfidence >= this.MINIMUM_CONFIDENCE); 
     221    } 
     222 
     223     
     224 
     225    /** 
     226     * Prints to STDOUT the predicted languages of the input text in order of descending confidence. 
     227     * Unused. 
     228     */ 
     229    public void predictedLanguages(String text) { 
     230    // Get an array with the most probable languages 
     231     
     232    Language[] languages = myCategorizer.predictLanguages(text); 
     233     
     234    if(languages == null || languages.length <= 0) { 
     235        System.err.println("No languages predicted for the input text"); 
     236    } else { 
     237        for(int i = 0; i < languages.length; i++) { 
     238        System.out.println("Language prediction " + i + ": " + languages[i]); 
     239        } 
     240    } 
     241     
     242    } 
     243 
     244    public static void printUsage() { 
     245    System.err.println("Run this program with:"); 
     246    System.err.println("\t--help (-h)\tfor seeing this usage message again"); 
     247    System.err.println("\t-\tto have input text read from STDIN (as always, hit Ctrl-D to mark end of text stream)"); 
     248    System.err.println("\t--file (-f)\tto provide an input file path"); 
     249    System.err.println("\t--silent (-s): optional, to run silently and just exit with exit value. [not yet implemented]"); 
     250    System.err.println("\t--min-confidence (-c): optional, to override the default minimum confidence value (" + DEFAULT_MINIMUM_CONFIDENCE + ")"); 
     251    System.err.println("\t\tof the predicted language that will be considered acceptable."); 
     252    System.err.println(); 
     253    System.err.println("This program terminates with exit value:"); 
     254    System.err.println("\t0 if the input text is in Maori"); 
     255    System.err.println("\t1 if input text is not in Maori"); 
     256    System.err.println(); 
     257    System.err.println("\t-1 if the input arguments were wrong"); 
     258    System.err.println("\t255(!) if an Exception occurred in instantiating the MaoriDetector when attempting to detect the text's language"); 
     259    System.err.println("\t2 if the user asked to run this program with --help/-h."); 
     260    System.err.println(); 
     261    } 
     262 
     263    /** 
     264     * The main program exits with: 
     265     *    0 if text is in Maori; 
     266     *    1 if text is not in Maori; 
     267     * 
     268     *    -1 if the input arguments were wrong 
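     269     *    255(!) if an Exception occurred in instantiating the MaoriDetector or when attempting to detect the text's language 
     270     * QTODO: why does the program exit value end up as 255 and not -1 when returnVal remains at -1 on Exception? (Presumably because the operating system reports only the low 8 bits of the exit status, so -1 shows up as 255 - to be confirmed.) 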
     271     *    2 if the user asked to run this program with --help/-h. 
     272     */ 
     273    public static void main(String args[]) { 
     274    int returnVal = -1; 
     275 
     276    // 1. Check input arguments 
     277    boolean printUsage = false; 
     278    boolean readFromStdIn = false; 
     279    File inFile = null; 
     280    boolean runSilent = false; 
     281    double minConfidence = -1; 
     282 
     283    for (int i = 0; !printUsage && i < args.length; i++) { 
     284         
     285        // check for help first and quit after printing usage 
     286        if(args[i].equals("--help") || args[i].equals("-h")) { 
     287        printUsage = true; 
     288        returnVal = 2;       
     289        } else if(args[i].equals("--silent") || args[i].equals("-s")) { 
     290        runSilent = true; 
     291        } else if(args[i].equals("--min-confidence") || args[i].equals("-c")) { 
     292        i++; 
     293        if(i >= args.length) { 
     294            System.err.println("ERROR: No minimum confidence value provided with --min-confidence|-c flag.\n"); 
     295            printUsage = true; 
     296            returnVal = -1; 
     297        } else { 
     298            try { 
     299            minConfidence = Double.parseDouble(args[i]); 
     300            if(minConfidence < 0 || minConfidence > 1) { 
     301                throw new NumberFormatException("Number out of range, must be between 0-1"); 
     302            } 
     303            } catch(NumberFormatException nfe) { 
     304            System.err.println("ERROR: value for min-confidence is the wrong format or out of range. It must be a (decimal point) number between 0-1.\n"); 
     305            printUsage = true; 
     306            returnVal = -1; 
     307            } 
     308        } 
     309        } else if(args[i].equals("-")) { 
     310        readFromStdIn = true; 
     311        //break; // don't bother continuing to check input arguments for any --file flag if we're told to read from stdin 
     312        } else if(args[i].equals("--file") || args[i].equals("-f")) { 
     313        i++; 
     314        if(i >= args.length) { 
     315            System.err.println("ERROR: No input file provided with --file|-f flag.\n"); 
     316            printUsage = true; 
     317            returnVal = -1; 
     318        } else { 
     319            String filePath = args[i]; 
     320            inFile = new File(filePath); 
     321            if(!inFile.isFile()) { 
     322            System.err.println("ERROR: Can't read text. Input file argument provided does not exist or is not a file.\n"); 
     323            printUsage = true; 
     324            returnVal = -1; 
     325            } 
     326        } 
     327        } else { // unrecognised input argument 
     328        System.err.println("ERROR: Unrecognised " + i + "th argument to this program.\n"); 
     329        printUsage = true; 
     330        returnVal = -1; 
     331        } 
     332    } 
     333 
     334    if(!readFromStdIn && inFile == null) { // at least one input source must be provided 
     335        System.err.println("ERROR: must specify source to read text from, either STDIN (-) or input file (--file <file>).\n"); 
     336        printUsage = true; 
     337        returnVal = -1; 
     338    } 
     339     
     340    if(readFromStdIn && inFile != null) { // this program can't be asked to read from stdin and from an input file 
     341        System.err.println("ERROR: instructed to read from both STDIN and from an input file. Not possible.\n"); 
     342        printUsage = true; 
     343        returnVal = -1; 
     344    } 
     345     
     346    if(printUsage) { 
     347        if(!runSilent || returnVal == 2) { // if expressly asked for help or not running silent 
     348        printUsage(); 
     349        } 
     350        System.exit(returnVal); 
     351    }    
     352     
     353    try { 
     354        MaoriDetector maoriTextDetector = null; 
     355        if(minConfidence == -1) { 
     356        maoriTextDetector = new MaoriDetector(runSilent); 
    53357        } else { 
    54         for(int i = 0; i < languages.length; i++) { 
    55             System.out.println("Language prediction " + i + ": " + languages[i]); 
    56         } 
    57         }*/ 
     358        maoriTextDetector = new MaoriDetector(runSilent, minConfidence); 
     359        } 
     360         
     361        //boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); 
     362        boolean textIsInMaori = false; 
     363         
     364        // Using try with resources, https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html 
     365        if(inFile != null) { 
     366        System.err.println("Reading text from file " + inFile.getPath()); 
     367        try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) { 
     368            textIsInMaori = maoriTextDetector.isTextInMaori(reader); 
     369        } // let outer try deal with any file/reading exceptions 
     370        } 
     371        else if (readFromStdIn) { 
     372        System.err.println("Waiting to read text from STDIN... (press Ctrl-D when done entering text)>"); 
     373        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in))) { 
     374            textIsInMaori = maoriTextDetector.isTextInMaori(reader);             
     375        } // let outer try deal with any file/reading exceptions 
     376        } 
     377     
     378        if(textIsInMaori) { 
     379        returnVal = 0; 
     380        } else { 
     381        returnVal = 1; 
     382        } 
    58383         
    59384    } catch(Exception e) { 
    60385        e.printStackTrace(); 
    61     } 
    62      
    63     System.err.println("Exitting program...\n"); 
    64     System.exit(0); 
    65     } 
     386         
     387    } finally { 
     388        System.err.println("Exitting program with returnVal " + returnVal + "...\n"); 
     389        System.exit(returnVal); 
     390    } 
     391    } 
     392 
     393    // test hardcoded string 
     394    public static void oldMain(String args[]) { 
     395    int returnVal = -1; 
     396    boolean silentMode = false; 
     397     
     398    try { 
     399        MaoriDetector maoriTextDetector = new MaoriDetector(silentMode); 
     400         
     401        boolean textIsInMaori = maoriTextDetector.isTextInMaori(TEST_MRI_INPUT_TEXT); 
     402        if(textIsInMaori) { 
     403        returnVal = 0; 
     404        } else { 
     405        returnVal = 1; 
     406        } 
     407         
     408    } catch(Exception e) { 
     409        e.printStackTrace(); 
     410    } finally { 
     411        System.err.println("Exitting program with returnVal " + returnVal + "...\n"); 
     412        System.exit(returnVal); 
     413    } 
     414    } 
     415 
     416     
    66417}