#!/bin/bash
#####################################################################################################
# Trains an OpenNLP sentence-detector model on Maori sentences, then tries the
# model out on a sample text.
#
# Following instructions at
#   https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
# But contrary to their example, don't insert empty lines separating the
# sentences in the training file: each sentence just needs to be on its own
# line, with no empty lines in between.
#
# Usage:
#   <this-script> [--help | SENTENCES_FILE]
#
#   SENTENCES_FILE  Tab-separated file, one "<number><TAB><sentence>" per line.
#                   When omitted, the 2011 Leipzig 100k Maori sentences file is
#                   used; it is unpacked from mri-opennlp-corpus.tar.gz or, if
#                   the tarball is absent, fetched with svn.
#
# Environment:
#   OPENNLP_HOME    OpenNLP installation to use. When unset, the first
#                   ./apache-opennlp-*/ directory is used.
#
# Outputs (inside $OPENNLP_HOME):
#   training_data/mri-sent.train   one sentence per line
#   models/mri-sent_trained.bin    the trained sentence-detector model
#
# Exit status: 0 on success (and for --help), non-zero on any failure.
#
# Need to run this script from the top level folder of this extension.
#####################################################################################################

# Treat references to unset variables as errors (catches misspelt names).
set -u

# Train on the 2011 set (rather than the 2017 set) of 100k Maori language
# sentences, because the 2011 one appears to have fewer accidentally
# incorporated English sentences.
readonly DEFAULT_INFILE=./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt
readonly CORPUS_TARBALL=mri-opennlp-corpus.tar.gz
readonly SAMPLE_TEXT=models-trainingdata-and-sampletxts/sample_mri_paragraphs.txt

# Print each argument as one line on stderr, then abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Print the help text on stdout.
usage() {
  echo ""
  echo "Usage: $0 [--help | <sentences-file>]"
  echo "       Try the one in ./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt"
  echo "       (Unzip mri-opennlp-corpus.tar.gz)"
  echo "       It's better than the 100k sentences from 2017, as that contains a lot of English lines."
  echo ""
}

# Sparse-checkout just what we need of the opennlp corpus from svn.
# Based on https://www.apache.org/dist/opennlp/models/langdetect/1.8.3/README.txt
# To auto-confirm the repeated svn https check, see
# https://serverfault.com/questions/37929/how-do-you-accept-an-ssl-certificate-through-the-svn-command-line
#
# The body is a subshell ( ... ), so the directory changes never leak into the
# caller, even when a step fails half way through.
# Returns: 0 on success, 1 as soon as a cd or svn step fails.
fetch_corpus_from_svn() (
  svn co --depth immediates --trust-server-cert --non-interactive \
    https://svn.apache.org/repos/bigdata/opennlp/trunk opennlp-corpus || exit 1

  cd opennlp-corpus || exit 1
  svn up --set-depth immediates --trust-server-cert --non-interactive || exit 1

  cd leipzig || exit 1
  svn up --set-depth immediates --trust-server-cert --non-interactive || exit 1

  cd resources/ || exit 1
  svn up --set-depth infinity --trust-server-cert --non-interactive || exit 1

  cd ../data || exit 1
  echo "About to download the sample Maori language 100k sentences file. This may take a while (<1 minute)."
  svn up --trust-server-cert --non-interactive mri_web_2011_100K-sentences.txt || exit 1

  cd .. || exit 1 # now in opennlp-corpus/leipzig
  # This helper script is not used below, so failing to mark it executable is
  # only worth a warning.
  chmod u+x create_langdetect_model.sh ||
    echo "Warning: could not make create_langdetect_model.sh executable." >&2
)

# Make sure the default sentences file exists at the expected location.
# If not, check we have the tarball containing it and untar that. If the
# tarball doesn't exist, try to get what we want from svn if we have svn.
# If no svn either, bail.
ensure_default_corpus() {
  if [ -f "$DEFAULT_INFILE" ]; then
    return 0
  fi

  if [ -f "$CORPUS_TARBALL" ]; then
    tar -xvzf "$CORPUS_TARBALL" ||
      die "Failed to unpack $CORPUS_TARBALL." "Exitting..."
  elif ! command -v svn > /dev/null 2>&1; then
    die "Attempted to train the Sentence Detector on Maori sentences on missing mri_web_2011_100K-sentences.txt" \
      "The tarball mri-opennlp-corpus.tar.gz (containing mri_web_2011_100K-sentences.txt) didn't exist either." \
      "And couldn't get opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt using SVN: no SVN." \
      "Exitting..."
  else
    fetch_corpus_from_svn ||
      die "Failed to get opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt using SVN." \
        "Exitting..."
  fi

  # Neither tar nor svn guarantees that the one file we need actually arrived.
  [ -f "$DEFAULT_INFILE" ] ||
    die "Still missing $DEFAULT_INFILE after trying to obtain it." "Exitting..."
}

# ---------------------------------------------------------------------------
# 1. Create training file of Maori sentences in the correct format
# ---------------------------------------------------------------------------

if [ -n "${1:-}" ]; then
  if [ "$1" = "--help" ]; then
    usage
    exit 0
  fi
  infile=$1
else
  ensure_default_corpus
  infile=$DEFAULT_INFILE
fi

echo "$infile"
#tail -100 "$infile"

[ -f "$infile" ] || die "Input sentences file not found: $infile" "Exitting..."

# Ensure OPENNLP_HOME is set
if [ -z "${OPENNLP_HOME:-}" ]; then
  echo "OPENNLP_HOME not set, attempting to set it to the local apache-opennlp (v1.9.1). ENSURE THIS EXISTS OR SET OPENNLP_HOME YOURSELF!"

  # The trailing slash restricts the glob to directories, so a downloaded
  # apache-opennlp-*.tar.gz sitting next to the unpacked folder is ignored.
  # When several directories match, the first one (in glob order) wins.
  opennlp_dir=
  for candidate in apache-opennlp-*/; do
    if [ -d "$candidate" ]; then
      opennlp_dir=$candidate
      break
    fi
  done

  if [ -z "$opennlp_dir" ]; then
    die "No OPENNLP_HOME set and could not find a subfolder 'apache-opennlp-...' to set it to." \
      "Set OPENNLP_HOME yourself before running this script. Exitting..."
  fi

  # Resolve to an absolute path without changing this script's own directory.
  OPENNLP_HOME=$(cd "$opennlp_dir" && pwd) ||
    die "Could not enter $opennlp_dir to determine OPENNLP_HOME." "Exitting..."
  export OPENNLP_HOME
fi

opennlp_cmd=$OPENNLP_HOME/bin/opennlp
[ -x "$opennlp_cmd" ] ||
  die "Cannot find an executable opennlp launcher at $opennlp_cmd" \
    "Check that OPENNLP_HOME ($OPENNLP_HOME) points to an OpenNLP installation. Exitting..."

mkdir -p "$OPENNLP_HOME/training_data" ||
  die "Could not create $OPENNLP_HOME/training_data" "Exitting..."
sentences_train_file=$OPENNLP_HOME/training_data/mri-sent.train
if [ -f "$sentences_train_file" ]; then
  echo "Removing existing $sentences_train_file"
  rm -- "$sentences_train_file"
fi

# Get just the sentences in the input file (remove the starting number followed
# by a tab from each line), see
# https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
#
# Don't add an extra newline at the end of each sentence (i.e. don't create
# empty lines between sentences in the output), or the SentenceDetector in
# step 3 will do a poor job. (The example at the stackoverflow link may have
# included newlines separating sentences for clarity.)
#   awk -F '\t' '{ print $2 "\n" }' "$infile" > "$sentences_train_file"   # <- don't
awk -F '\t' '{ print $2 }' "$infile" > "$sentences_train_file" ||
  die "Failed to write the training sentences to $sentences_train_file" "Exitting..."

# Note on testing the awk command by hand:
#   echo "100000\tSome sentence." | awk -F "\t" '{ print $2 }'
# puts everything in $1 and nothing in $2, because echo doesn't treat \t as tab
# (or \n as newline and so on) unless the -e flag is passed in. echo also
# doesn't appear to preserve copy-pasted tab characters. This works:
#   echo -e "100000\tSome sentence." | awk -F "\t" '{ print $2 }'

# ---------------------------------------------------------------------------
# 2. Create mri sentences model from training sentences file
# ---------------------------------------------------------------------------

mkdir -p "$OPENNLP_HOME/models" ||
  die "Could not create $OPENNLP_HOME/models" "Exitting..."
model_file=$OPENNLP_HOME/models/mri-sent_trained.bin

# "-lang en" is kept exactly as in the original invocation of the trainer.
"$opennlp_cmd" SentenceDetectorTrainer \
  -model "$model_file" \
  -lang en \
  -data "$sentences_train_file" \
  -encoding UTF-8 ||
  die "SentenceDetectorTrainer failed; no usable model at $model_file" "Exitting..."

# ---------------------------------------------------------------------------
# 3. Let's try to split some Maori language text into sentences using our
#    model created from training on sentences.
#    http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.cli.sentdetect
#    Usage: opennlp SentenceDetector model < sentences
# ---------------------------------------------------------------------------

[ -f "$SAMPLE_TEXT" ] ||
  die "Model trained, but the sample text $SAMPLE_TEXT is missing, so it cannot be tested." \
    "Exitting..."

echo ""
echo "Testing the model."
echo "Running the SentenceDetector with the generated mri-sent_trained.bin model on sample_mri_paragraphs.txt..."
echo "****************************"
"$opennlp_cmd" SentenceDetector "$model_file" < "$SAMPLE_TEXT"
detector_status=$?
echo "****************************"
echo ""

# Report the detector's own status so callers can tell whether the test ran.
exit "$detector_status"