[33355] | 1 | #!/bin/bash
|
---|
| 2 |
|
---|
| 3 | #####################################################################################################
|
---|
| 4 | # Following instructions at
|
---|
| 5 | # https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
|
---|
| 6 | # But contrary to their example, don't insert empty lines between the sentences in the input file.
|
---|
| 7 | # Each sentence just needs to be on its own line. But no empty lines between each sentence.
|
---|
| 8 | #####################################################################################################
|
---|
| 9 |
|
---|
| 10 | # 1. Create training file of Maori sentences in the correct format
|
---|
| 11 | # Train this on the 2011 set (rather than the 2017 set) of 100k Maori language sentences
|
---|
| 12 | # because the 2011 one appears to have fewer accidentally incorporated English sentences
|
---|
| 13 |
|
---|
| 14 |
|
---|
# Resolve the file of Maori sentences to train on and store its path in $infile.
#
# Arguments:
#   $1 (optional) - path to a sentences file, or --help to print usage and exit.
#
# Without an argument the Leipzig 2011 corpus file is used. If it is not at its
# expected location it is unpacked from mri-opennlp-corpus.tar.gz; if there is
# no tarball either, it is fetched from the Apache OpenNLP corpus repository
# with svn. The script exits with a non-zero status if none of that works.
default_infile=./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt
corpus_tarball=mri-opennlp-corpus.tar.gz
# To auto-confirm the repeated svn https check, see
# https://serverfault.com/questions/37929/how-do-you-accept-an-ssl-certificate-through-the-svn-command-line
svn_opts=(--trust-server-cert --non-interactive)

if [ -n "${1:-}" ]; then
  if [ "$1" = "--help" ]; then
    echo ""
    echo "Usage: $0 <mri sentences.txt>"
    echo " Try the one in ./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt"
    echo " (Unzip mri-opennlp-corpus.tar.gz)"
    echo " It's better than the 100k sentences from 2017, as that contains a lot of English lines."
    echo ""
    exit 0
  fi
  infile=$1
else
  if [ ! -f "$default_infile" ]; then
    if [ -f "$corpus_tarball" ]; then
      if ! tar -xvzf "$corpus_tarball"; then
        echo "Failed to unpack $corpus_tarball" >&2
        exit 1
      fi
    elif ! command -v svn > /dev/null 2>&1; then
      {
        echo "Attempted to train the Sentence Detector on Maori sentences on missing mri_web_2011_100K-sentences.txt"
        echo "The tarball mri-opennlp-corpus.tar.gz (containing mri_web_2011_100K-sentences.txt) didn't exist either."
        echo "And couldn't get opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt using SVN: no SVN."
        echo "Exitting..."
      } >&2
      exit 1
    else
      # Get just what we need from svn.
      # Based on https://www.apache.org/dist/opennlp/models/langdetect/1.8.3/README.txt
      #
      # Runs in a subshell so the directory changes never leak into the rest of
      # the script, and is chained with && so a failed checkout or cd stops the
      # sequence instead of running the remaining commands in the wrong folder.
      if ! (
        svn co --depth immediates "${svn_opts[@]}" https://svn.apache.org/repos/bigdata/opennlp/trunk opennlp-corpus &&
          cd opennlp-corpus &&
          svn up --set-depth immediates "${svn_opts[@]}" &&
          cd leipzig &&
          svn up --set-depth immediates "${svn_opts[@]}" &&
          cd resources/ &&
          svn up --set-depth infinity "${svn_opts[@]}" &&
          cd ../data &&
          echo "About to download the sample Maori language 100k sentences file. This may take a while (<1 minute)." &&
          svn up "${svn_opts[@]}" mri_web_2011_100K-sentences.txt &&
          cd .. &&
          # Now in opennlp-corpus/leipzig. The chmod is best-effort: the helper
          # script is not used for sentence-detector training.
          { chmod u+x create_langdetect_model.sh || echo "Warning: could not make create_langdetect_model.sh executable" >&2; }
      ); then
        echo "Failed to fetch mri_web_2011_100K-sentences.txt using SVN." >&2
        exit 1
      fi
    fi

    # Whichever route was taken, make sure it actually produced the file.
    if [ ! -f "$default_infile" ]; then
      echo "Still no $default_infile after attempting to obtain it." >&2
      exit 1
    fi
  fi
  infile=$default_infile
fi

printf '%s\n' "$infile"
|
---|
| 67 |
|
---|
| 68 | #tail -100 $infile
|
---|
| 69 |
|
---|
| 70 |
|
---|
# Builds the training file from $infile (set earlier in the script) and trains
# the Maori sentence-detector model from it.
#
# Reads:   $infile, $OPENNLP_HOME (optional; auto-detected when unset)
# Writes:  $OPENNLP_HOME/training_data/mri-sent.train
#          $OPENNLP_HOME/models/mri-sent_trained.bin
# Exports: OPENNLP_HOME, sentences_train_file
# Exits non-zero when OPENNLP_HOME cannot be determined or a step fails.

# Resolve OPENNLP_HOME first: every path below is built from it, so it has to
# be known before the training_data and models folders are created.
if [ -z "${OPENNLP_HOME:-}" ]; then
  echo "OPENNLP_HOME not set, attempting to set it to apache-opennlp-1.9.1 (ENSURE THIS EXISTS OR SET OPENNLP_HOME YOURSELF!)"
  opennlp_dir=
  # A loop rather than `[ -d apache-opennlp-* ]`, which errors out as soon as
  # the glob matches more than one entry (e.g. the folder plus its tarball).
  # The first matching directory wins.
  for candidate in apache-opennlp-*; do
    if [ -d "$candidate" ]; then
      opennlp_dir=$candidate
      break
    fi
  done
  if [ -z "$opennlp_dir" ]; then
    echo "No OPENNLP_HOME set and could not find a subfolder 'apache-opennlp-...' to set it to." >&2
    echo "Set OPENNLP_HOME yourself before running this script. Exitting..." >&2
    exit 1
  fi
  # Absolute path, resolved in a subshell so the working directory is untouched.
  OPENNLP_HOME=$(cd "$opennlp_dir" && pwd) || exit 1
  export OPENNLP_HOME
fi

mkdir -p "$OPENNLP_HOME/training_data" || exit 1
sentences_train_file=$OPENNLP_HOME/training_data/mri-sent.train
if [ -f "$sentences_train_file" ]; then
  echo "Removing existing $sentences_train_file"
  rm -- "$sentences_train_file"
fi

# Get just the sentences in the inputfile (remove the starting number followed by tab from each line)
# see https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
# Don't add an extra newline at end of each sentence (don't insert an empty line between each sentence)


# https://stackoverflow.com/questions/7619438/bash-read-a-file-line-by-line-and-process-each-segment-as-parameters-to-other-p
# how come it removes the first number automatically?
#while read file num sentence
#do
#    echo "$num $sentence" >> $sentences_train_file
#done < "$infile"

# Don't add the extra newline at the end, creating empty lines between sentences output,
# or the SentenceDetector in step 3 will do a poor job.
# (The example at the stackoverflow link may have included newlines separating sentences for clarity.)
#cat $infile | awk -F '\t' '{ print $2 "\n" }' > $sentences_train_file
if ! awk -F '\t' '{ print $2 }' "$infile" > "$sentences_train_file"; then
  echo "Failed to create $sentences_train_file from $infile" >&2
  exit 1
fi

# Why did my way below not work?
# openNLP-lang-detect/opennlp-corpus/leipzig/data>echo "100000\tYWCA Boarding house : ĀwhinaServices and support Kei te pūmanawa o Tāmaki Makaurau a YMCA." | awk -F "\t" '{ print $2 }'
# $1 contains everything and $2 nothing. Why?
# The problem appears to be that echo behaves differently from cat and less. Ask Dr Bainbridge what makes echo different.
# Note that I tried manually inserting \t, after copying the original line with tabspacing had no effect. Still no difference.
# Note 2: echo doesn't appear to preserve copied tab spaces.
# Answer: bash's builtin echo prints backslash escapes literally unless given -e, so
# the "\t" above reaches awk as a backslash followed by a 't', not as a tab, and
# the line has only one field. Use printf '100000\tYWCA ...\n' (or echo -e) instead.
# NOTE(review): the pasted-tab problem in Note 2 is presumably the interactive
# terminal treating a pasted tab as completion rather than echo itself - confirm.


# 2. Create mri sentences model from training sentences file
#$OPENNLP_HOME/bin/opennlp SentenceDetectorTrainer -model mri-sent_trained.bin -lang en -data mri-sent.train -encoding UTF-8

mkdir -p "$OPENNLP_HOME/models" || exit 1

# NOTE(review): "-lang en" is carried over unchanged from the original command;
# whether a Maori language code would be more appropriate here is unverified.
if ! "$OPENNLP_HOME/bin/opennlp" SentenceDetectorTrainer \
  -model "$OPENNLP_HOME/models/mri-sent_trained.bin" \
  -lang en \
  -data "$sentences_train_file" \
  -encoding UTF-8; then
  echo "SentenceDetectorTrainer failed; no usable model was produced." >&2
  exit 1
fi
|
---|
| 122 |
|
---|
| 123 |
|
---|
| 124 | # 3. Let's try to split some Maori language text into sentences using our model created from training on sentences.
|
---|
| 125 | # http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.cli.sentdetect
|
---|
| 126 | # Usage: opennlp SentenceDetector model < sentences
|
---|
# Smoke-test the trained model: feed sample_mri_paragraphs.txt to the
# SentenceDetector on stdin and print the detected sentences between two
# rows of asterisks.
# Reads: $OPENNLP_HOME (must be set), the model written by the training step,
#        and models-trainingdata-and-sampletxts/sample_mri_paragraphs.txt
#        relative to the current directory.

# Abort with a clear message instead of silently trying to run /bin/opennlp
# when OPENNLP_HOME is empty.
: "${OPENNLP_HOME:?OPENNLP_HOME must be set to the Apache OpenNLP install folder}"

echo ""
echo "Testing the model."
echo "Running the SentenceDetector with the generated mri-sent_trained.bin model on sample_mri_paragraphs.txt..."
echo "****************************"
# Paths are quoted so an OPENNLP_HOME containing spaces is passed as one word.
"$OPENNLP_HOME/bin/opennlp" SentenceDetector "$OPENNLP_HOME/models/mri-sent_trained.bin" \
  < models-trainingdata-and-sampletxts/sample_mri_paragraphs.txt
echo "****************************"
echo ""
|
---|