root/gs3-extensions/maori-lang-detection/gen_SentenceDetection_model.sh @ 33377

Revision 33377, 6.6 KB (checked in by ak19, 7 weeks ago)

Changes so that gen_SentenceDetection_model.sh, now located in bin/script, can still be run from the top-level directory of this extension.

  • Property svn:executable set to *
Line 
1#!/bin/bash
2
3#####################################################################################################
4# Following instructions at
5# https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
6# But contrary to their example, don't insert empty newlines separating the sentences input file
7# Each sentence just needs to be on its own line. But no empty lines between each sentence.
8#####################################################################################################
9
10# 1. Create training file of Maori sentences in the correct format
# Train this on the 2011 set (rather than the 2017 set) of 100k Maori language sentences
12# because the 2011 one appears to have fewer accidentally incorporated English sentences
13
14
15# Need to run this script from the top level folder of this extension
16
# Choose the input sentences file and store its path in $infile.
#   $1 (optional): path to a sentences file, or --help to print usage and exit.
# With no argument, the 2011 Leipzig Maori 100k sentences file is used. If it
# is not present yet, it is unpacked from mri-opennlp-corpus.tar.gz or, failing
# that, fetched from the Apache OpenNLP corpus SVN repository.
# All paths are relative: run this from the top level folder of this extension.
if [ -n "${1:-}" ]; then
    if [ "$1" = "--help" ]; then
        echo ""
        echo "Usage: $0 <mri sentences.txt>"
        echo "   Try the one in ./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt"
        echo "   (Unzip mri-opennlp-corpus.tar.gz)"
        echo "   It's better than the 100k sentences from 2017, as that contains a lot of English lines."
        echo ""
        exit 0
    fi
    infile=$1
else
    default_infile=opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt
    if [ ! -f "$default_infile" ]; then
        if [ -f mri-opennlp-corpus.tar.gz ]; then
            tar -xvzf mri-opennlp-corpus.tar.gz
        elif ! command -v svn > /dev/null 2>&1; then
            echo "Attempted to train the Sentence Detector on Maori sentences on missing mri_web_2011_100K-sentences.txt" >&2
            echo "The tarball mri-opennlp-corpus.tar.gz (containing mri_web_2011_100K-sentences.txt) didn't exist either." >&2
            echo "And couldn't get opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt using SVN: no SVN." >&2
            echo "Exitting..." >&2
            exit 1
        else
            # Get just what we need from svn
            # Based on https://www.apache.org/dist/opennlp/models/langdetect/1.8.3/README.txt
            # To auto-confirm repeated svn https check, see https://serverfault.com/questions/37929/how-do-you-accept-an-ssl-certificate-through-the-svn-command-line
            #
            # Run the checkout in a subshell: the directory changes stay local to it,
            # and a failure part-way through cannot leave the rest of the script
            # running from the wrong working directory.
            (
                svn co --depth immediates --trust-server-cert --non-interactive https://svn.apache.org/repos/bigdata/opennlp/trunk opennlp-corpus || exit 1
                cd opennlp-corpus || exit 1
                svn up --set-depth immediates --trust-server-cert --non-interactive || exit 1
                cd leipzig || exit 1
                svn up --set-depth immediates --trust-server-cert --non-interactive || exit 1
                cd resources || exit 1
                svn up --set-depth infinity --trust-server-cert --non-interactive || exit 1
                cd ../data || exit 1
                echo "About to download the sample Maori language 100k sentences file. This may take a while (<1 minute)."
                svn up --trust-server-cert --non-interactive mri_web_2011_100K-sentences.txt || exit 1
                cd .. || exit 1
                # in opennlp-corpus/leipzig
                # Best effort only: nothing later in this script runs create_langdetect_model.sh.
                chmod u+x create_langdetect_model.sh || true
            ) || {
                echo "Failed to fetch $default_infile using SVN. Exitting..." >&2
                exit 1
            }
        fi
    fi
    infile=./$default_infile
fi

# Stop here rather than training on a missing file.
if [ ! -f "$infile" ]; then
    echo "Input sentences file not found: $infile" >&2
    exit 1
fi

echo "$infile"
69
70#tail -100 $infile
71
# Ensure OPENNLP_HOME is set.
# When it is unset or empty, fall back to the first 'apache-opennlp-*' folder
# found in the current directory and export its absolute path.
if [ -z "${OPENNLP_HOME:-}" ]; then
    echo "OPENNLP_HOME not set, attempting to set it to the local apache-opennlp (v1.9.1). ENSURE THIS EXISTS OR SET OPENNLP_HOME YOURSELF!"
    OPENNLP_HOME=
    # The trailing slash restricts the glob to directories, so a distribution
    # archive sitting next to the unpacked folder cannot break the lookup.
    for opennlp_candidate in apache-opennlp-*/; do
        if [ -d "$opennlp_candidate" ]; then
            # Resolve to an absolute path without changing this script's own directory.
            OPENNLP_HOME=$(cd "$opennlp_candidate" && pwd)
            break
        fi
    done
    if [ -z "$OPENNLP_HOME" ]; then
        echo "No OPENNLP_HOME set and could not find a subfolder 'apache-opennlp-...' to set it to." >&2
        echo "Set OPENNLP_HOME yourself before running this script. Exitting..." >&2
        exit 1
    fi
    export OPENNLP_HOME
fi
86
# Delete the file named by $1 if it exists as a regular file, announcing the
# removal on stdout. Does nothing (and succeeds) when the file is absent.
remove_stale_file() {
    if [ -f "$1" ]; then
        echo "Removing existing $1"
        rm -- "$1"
    fi
}

# The training sentences live under $OPENNLP_HOME/training_data. Start from a
# clean slate so that no output from an earlier run is left behind.
mkdir -p "$OPENNLP_HOME/training_data"
sentences_train_file=$OPENNLP_HOME/training_data/mri-sent.train
remove_stale_file "$sentences_train_file"
93
94# Get just the sentences in the inputfile (remove the starting number followed by tab from each line)
95# see https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
96# Don't add an extra newline at end of each sentence (don't insert an empty line between each sentence)
97
98
99# https://stackoverflow.com/questions/7619438/bash-read-a-file-line-by-line-and-process-each-segment-as-parameters-to-other-p
100# how come it removes the first number automatically?
101#while read file num sentence
102#do
103#    echo "$num $sentence" >> $sentences_train_file
104#done < "$infile"
105
# Print the second tab-separated field (the sentence) of every line of the
# file named by $1, one sentence per output line.
# Don't add an extra newline after each sentence: empty lines between the
# sentences make the SentenceDetector in step 3 do a poor job.
# (The example at the stackoverflow link may have included newlines separating sentences for clarity.)
#cat $infile | awk -F '\t' '{ print $2 "\n" }' > $sentences_train_file
extract_sentences() {
    # Feed the file on stdin so that an unusual file name is never
    # interpreted by awk as an option or a variable assignment.
    awk -F '\t' '{ print $2 }' < "$1"
}

extract_sentences "$infile" > "$sentences_train_file"
111
112# Why did my way below not work?
#    openNLP-lang-detect/opennlp-corpus/leipzig/data>echo "100000\tYWCA Boarding house : ĀwhinaServices and support Kei te pūmanawa o Tāmaki Makaurau a YMCA." | awk -F "\t" '{ print $2 }'
114# $1 contains everything and $2 nothing. Why?
115# The problem appears to be that echo behaves differently from cat and less. Ask Dr Bainbridge what makes echo different.
116# Note that I tried manually inserting \t, after copying the original line with tabspacing had no effect. Still no difference.
117# Note 2: echo doesn't appear to preserve copied tab spaces.
118# Answer: echo doesn't treat \n as newline and \t as tab and so on, unless the -e flag is passed in:
119# echo -e "100000\tYWCA Boarding house : ĀwhinaServices and support Kei te pūmanawa o Tāmaki Makaurau a YMCA." | awk -F "\t" '{ print $2 }'
120
121# 2. Create mri sentences model from training sentences file
122#$OPENNLP_HOME/bin/opennlp SentenceDetectorTrainer -model mri-sent_trained.bin -lang en -data mri-sent.train -encoding UTF-8
123
124
# Train the sentence detector on the prepared sentences file and write the
# resulting model to $OPENNLP_HOME/models/mri-sent_trained.bin.
# NOTE(review): the model is trained with '-lang en' although the data is
# Maori; presumably carried over from the example this is based on - confirm.
models_dir=$OPENNLP_HOME/models
mkdir -p "$models_dir"

# Stop if training fails, so that the test step below never runs against a
# missing or stale model.
if ! "$OPENNLP_HOME/bin/opennlp" SentenceDetectorTrainer \
        -model "$models_dir/mri-sent_trained.bin" \
        -lang en \
        -data "$sentences_train_file" \
        -encoding UTF-8; then
    echo "Training the sentence detection model failed. Exitting..." >&2
    exit 1
fi
128
129
130# 3. Let's try to split some Maori language text into sentences using our model created from training on sentences.
131# http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.cli.sentdetect
132#    Usage: opennlp SentenceDetector model < sentences
# Smoke-test the freshly trained model: feed the sample Maori paragraphs to the
# SentenceDetector on stdin and print its output between two separator lines.
# The sample file path is relative to the top level folder of this extension.
echo ""
echo "Testing the model."
echo "Running the SentenceDetector with the generated mri-sent_trained.bin model on sample_mri_paragraphs.txt..."
echo "****************************"
"$OPENNLP_HOME/bin/opennlp" SentenceDetector "$OPENNLP_HOME/models/mri-sent_trained.bin" \
    < models-trainingdata-and-sampletxts/sample_mri_paragraphs.txt
echo "****************************"
echo ""
140
Note: See TracBrowser for help on using the browser.