Context Navigation

source: gs3-extensions/maori-lang-detection/gen_SentenceDetection_model.sh@ 33357

Last change on this file since 33357 was 33357, checked in by ak19, 5 years ago
Minor changes
Property svn:executable set to `*`
File size: 6.2 KB

Rev	Line
#!/bin/bash

#####################################################################################################
# Trains an Apache OpenNLP sentence-detection model on a file of Maori sentences
# and then runs the generated model over a sample text as a quick check.
#
# Usage: gen_SentenceDetection_model.sh [--help | <mri sentences.txt>]
#
# Arguments:
#   $1  Optional. Path to a Leipzig-style sentences file (each line is
#       "<number><TAB><sentence>"). When omitted, the script uses
#       ./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt, unpacking it
#       from mri-opennlp-corpus.tar.gz or fetching it with svn if it is missing.
#
# Environment:
#   OPENNLP_HOME  Root of the Apache OpenNLP installation. When unset, the first
#                 ./apache-opennlp-*/ directory found in the current directory is used.
#
# Outputs:
#   $OPENNLP_HOME/training_data/mri-sent.train   (training sentences, one per line)
#   $OPENNLP_HOME/models/mri-sent_trained.bin    (trained sentence-detector model)
#
# Exit status: 0 on success or --help, 1 on any setup/training failure.
#
# Following instructions at
# https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
# But contrary to their example, don't insert empty newlines separating the sentences input file
# Each sentence just needs to be on its own line. But no empty lines between each sentence.
#####################################################################################################

# Treat references to unset variables as errors, so a misspelt variable name
# cannot silently expand to nothing.
set -u

readonly CORPUS_TARBALL=mri-opennlp-corpus.tar.gz
readonly CORPUS_SVN_URL=https://svn.apache.org/repos/bigdata/opennlp/trunk
readonly DEFAULT_INFILE=./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt
readonly SAMPLE_TEXT=models-trainingdata-and-sampletxts/sample_mri_paragraphs.txt

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the usage text to stdout.
#######################################
print_usage() {
  echo ""
  echo "Usage: $0 <mri sentences.txt>"
  echo "       Try the one in ./opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt"
  echo "       (Unzip mri-opennlp-corpus.tar.gz)"
  echo "       It's better than the 100k sentences from 2017, as that contains a lot of English lines."
  echo ""
}

#######################################
# Sparse-checkout just the parts of the OpenNLP corpus repository that are
# needed, ending with the 2011 Maori 100k sentences file.
# Based on https://www.apache.org/dist/opennlp/models/langdetect/1.8.3/README.txt
# To auto-confirm repeated svn https check, see
# https://serverfault.com/questions/37929/how-do-you-accept-an-ssl-certificate-through-the-svn-command-line
# Returns: 0 on success, non-zero if any checkout/update step fails
#######################################
fetch_corpus_via_svn() {
  local -a svn_opts=(--trust-server-cert --non-interactive)

  svn co --depth immediates "${svn_opts[@]}" "$CORPUS_SVN_URL" opennlp-corpus || return 1

  # Run the directory walk in a subshell: the caller's working directory is
  # left untouched no matter where a step fails.
  (
    cd opennlp-corpus || exit 1
    svn up --set-depth immediates "${svn_opts[@]}" || exit 1
    cd leipzig || exit 1
    svn up --set-depth immediates "${svn_opts[@]}" || exit 1
    cd resources/ || exit 1
    svn up --set-depth infinity "${svn_opts[@]}" || exit 1
    cd ../data || exit 1
    echo "About to download the sample Maori language 100k sentences file. This may take a while (<1 minute)."
    svn up "${svn_opts[@]}" mri_web_2011_100K-sentences.txt || exit 1
    cd .. || exit 1
    # in opennlp-corpus/leipzig
    # Best effort: this helper script is not needed for sentence-model training.
    chmod u+x create_langdetect_model.sh \
      || echo "Warning: could not make create_langdetect_model.sh executable." >&2
  )
}

#######################################
# Make sure the default training corpus exists at DEFAULT_INFILE.
# Check the file exists at the expected location. If not, check we have the
# tarball containing it and untar that. If the tarball doesn't exist, try to get
# what we want from svn if we have svn. If no svn either, bail.
# Returns: 0 when the file is available; exits with status 1 otherwise
#######################################
ensure_default_corpus() {
  if [[ -f "$DEFAULT_INFILE" ]]; then
    return 0
  fi

  if [[ -f "$CORPUS_TARBALL" ]]; then
    tar -xvzf "$CORPUS_TARBALL" || die "Failed to unpack $CORPUS_TARBALL"
  elif command -v svn > /dev/null 2>&1; then
    # Get just what we need from svn
    fetch_corpus_via_svn || die "Failed to fetch the Maori sentences corpus from $CORPUS_SVN_URL"
  else
    {
      echo "Attempted to train the Sentence Detector on Maori sentences on missing mri_web_2011_100K-sentences.txt"
      echo "The tarball mri-opennlp-corpus.tar.gz (containing mri_web_2011_100K-sentences.txt) didn't exist either."
      echo "And couldn't get opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt using SVN: no SVN."
      echo "Exitting..."
    } >&2
    exit 1
  fi

  [[ -f "$DEFAULT_INFILE" ]] || die "Still cannot find $DEFAULT_INFILE after trying to obtain it."
}

#######################################
# Ensure OPENNLP_HOME is set and exported, falling back to the first
# ./apache-opennlp-*/ directory in the current directory.
# Globals: OPENNLP_HOME (read; written and exported when auto-detected)
# Returns: 0 when OPENNLP_HOME is usable; exits with status 1 otherwise
#######################################
resolve_opennlp_home() {
  if [[ -n "${OPENNLP_HOME:-}" ]]; then
    return 0
  fi

  echo "OPENNLP_HOME not set, attempting to set it to apache-opennlp-1.9.1 (ENSURE THIS EXISTS OR SET OPENNLP_HOME YOURSELF!)"

  # The trailing slash restricts the glob to directories, so a downloaded
  # apache-opennlp-*.tar.gz sitting next to the unpacked folder is ignored.
  local candidate
  for candidate in apache-opennlp-*/; do
    if [[ -d "$candidate" ]]; then
      OPENNLP_HOME=$(cd "$candidate" && pwd) || die "Cannot enter $candidate"
      export OPENNLP_HOME
      return 0
    fi
  done

  {
    echo "No OPENNLP_HOME set and could not find a subfolder 'apache-opennlp-...' to set it to."
    echo "Set OPENNLP_HOME yourself before running this script. Exitting..."
  } >&2
  exit 1
}

#######################################
# Entry point: pick the input file, build the training file, train the model
# and run it over the sample text.
# Arguments: $1 - optional "--help" or path to the sentences file
#######################################
main() {
  local infile

  # 1. Create training file of Maori sentences in the correct format
  # Train this on the 2011 set (rather than 2017) set of 100k Maori language sentences
  # because the 2011 one appears to have fewer accidentally incorporated English sentences
  if [[ -n "${1:-}" ]]; then
    if [[ "$1" == "--help" ]]; then
      print_usage
      exit 0
    fi
    infile=$1
  else
    # use the file opennlp-corpus/leipzig/data/mri_web_2011_100K-sentences.txt as inputfile
    ensure_default_corpus
    infile=$DEFAULT_INFILE
  fi

  if [[ ! -r "$infile" || -d "$infile" ]]; then
    die "Cannot read input sentences file: $infile"
  fi

  echo "$infile"

  #tail -100 $infile

  # OPENNLP_HOME must be known before any path below is derived from it;
  # otherwise the training data would be written under the filesystem root.
  resolve_opennlp_home

  mkdir -p "$OPENNLP_HOME/training_data" || die "Cannot create $OPENNLP_HOME/training_data"
  local sentences_train_file=$OPENNLP_HOME/training_data/mri-sent.train
  if [[ -f "$sentences_train_file" ]]; then
    echo "Removing existing $sentences_train_file"
    rm -- "$sentences_train_file" || die "Cannot remove $sentences_train_file"
  fi

  # Get just the sentences in the inputfile (remove the starting number followed by tab from each line)
  # see https://stackoverflow.com/questions/36516363/sentence-detection-with-opennlp
  # Don't add an extra newline at end of each sentence (don't insert an empty line between each sentence)

  # https://stackoverflow.com/questions/7619438/bash-read-a-file-line-by-line-and-process-each-segment-as-parameters-to-other-p
  # how come it removes the first number automatically?
  #while read file num sentence
  #do
  #    echo "$num $sentence" >> $sentences_train_file
  #done < "$infile"

  # Don't add the extra newline at the end, creating empty lines between sentences output,
  # or the SentenceDetector in step 3 will do a poor job.
  # (The example at the stackoverflow link may have included newlines separating sentences for clarity.)
  #cat $infile | awk -F '\t' '{ print $2 "\n" }' > $sentences_train_file
  awk -F '\t' '{ print $2 }' "$infile" > "$sentences_train_file" \
    || die "Failed to write training sentences to $sentences_train_file"

  # Why did my way below not work?
  # openNLP-lang-detect/opennlp-corpus/leipzig/data>echo "100000\tYWCA Boarding house : ĀwhinaServices and support Kei te pūmanawa o Tāmaki Makaurau a YMCA." | awk -F "\t" '{ print $2 }'
  # $1 contains everything and $2 nothing. Why?
  # The problem appears to be that echo behaves differently from cat and less. Ask Dr Bainbridge what makes echo different.
  # Note that I tried manually inserting \t, after copying the original line with tabspacing had no effect. Still no difference.
  # Note 2: echo doesn't appear to preserve copied tab spaces.
  # Answer: bash's builtin echo prints "\t" literally unless invoked as "echo -e", so awk
  # never receives a tab character. printf '100000\tsome sentence\n' | awk -F '\t' '{ print $2 }'
  # behaves as expected.

  # 2. Create mri sentences model from training sentences file
  #$OPENNLP_HOME/bin/opennlp SentenceDetectorTrainer -model mri-sent_trained.bin -lang en -data mri-sent.train -encoding UTF-8

  mkdir -p "$OPENNLP_HOME/models" || die "Cannot create $OPENNLP_HOME/models"
  local model_file=$OPENNLP_HOME/models/mri-sent_trained.bin

  # NOTE(review): "-lang en" is kept from the original command even though the
  # data is Maori - confirm whether that is intentional.
  "$OPENNLP_HOME/bin/opennlp" SentenceDetectorTrainer -model "$model_file" -lang en -data "$sentences_train_file" -encoding UTF-8 \
    || die "SentenceDetectorTrainer failed; no model was generated."

  # 3. Let's try to split some Maori language text into sentences using our model created from training on sentences.
  # http://opennlp.apache.org/docs/1.9.1/manual/opennlp.html#tools.cli.sentdetect
  # Usage: opennlp SentenceDetector model < sentences
  echo ""
  echo "Testing the model."
  echo "Running the SentenceDetector with the generated mri-sent_trained.bin model on sample_mri_paragraphs.txt..."
  echo "****************************"
  "$OPENNLP_HOME/bin/opennlp" SentenceDetector "$model_file" < "$SAMPLE_TEXT"
  echo "****************************"
  echo ""
}

main "$@"

Note: See TracBrowser for help on using the repository browser.

Download in other formats: