###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in Greenstone digital library system.  Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
| 31 |
|
---|
package textcat;

use strict;
use warnings;
#use Benchmark;

# OPTIONS
#
# Directory holding the "<language>.lm" language-model files.
# GSDLHOME may be unset outside a Greenstone installation, so fall back
# to the empty string rather than concatenating undef (which would
# trigger an uninitialized-value warning).
my $model_dir =
    (defined $ENV{'GSDLHOME'} ? $ENV{'GSDLHOME'} : '') . "/perllib/textcat";

my $opt_f = 1;     # ngrams which occur <= this number of times are removed
my $opt_t = 400;   # topmost number of ngrams that should be used
my $opt_u = 1.05;  # how much worse a result must be before it is ignored

# Character class of word separators (digits and whitespace).
my $non_word_characters = '0-9\s';
|
---|
# CLASSIFICATION
#
# What language is a text string?
# Input:  text string
# Output: array of language names, best match first.  More than one
#         name is returned when a runner-up's score is within a factor
#         of $opt_u of the best score.

sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;   # penalty for an ngram absent from a model

    # Scan the model directory to find which languages are supported:
    # every readable "<language>.lm" file contributes one candidate.
    # The substitution is anchored so only the ".lm" extension is
    # stripped, not an accidental ".lm" elsewhere in the name.
    opendir my $dir, $model_dir or die "directory $model_dir: $!\n";
    my @languages =
        sort grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($dir);
    closedir $dir;
    @languages
        or die "sorry, can't read any language models from $model_dir\n"
             . "language models must reside in files with .lm ending\n";

    # Create the ngram profile for the input text.  Note that the count
    # hash built inside create_lm is not needed here; only the ranked
    # ngram list is used.
    my @unknown = create_lm($input);

    # Load each model and score it against the input profile.
    # my $t1 = new Benchmark;
    foreach my $language (@languages) {
        # Load the language model: ngram => rank (1-based, most
        # frequent first).  Lexical handle and 3-arg open avoid the
        # bareword/2-arg open pitfalls.
        my %ngram = ();
        my $rank  = 1;
        open my $lm, '<', "$model_dir/$language.lm"
            or die "cannot open $language.lm: $!\n";
        while (<$lm>) {
            chomp;
            # Only use lines starting with an ngram character; others
            # are ignored.
            if (/^[^$non_word_characters]+/o) {
                $ngram{$&} = $rank++;
            }
        }
        close($lm);
        #print STDERR "loaded language model $language\n" if $opt_v;

        # Compare the language model with the input ngram list: sum of
        # rank distances, with a fixed penalty ($maxp) for each input
        # ngram missing from the model.  Lower is better.
        my ($i, $p) = (0, 0);
        while ($i < @unknown) {
            if ($ngram{$unknown[$i]}) {
                $p += abs($ngram{$unknown[$i]} - $i);
            }
            else {
                $p += $maxp;
            }
            ++$i;
        }
        #print STDERR "$language: $p\n" if $opt_v;

        $results{$language} = $p;
    }

    # Rank languages by ascending score.  ($best replaces the
    # original's "my $a", which shadowed sort's package variable $a.)
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $best = $results{$results[0]};

    # Keep the winner plus every runner-up within $opt_u * $best.
    my @answers = (shift @results);
    while (@results && $results{$results[0]} < ($opt_u * $best)) {
        push @answers, shift @results;
    }

    return @answers;
}
|
---|
| 117 |
|
---|
| 118 |
|
---|
| 119 |
|
---|
# Create an ngram profile for a piece of text.
#
# Input:  the text as the first argument; optionally a hash reference
#         as the second argument, which receives the raw ngram counts
#         (autovivified when the caller passes none, as classify does).
# Output: list of the (at most $opt_t) most frequent ngrams, most
#         frequent first, singletons (count <= $opt_f) removed.

sub create_lm {
    # my $t1 = new Benchmark;
    my $ngram;
    ($_, $ngram) = @_;   # $ngram: reference to the count hash we build

    # Count every 1..5-gram of every word, with "_" marking word
    # boundaries.  $len shrinks as the window approaches the end of the
    # word so that only ngrams that still fit are counted.
    foreach my $word (split("[$non_word_characters]+")) {
        $word = "_" . $word . "_";
        my $len  = length($word);
        my $flen = $len;
        for (my $i = 0; $i < $flen; $i++) {
            $$ngram{substr($word, $i, 5)}++ if $len > 4;
            $$ngram{substr($word, $i, 4)}++ if $len > 3;
            $$ngram{substr($word, $i, 3)}++ if $len > 2;
            $$ngram{substr($word, $i, 2)}++ if $len > 1;
            $$ngram{substr($word, $i, 1)}++;
            $len--;
        }
    }
    ###print "@{[%$ngram]}";

    # As suggested by Karel P. de Vos, [email protected], we speed up
    # sorting by removing singletons.  (A plain loop over a keys
    # snapshot replaces the original void-context map.)
    foreach my $key (keys %$ngram) {
        delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # Sort the ngrams and return the $opt_t most frequent ones.
    # Adding "or $a cmp $b" in the sort block would give a unique
    # ordering but makes sorting about five times slower.
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if @sorted > $opt_t;
    return @sorted;
}
|
---|
1;   # a Perl module must end with a true value