Context Navigation

source: main/tags/2.71/gsdl/perllib/textcat.pm@ 21105

Last change on this file since 21105 was 2235, checked in by sjboddie, 23 years ago
Hacked the textcat package about so that it only reads all the language models once (instead of reading them in before processing each document). Fairly significant speed improvements, as you'd expect.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 4.5 KB

Line
1	###########################################################################
2	#
3	# textcat.pm -- Identify the language of a piece of text
4	#
5	#
6	# This file is based on TextCat version 1.08 by Gertjan van Noord
7	# Copyright (C) 1997 Gertjan van Noord ([email protected])
8	# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9	#
10	# It was modified by Gordon Paynter ([email protected]) and turned
11	# into a package for use in Greenstone digital library system. Most of
12	# the modifications consist of commenting out or deleting functionality
13	# I don't need.
14	#
15	#
16	# This program is free software; you can redistribute it and/or modify
17	# it under the terms of the GNU General Public License as published by
18	# the Free Software Foundation; either version 2 of the License, or
19	# (at your option) any later version.
20	#
21	# This program is distributed in the hope that it will be useful,
22	# but WITHOUT ANY WARRANTY; without even the implied warranty of
23	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24	# GNU General Public License for more details.
25	#
26	# You should have received a copy of the GNU General Public License
27	# along with this program; if not, write to the Free Software
28	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29	#
30	###########################################################################
31
32	package textcat;
33
34	# OPTIONS
35	my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";
36
37	my $opt_f = 1; # Ngrams which occur <= this number of times are removed
38	my $opt_t = 400; # topmost number of ngrams that should be used
39	my $opt_u = 1.05; # how much worse result must be before it is ignored
40
41	my $non_word_characters = '0-9\s';
42
43	sub new {
44	my $class = shift (@_);
45	my $self = {};
46
47	# open directory to find which languages are supported
48	opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
49	my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
50	closedir DIR;
51	@languages or die "sorry, can't read any language models from $model_dir\n" .
52	"language models must reside in files with .lm ending\n";
53
54	# load model and count for each language.
55	foreach my $language (@languages) {
56	my %ngram=();
57	my $rang=1;
58	open(LM, "$model_dir/$language.lm") \|\| die "cannot open $language.lm: $!\n";
59	while (<LM>) {
60	chomp;
61	# only use lines starting with appropriate character. Others are ignored.
62	if (/^[^$non_word_characters]+/o) {
63	$self->{'ngrams'}->{$language}->{$&} = $rang++;
64	}
65	}
66	close(LM);
67	}
68
69	$self->{'languages'} = \@languages;
70	return bless $self, $class;
71	}
72
73
74
75	# CLASSIFICATION
76	#
77	# What language is a text string?
78	# Input: text string
79	# Output: array of language names
80
81	sub classify {
82	my ($self, $inputref)=@_;
83	my %results = ();
84	my $maxp = $opt_t;
85
86	# create ngrams for input.
87	my $unknown = $self->create_lm($inputref);
88
89	foreach my $language (@{$self->{'languages'}}) {
90
91	# compare language model with input ngrams list
92	my ($i,$p)=(0,0);
93	while ($i < scalar (@$unknown)) {
94	if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
95	$p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
96	} else {
97	$p=$p+$maxp;
98	}
99	++$i;
100	}
101	$results{$language} = $p;
102	}
103
104	my @results = sort { $results{$a} <=> $results{$b} } keys %results;
105	my $a = $results{$results[0]};
106
107	my @answers=(shift(@results));
108	while (@results && $results{$results[0]} < ($opt_u *$a)) {
109	@answers=(@answers,shift(@results));
110	}
111
112	return \@answers;
113	}
114
115	sub create_lm {
116	# $ngram contains reference to the hash we build
117	# then add the ngrams found in each word in the hash
118	my ($self, $textref) = @_;
119
120	my $ngram = {};
121
122	foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
123	$word = "_" . $word . "_";
124	my $len = length($word);
125	my $flen=$len;
126	my $i;
127
128	for ($i=0; $i<$flen; $i++) {
129	$ngram->{substr($word,$i,5)}++ if $len > 4;
130	$ngram->{substr($word,$i,4)}++ if $len > 3;
131	$ngram->{substr($word,$i,3)}++ if $len > 2;
132	$ngram->{substr($word,$i,2)}++ if $len > 1;
133	$ngram->{substr($word,$i,1)}++;
134	$len--;
135	}
136	}
137
138	map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; }
139	} keys %$ngram;
140
141	# sort the ngrams, and spit out the $opt_t frequent ones.
142	# adding `or $a cmp $b' in the sort block makes sorting five
143	# times slower..., although it would be somewhat nicer (unique result)
144	my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
145	splice(@sorted,$opt_t) if (@sorted > $opt_t);
146	return \@sorted;
147	}
148
149	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: