Context Navigation

source: gsdl/trunk/perllib/textcat.pm@ 16554

Last change on this file since 16554 was 16554, checked in by ak19, 16 years ago
Added subroutines classify_cached and clear_cache. The first of these is called by BasePlugin's filepath_to_utf8 subroutine. Textcat now caches results when called with classify_cached() which can be useful for small strings (like filenames) so that textcat need not be performed repeatedly on the same strings.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 6.6 KB

Line
1	###########################################################################
2	#
3	# textcat.pm -- Identify the language of a piece of text
4	#
5	#
6	# This file is based on TextCat version 1.08 by Gertjan van Noord
7	# Copyright (C) 1997 Gertjan van Noord ([email protected])
8	# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9	#
10	# It was modified by Gordon Paynter ([email protected]) and turned
11	# into a package for use in Greenstone digital library system. Most of
12	# the modifications consist of commenting out or deleting functionality
13	# I don't need.
14	#
15	#
16	# This program is free software; you can redistribute it and/or modify
17	# it under the terms of the GNU General Public License as published by
18	# the Free Software Foundation; either version 2 of the License, or
19	# (at your option) any later version.
20	#
21	# This program is distributed in the hope that it will be useful,
22	# but WITHOUT ANY WARRANTY; without even the implied warranty of
23	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24	# GNU General Public License for more details.
25	#
26	# You should have received a copy of the GNU General Public License
27	# along with this program; if not, write to the Free Software
28	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29	#
30	###########################################################################
31
32	package textcat;
33
34	use strict;
35
# OPTIONS
# Directory that holds the language model (*.lm) files read by new().
my $model_dir = "$ENV{'GSDLHOME'}/perllib/textcat";

# Default tunables; new() uses them when no override is passed in.
my $opt_f = 1;       # ngrams which occur <= this number of times are removed
my $opt_t = 400;     # topmost number of ngrams that should be used
my $opt_u = 1.05;    # how much worse a result must be before it is ignored

# Body of the character class (digits and whitespace) used both to split
# text into words and to recognise usable lines in the model files.
my $non_word_characters = '0-9\s';

# caching related
# File-level map of cached text-strings, each to the array ref of results
# that classify() produced for it.
my %cache;
my $MAX_CACHE_SIZE = 1000;    # default number of entries that triggers a flush
48
# Constructor.
#
# Finds every readable "<language>.lm" file in $model_dir and loads it: each
# usable line contributes its leading ngram, which is stored with its rank
# (1 for the first usable line, 2 for the next, and so on).
#
# Input:  optional overrides for opt_f, opt_t and opt_u, in that order;
#         an undefined value falls back to the file-level default.
# Output: blessed object holding 'languages', 'ngrams', the three options
#         and 'max_cache_size'.
# Dies if the model directory cannot be opened, if it contains no readable
# language models, or if a model file cannot be opened.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    opendir(my $dir_handle, $model_dir) or die "directory $model_dir: $!\n";
    # Only names that END in ".lm" are models; the capture is the language name.
    my @candidates = map { /\A(.+)\.lm\z/ ? $1 : () } readdir($dir_handle);
    closedir($dir_handle);

    my @languages = sort(grep { -r "$model_dir/$_.lm" } @candidates);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # The pattern is identical for every line of every model, so build it once.
    # A capture group is used instead of $& to avoid its program-wide cost.
    my $leading_ngram = qr/^([^$non_word_characters]+)/;

    # load model and count for each language.
    foreach my $language (@languages) {
        my $rang = 1;
        # Three-argument open: the file name can never be read as a mode.
        open(my $lm_handle, '<', "$model_dir/$language.lm")
            or die "cannot open $language.lm: $!\n";
        # A lexical line variable keeps the caller's $_ intact.
        while (my $line = <$lm_handle>) {
            chomp $line;
            # only use lines starting with appropriate character. Others are ignored.
            if ($line =~ $leading_ngram) {
                $self->{'ngrams'}->{$language}->{$1} = $rang++;
            }
        }
        close($lm_handle);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
86
87
88
# CLASSIFICATION
#
# What language is a text string?
# Input:  reference to the text string, followed by optional per-call values
#         for opt_f, opt_u and opt_t (in that order).  A value that is passed
#         is stored on the object, so it stays in effect for later calls.
# Output: reference to an array of language names, best match first.  The
#         best language is always included; each following language is added
#         while its score is below opt_u times the best score.  Languages
#         with equal scores are ordered by name, so the answer is repeatable.
#         The array is empty when the object knows no languages.

sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_;

    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top if defined $opt_top;

    my %score_of = ();
    # Penalty charged for an input ngram the language model does not contain.
    my $maxp = $self->{'opt_t'};

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {
        # Look the model up once per language instead of once per ngram.
        my $model = $self->{'ngrams'}->{$language} || {};

        # compare language model with input ngrams list: sum, over the input
        # ngrams, of how far each one's rank is from its rank in the model.
        my $distance = 0;
        foreach my $i (0 .. $#{$unknown}) {
            my $rank = $model->{$unknown->[$i]};
            $distance += defined($rank) ? abs($rank - $i) : $maxp;
        }
        $score_of{$language} = $distance;
    }

    # Lowest distance first.  The name comparison only decides ties, which
    # would otherwise come out in hash iteration order.
    my @ranked = sort { $score_of{$a} <=> $score_of{$b} or $a cmp $b }
                 keys %score_of;
    return [] unless @ranked;

    my $cutoff = $self->{'opt_u'} * $score_of{$ranked[0]};

    my @answers = (shift(@ranked));
    while (@ranked && $score_of{$ranked[0]} < $cutoff) {
        push(@answers, shift(@ranked));
    }

    return \@answers;
}
133
# Same as classify, but caches textcat results for subsequent use.
# The cache maps a string, together with the option values in effect for the
# call, to the array of results that classify returned for it.  The options
# are part of the key because they change the result: without them a call
# with different options would be answered from an entry computed under
# other settings.
# Use this method for short strings (such as filenames) rather than huge text
# files. The cache will be cleared when the max_cache_size is reached, which
# is MAX_CACHE_SIZE by default or can be specified as a parameter (the value
# passed is stored on the object). The cache can also be cleared by a call
# to clear_cache.
# Input:  reference to the text string; optional opt_f, opt_u and opt_t as
#         for classify; optional maximum number of cache entries.
# Output: reference to the cached array of language names.  The same array
#         is handed to every caller asking for the same entry, so callers
#         should treat it as read-only.
sub classify_cached {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # Options classify would use for this call: the explicit argument when
    # given, otherwise the value currently stored on the object.
    my @effective = (
        defined $opt_freq   ? $opt_freq   : $self->{'opt_f'},
        defined $opt_factor ? $opt_factor : $self->{'opt_u'},
        defined $opt_top    ? $opt_top    : $self->{'opt_t'},
    );
    # The options come first and are separated by NUL, so text that itself
    # contains NUL characters cannot be mistaken for a different option set.
    my $cache_key = join("\0", (map { defined $_ ? $_ : '' } @effective), $$inputref);

    # if not already in the cache, work it out and put it there
    if (!defined $cache{$cache_key}) {
        if (scalar (keys %cache) >= $self->{'max_cache_size'}) {
            $self->clear_cache();
        }
        ## print STDERR "$$inputref is not yet in the cache\n";
        $cache{$cache_key} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top);
    }
    ## else { print STDERR "$$inputref is already in the cache\n"; }

    ## print STDERR "Count of elements in cache is now: ".scalar (keys %cache)."\n";

    # return cached array of encodings for the given string
    return $cache{$cache_key};
}
162
# Clears the cache (a map of strings to the textcat results for each string).
# The cache is file-level state, so this empties it for every textcat object.
# Input:  none besides the object itself.
# Output: nothing.
sub clear_cache {
    my $self = shift (@_);

    # undef on the hash itself empties it and lets perl release its storage.
    # (Assigning undef to the hash would instead store a single entry whose
    # key is the empty string.)
    undef %cache;
    return;
}
170
# Builds the ngram profile of a text.
# Input:  reference to the text string.
# Output: reference to an array of ngrams ordered by falling frequency,
#         cut down to at most opt_t entries.  Ngrams that occur opt_f times
#         or fewer are left out.
sub create_lm {
    my ($self, $textref) = @_;

    # ngram => number of times it was seen
    my %count_of;

    foreach my $token (split(/[$non_word_characters]+/, $$textref)) {
        # Underscores mark the start and the end of the word.
        my $padded = "_" . $token . "_";
        my $total = length($padded);

        foreach my $start (0 .. $total - 1) {
            # Count every ngram of 5 down to 1 characters that still fits
            # between this position and the end of the padded word.
            my $left = $total - $start;
            my $longest = $left < 5 ? $left : 5;
            for (my $size = $longest; $size >= 1; $size--) {
                $count_of{substr($padded, $start, $size)}++;
            }
        }
    }

    # Throw away the ngrams that are too rare.
    my $min_count = $self->{'opt_f'};
    my @too_rare = grep { $count_of{$_} <= $min_count } keys %count_of;
    delete @count_of{@too_rare};

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @ranked = sort { $count_of{$b} <=> $count_of{$a} } keys %count_of;
    my $limit = $self->{'opt_t'};
    splice(@ranked, $limit) if (@ranked > $limit);
    return \@ranked;
}
204
205	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: