source: gsdl/trunk/perllib/textcat.pm@ 16554

Last change on this file since 16554 was 16554, checked in by ak19, 13 years ago

Added subroutines classify_cached and clear_cache. The first of these is called by BasePlugin's filepath_to_utf8 subroutine. Textcat now caches results when called with classify_cached() which can be useful for small strings (like filenames) so that textcat need not be performed repeatedly on the same strings.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1###########################################################################
2#
3# textcat.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
11# into a package for use in the Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
32package textcat;
33
34use strict;
35
# DEFAULT SETTINGS

# Directory that new() scans for "<language>.lm" model files.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

# Classification defaults. new() copies them into each object unless the
# caller supplies replacements:
#   $opt_f - ngrams which occur <= this number of times are removed
#   $opt_t - topmost number of ngrams that should be used
#   $opt_u - how much worse a result must be before it is ignored
my ($opt_f, $opt_t, $opt_u) = (1, 400, 1.05);

# Characters treated as word separators; interpolated into regex character
# classes, hence written in character-class syntax.
my $non_word_characters = '0-9\s';

# Caching related: map of cached text-strings, each to a reference to the
# array of results computed for it. $MAX_CACHE_SIZE is the default number of
# entries at which classify_cached() empties the cache.
my %cache;
my $MAX_CACHE_SIZE = 1000;
48
# Constructor: textcat->new([$opt_f [, $opt_t [, $opt_u]]])
#
# Scans $model_dir for readable files named "<language>.lm" and loads each
# one. For every language the object records the rank of each ngram, where
# rank 1 belongs to the first usable line of the model file.
#
# Arguments (all optional; undef selects the module default):
#   $tmp_f - ngrams which occur <= this number of times are removed
#   $tmp_t - topmost number of ngrams that should be used
#   $tmp_u - how much worse a result must be before it is ignored
#
# Returns the new blessed object. Dies if the model directory cannot be
# opened, if it holds no readable language models, or if a model file cannot
# be opened.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    opendir(my $dir_handle, $model_dir) or die "directory $model_dir: $!\n";
    my @languages = ();
    foreach my $entry (readdir($dir_handle)) {
        # Strip the extension from a copy. The pattern is anchored so that
        # only a real ".lm" suffix counts; "fr.lmx" must not become "frx".
        my $language = $entry;
        next unless $language =~ s/\.lm\z//;
        next unless -r "$model_dir/$language.lm";
        push(@languages, $language);
    }
    closedir($dir_handle);
    @languages = sort(@languages);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
        my $rank = 1;
        open(my $lm_handle, '<', "$model_dir/$language.lm")
            or die "cannot open $language.lm: $!\n";
        while (my $line = <$lm_handle>) {
            chomp($line);
            # only use lines starting with appropriate character. Others are
            # ignored. The leading run of word characters is the ngram; a
            # capture group is used rather than $& to avoid its regex penalty.
            if ($line =~ /^([^$non_word_characters]+)/o) {
                $self->{'ngrams'}->{$language}->{$1} = $rank++;
            }
        }
        close($lm_handle);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
86
87
88
# CLASSIFICATION
#
# What language is a text string?
#
# Input:  $inputref   - reference to the text string
#         $opt_freq   - optional replacement for opt_f
#         $opt_factor - optional replacement for opt_u
#         $opt_top    - optional replacement for opt_t
#         A defined replacement is stored on the object, so it also applies
#         to later calls that do not pass one.
# Output: reference to an array of language names, best match first. It
#         holds the best-scoring language followed by every language whose
#         score is less than opt_u times the best score.
#
# A language's score is the sum, over the ngrams of the input, of the
# distance between the ngram's position in the input list and its rank in
# the language model; an ngram missing from the model costs opt_t. Lower is
# better. Equal scores are ordered by language name so that the result does
# not depend on hash iteration order.

sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_;

    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top if defined $opt_top;

    my %score_of = ();
    my $maxp = $self->{'opt_t'};

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {
        # compare language model with input ngrams list
        my $model = $self->{'ngrams'}->{$language} || {};
        my $penalty = 0;
        foreach my $i (0 .. $#$unknown) {
            my $rank = $model->{$unknown->[$i]};
            $penalty += defined($rank) ? abs($rank - $i) : $maxp;
        }
        $score_of{$language} = $penalty;
    }

    my @ranked = sort { $score_of{$a} <=> $score_of{$b} or $a cmp $b }
                 keys %score_of;

    # Named $best_score rather than $a, which would shadow sort's $a.
    my $best_score = $score_of{$ranked[0]};
    my $cutoff = $self->{'opt_u'} * $best_score;

    my @answers = (shift(@ranked));
    while (@ranked && $score_of{$ranked[0]} < $cutoff) {
        push(@answers, shift(@ranked));
    }

    return \@answers;
}
133
# Classifies a text string like classify(), but caches the results for
# subsequent use. Use this method for short strings (such as filenames)
# rather than huge text files.
#
# Input:  $inputref, $opt_freq, $opt_factor, $opt_top - passed on to
#         classify() when the result is not cached yet
#         $max_size_of_cache - optional; when defined it replaces this
#         object's max_cache_size (MAX_CACHE_SIZE by default)
# Output: the cached reference to the array of results for the string. The
#         same reference is returned on every hit.
#
# The cache is a single map shared by all textcat objects. It is emptied when
# a new entry is needed and it already holds max_cache_size entries; it can
# also be emptied by a call to clear_cache.
#
# Entries are keyed on the effective opt_f/opt_u/opt_t values together with
# the string, so a call with different settings is never answered with a
# result computed under other settings.
sub classify_cached {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # The settings classify() would use for this call: an explicit argument
    # wins, otherwise the value currently stored on the object.
    my $effective_f = defined($opt_freq)   ? $opt_freq   : $self->{'opt_f'};
    my $effective_u = defined($opt_factor) ? $opt_factor : $self->{'opt_u'};
    my $effective_t = defined($opt_top)    ? $opt_top    : $self->{'opt_t'};
    my @settings = map { defined($_) ? $_ : '' }
                   ($effective_f, $effective_u, $effective_t);

    # The numeric settings cannot contain "\0", and the text comes last, so
    # distinct (settings, text) combinations never share a key.
    my $cache_key = join("\0", @settings, $$inputref);

    # if not already in the cache, work it out and put it there
    if (!defined $cache{$cache_key}) {
        if (scalar (keys %cache) >= $self->{'max_cache_size'}) {
            $self->clear_cache();
        }
        $cache{$cache_key} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top);
    }

    # return cached array of results for the given string
    return $cache{$cache_key};
}
162
# Clears the cache (a map of strings to the textcat results for each string).
# The cache is shared by all textcat objects, so this empties it for all of
# them. Takes no arguments besides the object.
sub clear_cache {
    my $self = shift (@_);

    # Assigning the empty list removes every entry. Note that assigning undef
    # to a hash does not empty it: it stores a single ('' => undef) pair.
    %cache = ();
}
170
# Builds the ngram profile of a text.
#
# Input:  $textref - reference to the text string
# Output: reference to an array of ngrams, most frequent first, holding at
#         most opt_t entries. Ngrams occurring opt_f times or fewer are
#         left out.
#
# The text is split into words on runs of non-word characters. Each word is
# wrapped in "_" markers, and every substring of length 1 to 5 of the
# wrapped word is counted.
sub create_lm {
    my ($self, $textref) = @_;

    # ngram => number of occurrences
    my %count_of;

    foreach my $token (split(/[$non_word_characters]+/, $$textref)) {
        my $padded = "_" . $token . "_";
        my $total = length($padded);

        foreach my $start (0 .. $total - 1) {
            # Never run past the end of the word: take sizes up to 5, or up
            # to what is left when fewer than 5 characters remain.
            my $remaining = $total - $start;
            my $longest = $remaining < 5 ? $remaining : 5;

            # Longest first, so ngrams enter the hash in the established order.
            foreach my $size (reverse(1 .. $longest)) {
                $count_of{substr($padded, $start, $size)}++;
            }
        }
    }

    # drop the ngrams that are not frequent enough
    my @too_rare = grep { $count_of{$_} <= $self->{'opt_f'} } keys %count_of;
    delete @count_of{@too_rare};

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @by_frequency = sort { $count_of{$b} <=> $count_of{$a} } keys %count_of;
    my $limit = $self->{'opt_t'};
    if (@by_frequency > $limit) {
        splice(@by_frequency, $limit);
    }
    return \@by_frequency;
}
204
2051;
Note: See TracBrowser for help on using the repository browser.