source: gsdl/trunk/perllib/textcat.pm @ 17110

Last change on this file since 17110 was 16674, checked in by ak19, 16 years ago

Added caching for textcat results on filecontents as well: a second map now stores the mapping from filenames to the cached textcat results on the contents of those files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

use strict;

# OPTIONS
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = ();     # maps cached filename strings to the textcat results (arrays of candidate encodings) for those strings
my %filecontents_cache = (); # maps cached filenames to the textcat results (arrays of candidate encodings) for the contents of those files
my $MAX_CACHE_SIZE = 1000;

sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
        my %ngram = ();
        my $rang = 1;
        open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
        while (<LM>) {
            chomp;
            # only use lines starting with appropriate character. Others are ignored.
            if (/^[^$non_word_characters]+/o) {
                $self->{'ngrams'}->{$language}->{$&} = $rang++;
            }
        }
        close(LM);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
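
# Example usage (a sketch for illustration, not part of this module): the
# constructor takes three optional positional arguments that override the
# defaults declared above -- the ngram frequency threshold ($opt_f), the
# number of top ngrams to keep ($opt_t), and the tolerance factor ($opt_u).
# It assumes $ENV{'GSDLHOME'} is set so the .lm language models can be found.
#
#   use textcat;
#   my $tc = textcat->new();                    # defaults: 1, 400, 1.05
#   my $strict_tc = textcat->new(2, 300, 1.02); # prune rarer ngrams, accept fewer near-ties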


# CLASSIFICATION
#
# What language is a text string?
# Input: reference to a text string
# Output: reference to an array of the best-matching language model names,
#         best match first

sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top) = @_;

    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top if defined $opt_top;

    my %results = ();
    my $maxp = $self->{'opt_t'};

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

        # compare language model with input ngrams list
        my ($i, $p) = (0, 0);
        while ($i < scalar (@$unknown)) {
            if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
                $p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
            } else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $a = $results{$results[0]};

    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $a)) {
        @answers = (@answers, shift(@results));
    }

    return \@answers;
}
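
# Example usage (a sketch for illustration, not part of this module):
# classify() takes a reference to the text and returns a reference to an
# array of model names, best match first; more than one name is returned
# when other models score within the opt_u tolerance of the best score.
#
#   my $tc = textcat->new();
#   my $text = "The quick brown fox jumps over the lazy dog";
#   my $candidates = $tc->classify(\$text);
#   print "best guess: $candidates->[0]\n";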

# Same as classify above, but caches the textcat results on filename strings
# for subsequent use. The cache maps each filename string to the array of
# possible encodings that textcat returned for that string.
# The caller should pass only the tailname (no path, no extension, no digits)
# to make the best use of the cache.
# Textcat is performed on $filename_ref and the results are stored against
# the string it references.
# The cache is cleared when it reaches max_cache_size, which defaults to
# MAX_CACHE_SIZE but can be overridden via the parameter. It can also be
# cleared explicitly with a call to clear_filename_cache.
sub classify_cached_filename {
    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # if not already in the cache, work it out and put it there
    if (!defined $filename_cache{$$filename_ref})
    {
        if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filename_cache();
        }
        $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # return cached array of encodings for the given string
    return $filename_cache{$$filename_ref};
}
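
# Example usage (a sketch for illustration, not part of this module): repeated
# calls with the same tailname are answered from the cache rather than by
# re-running textcat.
#
#   my $tc = textcat->new();
#   my $tailname = "chapter";   # tailname only: no path, no extension, no digits
#   my $encodings = $tc->classify_cached_filename(\$tailname);
#   my $again = $tc->classify_cached_filename(\$tailname);   # cache hit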

# Same as above, but caches the textcat results on file contents for
# subsequent use. The cache maps each filename to the array of possible
# encodings that textcat returned for the *contents* of that file.
# Textcat is performed on $contents_ref and the results are stored against
# $filename.
# The cache is cleared when it reaches max_cache_size, which defaults to
# MAX_CACHE_SIZE but can be overridden via the parameter. It can also be
# cleared explicitly with a call to clear_filecontents_cache.
sub classify_cached_filecontents {
    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # if not already in the cache, work it out and put it there
    if (!defined $filecontents_cache{$filename})
    {
        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filecontents_cache();
        }
        $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # return cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}
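
# Example usage (a sketch for illustration, not part of this module; the
# filename and the file-slurping code are hypothetical): the file's contents
# are classified once and the result is cached against its filename.
#
#   my $tc = textcat->new();
#   my $filename = "document.txt";
#   open(my $fh, "<", $filename) or die "cannot open $filename: $!";
#   my $contents = do { local $/; <$fh> };
#   close($fh);
#   my $encodings = $tc->classify_cached_filecontents(\$contents, $filename);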

# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # assigning a fresh empty list releases the old entries
    %filename_cache = ();
}

# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # assigning a fresh empty list releases the old entries
    %filecontents_cache = ();
}

sub create_lm {
    # Builds a hash of ngram counts from the ngrams found in each word of the
    # text, then returns a reference to the list of the most frequent ngrams.
    my ($self, $textref) = @_;

    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
        $word = "_" . $word . "_";
        my $len = length($word);
        my $flen = $len;
        my $i;

        for ($i = 0; $i < $flen; $i++) {
            $ngram->{substr($word,$i,5)}++ if $len > 4;
            $ngram->{substr($word,$i,4)}++ if $len > 3;
            $ngram->{substr($word,$i,3)}++ if $len > 2;
            $ngram->{substr($word,$i,2)}++ if $len > 1;
            $ngram->{substr($word,$i,1)}++;
            $len--;
        }
    }

    # remove ngrams that occur too rarely to be informative
    map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; }
        } keys %$ngram;

    # sort the ngrams, and spit out the $opt_t most frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}
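
# Worked example (a sketch for illustration): for the single word "text",
# create_lm pads it to "_text_" and counts, at each starting position, the
# substrings of length 1 to 5 that fit before the end of the word, e.g.
# "_", "_t", "_te", "_tex", "_text", then "t", "te", "tex", "text", "text_",
# and so on down to the final "_". The returned list holds these ngrams
# ordered by decreasing frequency, truncated to the opt_t most frequent.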

1;