###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

use strict;

# OPTIONS
# Directory containing the per-language n-gram model files (*.lm).
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1; # Ngrams which occur <= this number of times are removed
my $opt_t = 400; # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Characters treated as word separators when building n-grams
# (digits and whitespace); also used to filter model-file lines.
my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = (); # map of cached text-strings each to array of char-encodings for the strings themselves
my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
my $MAX_CACHE_SIZE = 1000; # default eviction threshold for both caches

# Constructor.  Loads every readable language model (*.lm) found in
# $model_dir and records, per language, a map from n-gram to rank.
#
# Arguments (all optional): frequency floor, top-ngram count and
# "how much worse" factor, overriding the module defaults $opt_f,
# $opt_t and $opt_u respectively.
# Dies if the model directory cannot be read or contains no models.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    # (lexical handle + explicit error check; was a bareword DIR handle)
    opendir my $dir, $model_dir or die "directory $model_dir: $!\n";
    # Strip the trailing ".lm" in place (anchored: the old unanchored
    # s/\.lm// could strip an interior ".lm") and keep readable models.
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($dir));
    closedir $dir;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
	my $rang = 1;
	# 3-arg open with a lexical handle (was 2-arg bareword LM).
	open my $lm, '<', "$model_dir/$language.lm"
	    or die "cannot open $language.lm: $!\n";
	while (<$lm>) {
	    chomp;
	    # only use lines starting with appropriate character. Others are ignored.
	    if (/^[^$non_word_characters]+/o) {
		# $& holds the matched n-gram; rank = line order in the model.
		$self->{'ngrams'}->{$language}->{$&} = $rang++;
	    }
	}
	close($lm);
    }

    $self->{'languages'} = \@languages;

    # Per-instance options: caller-supplied values win over defaults.
    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}

# CLASSIFICATION
#
# What language is a text string?
# Input:  reference to a text string (plus optional overrides for the
#         frequency floor, worse-result factor and top-ngram count --
#         note these overrides persist on $self for later calls).
# Output: reference to an array of language names, best match first,
#         containing every language scoring within opt_u of the best.

sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top) = @_;

    $self->{'opt_f'} = $opt_freq   if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top    if defined $opt_top;

    my %results = ();
    my $maxp = $self->{'opt_t'};   # penalty for an n-gram absent from a model

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

	# compare language model with input ngrams list:
	# distance = sum over input ngrams of |model rank - input rank|,
	# with a fixed penalty $maxp for ngrams the model doesn't know.
	my $model = $self->{'ngrams'}->{$language};   # hoisted deep lookup
	my $p = 0;
	for my $i (0 .. $#$unknown) {
	    if (defined $model->{$unknown->[$i]}) {
		$p += abs($model->{$unknown->[$i]} - $i);
	    } else {
		$p += $maxp;
	    }
	}
	$results{$language} = $p;
    }

    # Rank languages by ascending distance (lower = better match).
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    # Renamed from "my $a": a lexical named $a shadows sort's package
    # variable $a and would silently break any later sort in this scope.
    my $best = $results{$results[0]};

    # Keep the winner plus every language within a factor opt_u of it.
    my @answers = (shift @results);
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $best)) {
	push @answers, shift @results;
    }

    return \@answers;
}

# Same as classify, but memoises textcat results per filename string.
# The cache maps each filename string to the array of possible
# filename-encodings that textcat returned for that string itself.
# Callers should pass only the tailname (no path, no extension/digits)
# to get the best hit rate.  Textcat runs on $filename_ref and the
# result is stored under $$filename_ref.
# The whole cache is dropped once it reaches max_cache_size (defaults
# to MAX_CACHE_SIZE, overridable via the last parameter), or explicitly
# via clear_filename_cache.
sub classify_cached_filename {
    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    my $key = $$filename_ref;
    unless (defined $filename_cache{$key}) {
	# Cache miss: evict everything if we're at capacity, then
	# compute and remember the classification.
	$self->clear_filename_cache()
	    if scalar(keys %filename_cache) >= $self->{'max_cache_size'};
	$filename_cache{$key} =
	    $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # cached array of encodings for the given string
    return $filename_cache{$key};
}

# Same as classify, but memoises textcat results per filename, keyed on
# the file's *contents*: the cache maps $filename to the array of
# possible encodings textcat returned for $contents_ref.
# The whole cache is dropped once it reaches max_cache_size (defaults
# to MAX_CACHE_SIZE, overridable via the last parameter), or explicitly
# via clear_filecontents_cache.
sub classify_cached_filecontents {
    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    unless (defined $filecontents_cache{$filename}) {
	# Cache miss: evict everything if we're at capacity, then
	# classify the contents and file the result under $filename.
	$self->clear_filecontents_cache()
	    if scalar(keys %filecontents_cache) >= $self->{'max_cache_size'};
	$filecontents_cache{$filename} =
	    $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}

# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # Assigning the empty list releases all keys and values.  The old
    # "%filename_cache = undef" was a bug: it raised "Odd number of
    # elements in hash assignment" under warnings and briefly created
    # a bogus '' => undef entry before the real reset.
    %filename_cache = ();
}

# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # Assigning the empty list releases all keys and values.  The old
    # "%filecontents_cache = undef" was a bug: it raised "Odd number of
    # elements in hash assignment" under warnings and briefly created
    # a bogus '' => undef entry before the real reset.
    %filecontents_cache = ();
}

# Build the n-gram language model for a piece of text.
# Input:  $textref -- reference to the text string.
# Output: reference to an array of n-grams (lengths 1..5, words padded
#         with '_' on both sides), most frequent first, truncated to
#         the opt_t most frequent after dropping those occurring
#         opt_f times or fewer.
sub create_lm {
    my ($self, $textref) = @_;

    # $ngram maps each n-gram to its occurrence count.
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	# Pad with '_' so word boundaries become part of the n-grams.
	$word = "_" . $word . "_";
	my $flen = length($word);
	my $len = $flen;

	for (my $i = 0; $i < $flen; $i++) {
	    # $len shrinks each step so no n-gram runs past the word end.
	    $ngram->{substr($word, $i, 5)}++ if $len > 4;
	    $ngram->{substr($word, $i, 4)}++ if $len > 3;
	    $ngram->{substr($word, $i, 3)}++ if $len > 2;
	    $ngram->{substr($word, $i, 2)}++ if $len > 1;
	    $ngram->{substr($word, $i, 1)}++;
	    $len--;
	}
    }

    # Drop rare n-grams (count <= opt_f).  A hash-slice delete replaces
    # the old map-in-void-context, which abused map for side effects.
    delete @{$ngram}{ grep { $ngram->{$_} <= $self->{'opt_f'} } keys %$ngram };

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}

1; # a Perl module must end by returning a true value