source: gsdl/trunk/perllib/textcat.pm@18463

Last change on this file since 18463 was 17214, checked in by ak19, 16 years ago

Significant changes: 1. Textcat can be restricted to a given encoding when the encoding of a file's contents is known. In this case we are after the language of the contents. 2. Code for working out most frequently occurring encoding has been moved into here from ReadTextFile.pm. 3. Subroutines renamed and unnecessary parameters passed to the various versions of classify() have been moved into set and get methods instead.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
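
The change note above describes the current API: classify() can be restricted to a known encoding, most_frequent_encoding() picks a majority encoding out of a set of textcat results, and the tuning options live behind set_opts()/get_opts(). Below is a minimal, hypothetical usage sketch; it assumes GSDLHOME is set, that $GSDLHOME/perllib is on the Perl include path so the module and its language models under perllib/textcat can be found, and the filename and sample text are made-up values.

    use textcat;

    # construct the classifier (reads the *.lm language models from $GSDLHOME/perllib/textcat)
    my $tc = textcat->new();
    $tc->set_opts(undef, undef, undef, 500);   # e.g. lower the cache limit from the default 1000

    my $contents = "This is a short sample of text whose language we want to guess.";
    my $filename = "doc.txt";                  # illustrative filename used as the cache key

    # classify the file contents; returns a ref to an array of language-model names
    my $results = $tc->classify_contents(\$contents, $filename);

    # pick the encoding that occurs most often among the candidate models, if any does
    my $encoding = $tc->most_frequent_encoding($results);

    if ($encoding ne "") {
        # now that the encoding is known, restrict textcat to it to narrow down the language
        my $languages = $tc->classify_contents_for_encoding(\$contents, $filename, $encoding);
        print "likely languages: ", join(", ", @$languages), "\n";
    }
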
###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in the Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

use strict;

# OPTIONS
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = ();     # map of cached text-strings each to array of char-encodings for the strings themselves
my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
my $MAX_CACHE_SIZE = 1000;

sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
        my %ngram = ();
        my $rang = 1;
        open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
        while (<LM>) {
            chomp;
            # only use lines starting with appropriate character. Others are ignored.
            if (/^[^$non_word_characters]+/o) {
                $self->{'ngrams'}->{$language}->{$&} = $rang++;
            }
        }
        close(LM);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}


# CLASSIFICATION
#
# What language is a text string?
# Input: reference to a text string
# Output: reference to an array of language names
#
# The optional $filter_by_encoding parameter names an encoding to narrow the
# search down to: only language models that belong to that encoding are
# considered, giving a list of the probable languages for that known encoding.
# This is for when we already know the encoding but are still looking for the language.
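# For example (a hypothetical usage sketch; $tc is a textcat object and $text
# holds the text to be identified -- neither name comes from this module, and
# the "utf-8" filter assumes models with a utf8 suffix are installed):
#
#     my $all_langs  = $tc->classify(\$text);            # consider every language model
#     my $utf8_langs = $tc->classify(\$text, "utf-8");    # only models matching that encoding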
sub classify {
    my ($self, $inputref, $filter_by_encoding) = @_;
    my $languages = [];

    # filter language filenames by encoding
    if (defined $filter_by_encoding) {
        # make sure to normalize language and filtering encoding so we are not
        # stuck comparing hyphens with underscores in such things as iso-8859-1
        my $normalized_filter = $filter_by_encoding;
        $normalized_filter =~ s/[\W\_]//g;

        foreach my $lang (@{$self->{'languages'}}) {
            my $normalized_lang = $lang;
            $normalized_lang =~ s/[\W\_]//g;

            if ($normalized_lang =~ m/$normalized_filter/i) {
                push (@$languages, $lang);
            }
        }
    }

    # if the filter_by_encoding wasn't in the list of language model filenames
    # or if we're not filtering, then work with all language model filenames
    if (scalar @$languages == 0) {
        $languages = $self->{'languages'};
    }

    my %results = ();
    my $maxp = $self->{'opt_t'};

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@$languages) {
        # compare language model with input ngrams list
        my ($i, $p) = (0, 0);
        while ($i < scalar (@$unknown)) {
            if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
                $p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
            } else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $a = $results{$results[0]};

    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $a)) {
        @answers = (@answers, shift(@results));
    }

    return \@answers;
}


# Like classify_contents() below, but caches the textcat results on filenames
# for subsequent use. The cache is a map of the filename to the corresponding
# filename_encodings (an array of results returned by textcat of the possible
# filename-encodings for the indexing filename string itself).
# Need to make sure that the filename is only the tailname: no path and no
# extension (no digits), in order to make optimum use of the cached textcat results.
# Textcat is performed on $filename_ref and the results are associated with $filename_ref.
# The cache is cleared when max_cache_size is reached (MAX_CACHE_SIZE by default,
# or whatever was set through set_opts). The cache can also be cleared explicitly
# by a call to clear_filename_cache.
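# A hypothetical example (the caller is assumed to have already reduced the
# filename to its tailname, with the path and extension stripped off):
#
#     my $tailname = "myfile";
#     my $possible_encodings = $tc->classify_cached_filename(\$tailname);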
sub classify_cached_filename {
    my ($self, $filename_ref) = @_;

    # if not already in the cache, work it out and put it there
    if (!defined $filename_cache{$$filename_ref})
    {
        if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filename_cache();
        }
        $filename_cache{$$filename_ref} = $self->classify($filename_ref);
    }

    # return cached array of encodings for the given string
    return $filename_cache{$$filename_ref};
}

# Like classify_cached_filename() above, but caches the textcat results on file
# contents for subsequent use: textcat is run on a file's contents to work out
# the possible encodings of those contents, using the cache.
# The cache is a map of the filename to the array of possible encodings (as
# returned by textcat) for the *contents* of the file.
# Textcat is performed on $contents_ref and the results are associated with $filename.
# The cache is cleared when max_cache_size is reached (MAX_CACHE_SIZE by default,
# or whatever was set through set_opts). The cache can also be cleared explicitly
# by a call to clear_filecontents_cache.
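# A hypothetical illustration of the caching (filename and contents are made-up
# values): the second call below does not run textcat again, it simply returns
# the array ref that was cached for "doc.txt" by the first call.
#
#     my $first  = $tc->classify_contents(\$contents, "doc.txt");   # runs textcat
#     my $second = $tc->classify_contents(\$contents, "doc.txt");   # answered from the cache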
sub classify_contents {
    my ($self, $contents_ref, $filename) = @_;

    # if not already in the cache, work it out and put it there
    if (!defined $filecontents_cache{$filename})
    {
        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filecontents_cache();
        }

        # Finally, we can perform the textcat classification of language and encoding
        $filecontents_cache{$filename} = $self->classify($contents_ref);
    }
    # return cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}


# Given the known encoding for a file's contents, performs a textcat
# filtering on the languages for the given encoding. Results are stored
# in the cache TWICE: once under $filename|$filter_by_encoding, and
# once under the usual $filename, so that subsequent calls to either
# this method or classify_contents using the same filename will not
# perform textcat again.
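# A hypothetical example: if an earlier stage has established that the contents
# of "doc.txt" are ISO-8859-1, the call below only considers language models whose
# names match that encoding, and a later classify_contents($contents_ref, "doc.txt")
# call is answered straight from the cache:
#
#     my $langs = $tc->classify_contents_for_encoding($contents_ref, "doc.txt", "iso_8859_1");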
sub classify_contents_for_encoding {
    my ($self, $contents_ref, $filename, $filter_by_encoding) = @_;

    if (!defined $filecontents_cache{"$filename|$filter_by_encoding"})
    {
        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filecontents_cache();
        }

        $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding);
        # store the same result in the cache again under the $filename entry, so that
        # subsequent calls to classify_contents will find it in the cache already
        # (no need to run textcat a second time)
        $filecontents_cache{$filename} = $filecontents_cache{"$filename|$filter_by_encoding"};
    }
    return $filecontents_cache{$filename};
}


# This method returns the most frequently occurring encoding,
# but only if any encoding occurs more than once in the given results.
# Otherwise, "" is returned.
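# For example (hypothetical model names): given ["en-iso8859_1", "fr-iso8859_1",
# "en-utf8"] the encoding iso8859_1 occurs twice, so "iso8859_1" is returned;
# given ["en-iso8859_1", "fr-utf8", "zh-gb"] every encoding occurs only once,
# so "" is returned.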
sub most_frequent_encoding {
    my ($self, $results) = @_;
    my $best_encoding = "";

    # guessed_encodings is a hashmap of Encoding -> Frequency pairs
    my %guessed_encodings = ();
    foreach my $result (@$results) {
        # Get the encoding portion of a language-model filename like en-iso8859_1
        my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
        next unless defined $encoding; # skip model names without an encoding part
        if (!defined($guessed_encodings{$encoding})) {
            $guessed_encodings{$encoding} = 0;
        }
        $guessed_encodings{$encoding}++;
    }

    $guessed_encodings{""} = -1; # for default best_encoding of ""

    foreach my $enc (keys %guessed_encodings) {
        if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
            $best_encoding = $enc;
        }
    }

    # If the best encoding's frequency is only 1, then every encoding in the
    # results occurred exactly once, so there is no clear winner to report
    # and "" is returned instead.
    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
}


# set some of the specific member variables
sub set_opts {
    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top if defined $opt_top;

    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
}

sub get_opts {
    my $self = shift (@_);
    return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'});
}

# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # assigning a fresh empty hash releases the old entries for garbage collection
    %filename_cache = ();
}

# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    %filecontents_cache = ();
}

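# A small worked example of what create_lm() below counts (hypothetical input):
# for the single word "cat" the padded token is "_cat_", and the ngrams counted are
#     _  c  a  t  _c  ca  at  t_  _ca  cat  at_  _cat  cat_  _cat_
# (with "_" counted twice), i.e. every substring of length 1 to 5 starting at each
# position. Ngrams occurring no more than opt_f times (1 by default) are then
# dropped, and the remaining ones are returned sorted by descending frequency,
# truncated to the opt_t most frequent.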
sub create_lm {
    # Builds the ngram profile for the given text: a hash of ngram -> frequency
    # is filled in from every word in the text, then reduced to a sorted list
    # of the most frequent ngrams.
    my ($self, $textref) = @_;

    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
        $word = "_" . $word . "_";
        my $len = length($word);
        my $flen = $len;
        my $i;

        # count every ngram of length 1 to 5 starting at position $i
        for ($i = 0; $i < $flen; $i++) {
            $ngram->{substr($word,$i,5)}++ if $len > 4;
            $ngram->{substr($word,$i,4)}++ if $len > 3;
            $ngram->{substr($word,$i,3)}++ if $len > 2;
            $ngram->{substr($word,$i,2)}++ if $len > 1;
            $ngram->{substr($word,$i,1)}++;
            $len--;
        }
    }

    # drop rare ngrams (those occurring no more than opt_f times)
    map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; }
    } keys %$ngram;

    # sort the ngrams, and spit out the $opt_t most frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}

1;