###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in the Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
| 31 |
|
---|
package textcat;

use strict;

# OPTIONS (defaults; per-instance values live in $self — see new/set_opts)
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # ngrams occurring <= this many times are discarded
my $opt_t = 400;  # number of top-ranked ngrams kept per model
my $opt_u = 1.05; # tolerance: how much worse a score may be before it is ignored

# characters that never take part in an ngram (digits and whitespace)
my $non_word_characters = '0-9\s';

# caching related
my %filename_cache     = (); # text-string -> array of candidate encodings for the string itself
my %filecontents_cache = (); # filename -> array of candidate encodings for the file's contents
my $MAX_CACHE_SIZE = 1000;
---|
# Constructor. Optional arguments override the default values of
# opt_f (frequency threshold), opt_t (top ngram count) and opt_u
# (score tolerance factor). Dies if no language model files can be read.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # Open the model directory to find out which languages are supported.
    # A language is supported when a readable "<language>.lm" file exists.
    # (3-arg open/lexical handles replace the old bareword DIR/LM handles.)
    opendir(my $model_dh, $model_dir) or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir($model_dh);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Load the ngram model for each language: each ngram is mapped to its
    # rank within that language's model file (1 = most frequent ngram).
    foreach my $language (@languages) {
	my $rang = 1;
	open(my $lm_fh, '<', "$model_dir/$language.lm")
	    || die "cannot open $language.lm: $!\n";
	while (my $line = <$lm_fh>) {
	    chomp $line;
	    # only use lines starting with an ngram character; others are
	    # ignored.  A capture group replaces the old use of $&, which
	    # imposes a global performance penalty on all regex matches.
	    if ($line =~ /^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rang++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
| 87 |
|
---|
| 88 |
|
---|
# CLASSIFICATION
#
# What language is a text string?
# Input: reference to a text string
# Output: reference to an array of language-model names, best match first
# $filter_by_encoding, when given, narrows the set of language models
# considered down to those whose filename matches the known encoding, in
# order to obtain a list of the probable languages for that encoding.
# This is for when we already know the encoding but not the language.
sub classify {
    my ($self, $inputref, $filter_by_encoding) = @_;
    # start with an empty arrayref (the old "my $languages; @$languages = ()"
    # only worked through autovivification of an undef reference)
    my $languages = [];

    # filter language filenames by encoding
    if (defined $filter_by_encoding) {
	# make sure to normalize language and filtering encoding so we are not
	# stuck comparing hyphens with underscores in such things as iso-8859-1
	my $normalized_filter = $filter_by_encoding;
	$normalized_filter =~ s/[\W\_]//g;

	foreach my $lang (@{$self->{'languages'}}) {
	    my $normalized_lang = $lang;
	    $normalized_lang =~ s/[\W\_]//g;

	    # \Q..\E guards against regex metacharacters in the filter string
	    if ($normalized_lang =~ m/\Q$normalized_filter\E/i) {
		push (@$languages, $lang);
	    }
	}
    }

    # if the filter_by_encoding wasn't in the list of language model filenames
    # or if we're not filtering, then work with all language model filenames
    if (scalar @$languages == 0) {
	$languages = $self->{'languages'};
    }

    my %results = ();
    my $maxp = $self->{'opt_t'};  # penalty for an ngram absent from a model

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@$languages) {
	# compare language model with input ngrams list; the score is the
	# summed rank distance over all input ngrams (lower is better)
	my ($i, $p) = (0, 0);
	while ($i < scalar (@$unknown)) {
	    if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
		$p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}
	$results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    # renamed from "my $a", which shadowed sort's package variable $a
    my $best_score = $results{$results[0]};

    # keep every language whose score is within opt_u times the best score
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $best_score)) {
	push(@answers, shift(@results));
    }

    return \@answers;
}
| 157 |
|
---|
[17214] | 158 |
|
---|
# Same as below, but caches textcat results on filenames for subsequent use.
# The cache is a map of the filename to the corresponding filename_encodings
# (an array of results returned by textcat of the possible filename-encodings
# for the indexing filename string itself).
# Need to make sure that the filename is only the tailname: no path and no
# extension (no digits), in order to make optimum use of the cached textcat.
# Textcat is performed on $filename_ref and the results associated with $filename_ref.
# The cache will be cleared when the max_cache_size is reached, which is
# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
# can also be cleared by a call to clear_filename_cache.
sub classify_cached_filename {
    my ($self, $filename_ref) = @_;

    my $key = $$filename_ref;

    # on a cache miss, run textcat and remember the answer
    unless (defined $filename_cache{$key}) {
	# evict everything once the cache has grown too large
	$self->clear_filename_cache()
	    if scalar(keys %filename_cache) >= $self->{'max_cache_size'};

	$filename_cache{$key} = $self->classify($filename_ref);
    }

    # cached array of encodings for the given string
    return $filename_cache{$key};
}
---|
| 184 |
|
---|
# Same as above, but caches textcat results on filecontents for subsequent use.
# Textcat on a file's contents to work out its possible encodings. Uses the cache.
# The cache is a map of the filename to an array of possible filename_encodings
# for the *contents* of the file returned by textcat.
# Textcat is performed on $contents_ref and the results associated with $filename.
# The cache will be cleared when the max_cache_size is reached, which is
# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
# can also be cleared by a call to clear_filecontents_cache.
sub classify_contents {
    my ($self, $contents_ref, $filename) = @_;

    # on a cache miss, classify the contents and remember the answer
    unless (defined $filecontents_cache{$filename}) {
	# evict everything once the cache has grown too large
	$self->clear_filecontents_cache()
	    if scalar(keys %filecontents_cache) >= $self->{'max_cache_size'};

	# perform the textcat classification of language and encoding
	$filecontents_cache{$filename} = $self->classify($contents_ref);
    }

    # cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}
[16554] | 209 |
|
---|
[17214] | 210 |
|
---|
# Given the known encoding for a file's contents, performs a textcat
# filtering on the languages for the given encoding. Results are stored
# in the cache TWICE: once under $filename|$filter_by_encoding, and
# once under the usual $filename, so that subsequent calls to either
# this method or classify_contents using the same filename will not
# perform textcat again.
sub classify_contents_for_encoding {
    my ($self, $contents_ref, $filename, $filter_by_encoding) = @_;

    my $cache_key = "$filename|$filter_by_encoding";

    if (!defined $filecontents_cache{$cache_key})
    {
	if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
	    $self->clear_filecontents_cache();
	}

	# Classify ONCE and store the result under both keys: the original
	# code ran the expensive classification twice for the same answer.
	# The second entry lets subsequent calls to classify_contents find
	# the result in the cache already.
	my $result = $self->classify($contents_ref, $filter_by_encoding);
	$filecontents_cache{$cache_key} = $result;
	$filecontents_cache{$filename}  = $result;
    }
    return $filecontents_cache{$filename};
}
| 233 |
|
---|
| 234 |
|
---|
# This method returns the most frequently occurring encoding,
# but only if any encoding occurs more than once in the given results.
# Otherwise, "" is returned.
sub most_frequent_encoding {
    my ($self, $results) = @_;
    my $best_encoding = "";

    # guessed_encodings is a hashmap of Encoding -> Frequency pairs
    my %guessed_encodings = ();
    foreach my $result (@$results) {
	# Get the encoding portion of a language-model filename like en-iso8859_1
	my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
	# Skip results that don't follow the <language>-<encoding> naming
	# scheme: previously an undef $encoding was used as a hash key
	# (warning under "use warnings"), and that "" bucket was clobbered
	# by the -1 sentinel below anyway, so this is behavior-identical.
	next unless defined $encoding;
	$guessed_encodings{$encoding}++;
    }

    $guessed_encodings{""} = -1; # sentinel for the default best_encoding of ""

    foreach my $enc (keys %guessed_encodings) {
	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
	    $best_encoding = $enc;
	}
    }

    # If best_encoding's frequency == 1, then the frequency for all encodings will
    # be 1 since the sum total of all frequencies is num_results: if any encoding
    # has frequency > 1 (it's possibly the best_encoding), one or more of the others
    # would have been at 0 frequency to compensate.
    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
}
| 267 |
|
---|
| 268 |
|
---|
# Setter for the per-instance tuning options. Any argument left undef
# keeps the current value: opt_freq -> opt_f, opt_factor -> opt_u,
# opt_top -> opt_t, max_size_of_cache -> max_cache_size.
sub set_opts {
    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    my %updates = (
	'opt_f'          => $opt_freq,
	'opt_u'          => $opt_factor,
	'opt_t'          => $opt_top,
	'max_cache_size' => $max_size_of_cache,
    );

    # only overwrite a member when a defined value was supplied for it
    while (my ($member, $value) = each %updates) {
	$self->{$member} = $value if defined $value;
    }
}
| 279 |
|
---|
# Getter returning the tuning options as a flat list, in the same order
# set_opts documents them: (opt_f, opt_u, opt_t, max_cache_size).
sub get_opts {
    my ($self) = @_;
    return map { $self->{$_} } ('opt_f', 'opt_u', 'opt_t', 'max_cache_size');
}
| 284 |
|
---|
# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # Emptying the hash releases its contents for garbage collection.
    # (The previous "%filename_cache = undef;" raised an "Odd number of
    # elements" warning and was immediately overwritten — removed.)
    %filename_cache = ();
}
| 292 |
|
---|
# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # Emptying the hash releases its contents for garbage collection.
    # (The previous "%filecontents_cache = undef;" raised an "Odd number
    # of elements" warning and was immediately overwritten — removed.)
    %filecontents_cache = ();
}
| 300 |
|
---|
# Build an ngram frequency model from the text referenced by $textref and
# return a reference to the list of the opt_t most frequent ngrams, most
# frequent first. Every word is padded with '_' so word boundaries take
# part in the ngrams; all ngrams of length 1..5 are counted.
sub create_lm {
    my ($self, $textref) = @_;

    # $ngram maps each ngram string to its occurrence count
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);

	foreach my $start (0 .. $len - 1) {
	    # characters remaining from this position; an ngram of size k
	    # is counted iff at least k characters remain (this replaces
	    # the original decrement-$len trick with the same effect)
	    my $remaining = $len - $start;
	    foreach my $size (1 .. 5) {
		last if $size > $remaining;
		$ngram->{substr($word, $start, $size)}++;
	    }
	}
    }

    # remove ngrams at or below the frequency threshold opt_f
    # (explicit loop instead of the original map-in-void-context)
    foreach my $key (keys %$ngram) {
	delete $ngram->{$key} if $ngram->{$key} <= $self->{'opt_f'};
    }

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}

1;