[1316] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
| 3 | # textcat.pm -- Identify the language of a piece of text
|
---|
| 4 | #
|
---|
| 5 | #
|
---|
| 6 | # This file is based on TextCat version 1.08 by Gertjan van Noord
|
---|
| 7 | # Copyright (C) 1997 Gertjan van Noord ([email protected])
|
---|
| 8 | # TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
|
---|
| 9 | #
|
---|
| 10 | # It was modified by Gordon Paynter ([email protected]) and turned
|
---|
| 11 | # into a package for use in Greenstone digital library system. Most of
|
---|
| 12 | # the modifications consist of commenting out or deleting functionality
|
---|
| 13 | # I don't need.
|
---|
| 14 | #
|
---|
| 15 | #
|
---|
| 16 | # This program is free software; you can redistribute it and/or modify
|
---|
| 17 | # it under the terms of the GNU General Public License as published by
|
---|
| 18 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 19 | # (at your option) any later version.
|
---|
| 20 | #
|
---|
| 21 | # This program is distributed in the hope that it will be useful,
|
---|
| 22 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 23 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 24 | # GNU General Public License for more details.
|
---|
| 25 | #
|
---|
| 26 | # You should have received a copy of the GNU General Public License
|
---|
| 27 | # along with this program; if not, write to the Free Software
|
---|
| 28 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 29 | #
|
---|
| 30 | ###########################################################################
|
---|
| 31 |
|
---|
package textcat;

# Enforce lexical declarations and enable runtime diagnostics.  `use warnings`
# was missing from the original; it surfaces mistakes such as assigning undef
# to a hash (see the cache-clearing subs below).
use strict;
use warnings;
---|
# OPTIONS
# Directory holding the "<language>.lm" model files shipped with Greenstone.
# NOTE(review): assumes GSDLHOME is set in the environment when this module
# is loaded -- confirm callers only use textcat inside a configured install.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Character class treated as word separators when building ngrams (digits and
# whitespace); also used to filter lines read from the .lm model files.
my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = ();     # map of cached text-strings each to array of char-encodings for the strings themselves
my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
my $MAX_CACHE_SIZE = 1000;   # default number of entries before a cache is wholesale-cleared
| 49 |
|
---|
# Constructor.  Optional arguments override the module-level defaults for
# opt_f (rare-ngram cutoff), opt_t (ngram list length) and opt_u (tolerance
# factor).  Dies if no readable language models are found in $model_dir.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # Discover the supported languages: one "<language>.lm" file each.
    # Lexical dir handle and anchored extension strip (the original used a
    # bareword handle and an unanchored s/\.lm//, which could mangle names
    # containing ".lm" in the middle).
    opendir my $model_dh, $model_dir or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir $model_dh;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Load the ranked ngram model for each language: each usable line of a
    # .lm file contributes one ngram whose rank is its position in the file.
    foreach my $language (@languages) {
	my $rank = 1;
	# Three-arg open with a lexical handle; the original used the
	# bareword LM and two-arg open.
	open my $lm_fh, '<', "$model_dir/$language.lm"
	    or die "cannot open $language.lm: $!\n";
	while (my $line = <$lm_fh>) {
	    chomp $line;
	    # Only use lines starting with an ngram character; other lines
	    # (blank lines, frequency counts) are ignored.  A capture group
	    # replaces the original's $& (which penalises all regexes).
	    if ($line =~ /^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rank++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;

    # Per-instance options fall back to the module-level defaults.
    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
|
---|
| 87 |
|
---|
| 88 |
|
---|
| 89 |
|
---|
[1316] | 90 | # CLASSIFICATION
|
---|
| 91 | #
|
---|
| 92 | # What language is a text string?
|
---|
| 93 | # Input: text string
|
---|
| 94 | # Output: array of language names
|
---|
| 95 |
|
---|
# Identify the language(s) of a text string.
# Input:  $inputref -- reference to the text string; optional per-call
#         overrides for opt_f, opt_u and opt_t (NOTE: these persist on the
#         object for subsequent calls -- long-standing behaviour).
# Output: reference to an array of language names, best match first, keeping
#         every candidate scoring within opt_u times the best score.
sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top) = @_;

    $self->{'opt_f'} = $opt_freq   if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top    if defined $opt_top;

    my %results = ();
    # Penalty for an input ngram that a language model lacks: the maximum
    # possible rank distance.
    my $maxp = $self->{'opt_t'};

    # Build the ranked ngram list for the input text.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

	# Out-of-place score: sum, over the input ngrams, of the distance
	# between each ngram's rank in the input and its rank in the model
	# ($maxp when the model does not contain the ngram).  Lower is better.
	my $model = $self->{'ngrams'}->{$language};
	my $distance = 0;
	for my $i (0 .. $#$unknown) {
	    if (defined $model->{$unknown->[$i]}) {
		$distance += abs($model->{$unknown->[$i]} - $i);
	    } else {
		$distance += $maxp;
	    }
	}
	$results{$language} = $distance;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    # Best (lowest) score.  Deliberately NOT named $a: the original declared
    # `my $a`, which shadows sort's package variable -- a classic Perl trap.
    my $best_score = $results{$results[0]};

    # Keep the best answer plus any others within opt_u * best.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $best_score)) {
	push @answers, shift(@results);
    }

    return \@answers;
}
|
---|
| 134 |
|
---|
[16674] | 135 | # Same as above, but caches textcat results on filenames for subsequent use.
|
---|
| 136 | # The cache is a map of the filename to the corresponding filename_encodings
|
---|
| 137 | # (an array of results returned by textcat of the possible filename-encodings
|
---|
| 138 | # for the indexing filename string itself).
|
---|
| 139 | # Need to make sure that the filename is only the tailname: no path and no
|
---|
| 140 | # extension (no digits), in order to make optimum use of cached textcat.
|
---|
| 141 | # Textcat is performed on $filename_ref and the results associated with $filename_ref.
|
---|
| 142 | # The cache will be cleared when the max_cache_size is reached, which is
|
---|
| 143 | # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
|
---|
| 144 | # can also be cleared by a call to clear_filename_cache.
|
---|
# Cached variant of classify() keyed on the string itself: textcat is run on
# $filename_ref and the result memoised in %filename_cache under that string.
# Callers should pass only the tailname (no path, no digit-bearing extension)
# to get the best hit rate.  When the cache reaches max_cache_size (default
# MAX_CACHE_SIZE, overridable via the last parameter) it is cleared wholesale;
# clear_filename_cache() empties it on demand.
sub classify_cached_filename {
    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    my $key = $$filename_ref;

    # On a miss, evict everything if full, then compute and memoise.
    unless (defined $filename_cache{$key}) {
	$self->clear_filename_cache()
	    if scalar(keys %filename_cache) >= $self->{'max_cache_size'};
	$filename_cache{$key} =
	    $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # Cached array of possible encodings for the given string.
    return $filename_cache{$key};
}
|
---|
| 161 |
|
---|
| 162 | # Same as above, but caches textcat results on filecontents for subsequent use.
|
---|
| 163 | # The cache is a map of the filename to an array of possible filename_encodings
|
---|
| 164 | # for the *contents* of the file returned by textcat.
|
---|
| 165 | # Textcat is performed on $contents_ref and the results associated with $filename.
|
---|
| 166 | # The cache will be cleared when the max_cache_size is reached, which is
|
---|
| 167 | # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
|
---|
| 168 | # can also be cleared by a call to clear_filecontents_cache.
|
---|
# Cached variant of classify() keyed on a filename: textcat is run on
# $contents_ref (the file's contents) and the result memoised in
# %filecontents_cache under $filename.  When the cache reaches max_cache_size
# (default MAX_CACHE_SIZE, overridable via the last parameter) it is cleared
# wholesale; clear_filecontents_cache() empties it on demand.
sub classify_cached_filecontents {
    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # On a miss, evict everything if full, then compute and memoise.
    unless (defined $filecontents_cache{$filename}) {
	$self->clear_filecontents_cache()
	    if scalar(keys %filecontents_cache) >= $self->{'max_cache_size'};
	$filecontents_cache{$filename} =
	    $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # Cached array of content encodings for the given filename.
    return $filecontents_cache{$filename};
}
|
---|
[16554] | 185 |
|
---|
# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # Assigning the empty list releases every entry and its memory.  The
    # original also did `%filename_cache = undef`, which merely raises an
    # "Odd number of elements" warning and briefly creates a bogus
    # "" => undef entry before being overwritten -- removed.
    %filename_cache = ();
}
|
---|
| 193 |
|
---|
# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # Assigning the empty list releases every entry and its memory.  The
    # original also did `%filecontents_cache = undef`, which merely raises an
    # "Odd number of elements" warning and briefly creates a bogus
    # "" => undef entry before being overwritten -- removed.
    %filecontents_cache = ();
}
|
---|
| 201 |
|
---|
# Build the ngram profile of the text in $textref: count all 1..5-grams of
# each word (padded with '_' markers so ngrams can anchor to word
# boundaries), drop ngrams occurring no more than opt_f times, and return a
# reference to the surviving ngrams sorted by descending frequency,
# truncated to the opt_t most frequent.
sub create_lm {
    my ($self, $textref) = @_;

    # Hash mapping each ngram to its occurrence count.
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;

	# At each start position, count every ngram of length 1..5 that still
	# fits in the word; $len shrinks with the remaining suffix while
	# $flen keeps the full length for the loop bound.
	for (my $i = 0; $i < $flen; $i++) {
	    $ngram->{substr($word,$i,5)}++ if $len > 4;
	    $ngram->{substr($word,$i,4)}++ if $len > 3;
	    $ngram->{substr($word,$i,3)}++ if $len > 2;
	    $ngram->{substr($word,$i,2)}++ if $len > 1;
	    $ngram->{substr($word,$i,1)}++;
	    $len--;
	}
    }

    # Remove rare ngrams (count <= opt_f).  The original used map in void
    # context purely for its delete side effect; a foreach states the intent.
    # (keys returns a snapshot list, so deleting during the loop is safe.)
    foreach my $key (keys %$ngram) {
	delete $ngram->{$key} if $ngram->{$key} <= $self->{'opt_f'};
    }

    # Sort the ngrams and return the opt_t most frequent ones.
    # Adding `or $a cmp $b' in the sort block would give a unique (stable)
    # result but makes sorting about five times slower.
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}
|
---|
| 235 |
|
---|
1; # A Perl module must end by returning a true value.