Changeset 17214
- Timestamp:
- 2008-09-08T13:44:20+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/textcat.pm
r16674 r17214 87 87 88 88 89 90 89 # CLASSIFICATION 91 90 # … … 93 92 # Input: text string 94 93 # Output: array of language names 95 94 # $languages is the set of language models to consider (to textcat on) 95 # Can be set to filter out language models that don't belong to the given encoding 96 # in order to obtain a list of the probable languages for that known encoding. 97 # $filter_by_encoding indicates what encoding to narrow the search for languages down to. 98 # This is for when we already know the encoding, but we're still looking for the language. 96 99 sub classify { 97 my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_; 98 99 $self->{'opt_f'} = $opt_freq if defined $opt_freq; 100 $self->{'opt_u'} = $opt_factor if defined $opt_factor; 101 $self->{'opt_t'} = $opt_top if defined $opt_top; 102 100 my ($self, $inputref, $filter_by_encoding)=@_; 101 my $languages; 102 @$languages = (); 103 104 # filter language filenames by encoding 105 if(defined $filter_by_encoding) { 106 # make sure to normalize language and filtering encoding so we are not 107 # stuck comparing hyphens with underscores in such things as iso-8859-1 108 my $normalized_filter = $filter_by_encoding; 109 $normalized_filter =~ s/[\W\_]//g; 110 111 foreach my $lang (@{$self->{'languages'}}) { 112 my $normalized_lang = $lang; 113 $normalized_lang =~ s/[\W\_]//g; 114 115 if($normalized_lang =~ m/$normalized_filter/i) { 116 push (@$languages, $lang); 117 } 118 } 119 } 120 121 # if the filter_by_encoding wasn't in the list of language model filenames 122 # or if we're not filtering, then work with all language model filenames 123 if(scalar @$languages == 0) { 124 $languages = $self->{'languages'}; 125 } 126 103 127 my %results = (); 104 128 my $maxp = $self->{'opt_t'}; … … 107 131 my $unknown = $self->create_lm($inputref); 108 132 109 foreach my $language (@{$self->{'languages'}}) { 110 133 foreach my $language (@$languages) { 111 134 # compare language model with input ngrams list 112 135 my ($i,$p)=(0,0); … … 133 156 } 134 157 135 # Same as above, but caches textcat results on filenames for subsequent use. 158 159 # Same as below, but caches textcat results on filenames for subsequent use. 136 160 # The cache is a map of the filename to the corresponding filename_encodings 137 161 # (an array of results returned by textcat of the possible filename-encodings … … 144 168 # can also be cleared by a call to clear_filename_cache. 145 169 sub classify_cached_filename { 146 my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 147 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 170 my ($self, $filename_ref)=@_; 148 171 149 172 # if not already in the cache, work it out and put it there … … 153 176 $self->clear_filename_cache(); 154 177 } 155 $filename_cache{$$filename_ref} = $self->classify($filename_ref , $opt_freq, $opt_factor, $opt_top);178 $filename_cache{$$filename_ref} = $self->classify($filename_ref); 156 179 } 157 180 … … 161 184 162 185 # Same as above, but caches textcat results on filecontents for subsequent use. 186 # Textcat on a file's contents to work out its possible encodings. Uses the cache. 163 187 # The cache is a map of the filename to an array of possible filename_encodings 164 188 # for the *contents* of the file returned by textcat. … … 167 191 # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 168 192 # can also be cleared by a call to clear_filecontents_cache. 169 sub classify_cached_filecontents { 170 my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 171 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 172 193 sub classify_contents { 194 my ($self, $contents_ref, $filename)=@_; 195 173 196 # if not already in the cache, work it out and put it there 174 if (!defined $filecontents_cache{$filename}) 197 if (!defined $filecontents_cache{$filename}) 175 198 { 176 199 if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 177 200 $self->clear_filecontents_cache(); 178 201 } 179 $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top); 180 } 181 202 203 # Finally, we can perform the textcat classification of language and encoding 204 $filecontents_cache{$filename} = $self->classify($contents_ref); 205 } 182 206 # return cached array of content encodings for the given filename 183 207 return $filecontents_cache{$filename}; 208 } 209 210 211 # Given the known encoding for a file's contents, performs a textcat 212 # filtering on the languages for the given encoding. Results are stored 213 # in the cache TWICE: once under $filename|$filter_by_encoding, and 214 # once under the usual $filename, so that subsequent calls to either 215 # this method or classify_contents using the same filename will not 216 # perform textcat again. 217 sub classify_contents_for_encoding { 218 my ($self, $contents_ref, $filename, $filter_by_encoding)=@_; 219 220 if (!defined $filecontents_cache{"$filename|$filter_by_encoding"}) 221 { 222 if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 223 $self->clear_filecontents_cache(); 224 } 225 226 $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding); 227 # store this in cache again under $filename entry, so that subsequent 228 # calls to classify_contents will find it in the cache already 229 $filecontents_cache{$filename} = $self->classify($contents_ref, $filter_by_encoding); 230 } 231 return $filecontents_cache{$filename}; 232 } 233 234 235 # This method returns the most frequently occurring encoding 236 # but only if any encoding occurs more than once in the given results. 237 # Otherwise, "" is returned. 238 sub most_frequent_encoding { 239 my ($self, $results) = @_; 240 my $best_encoding = ""; 241 242 # guessed_encodings is a hashmap of Encoding -> Frequency pairs 243 my %guessed_encodings = (); 244 foreach my $result (@$results) { 245 # Get the encoding portion of a language-model filename like en-iso8859_1 246 my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/); 247 if(!defined($guessed_encodings{$encoding})) { 248 $guessed_encodings{$encoding} = 0; 249 } 250 $guessed_encodings{$encoding}++; 251 } 252 253 $guessed_encodings{""}=-1; # for default best_encoding of "" 254 255 foreach my $enc (keys %guessed_encodings) { 256 if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) { 257 $best_encoding = $enc; 258 } 259 } 260 261 # If best_encoding's frequency == 1, then the frequency for all encodings will 262 # be 1 since the sum total of all frequencies is num_results: if any encoding 263 # has frequency > 1 (it's possibly the best_encoding), one or more of the others 264 # would have been at 0 frequency to compensate. 265 return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : ""; 266 } 267 268 269 # set some of the specific member variables 270 sub set_opts { 271 my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 272 273 $self->{'opt_f'} = $opt_freq if defined $opt_freq; 274 $self->{'opt_u'} = $opt_factor if defined $opt_factor; 275 $self->{'opt_t'} = $opt_top if defined $opt_top; 276 277 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 278 } 279 280 sub get_opts { 281 my $self = shift (@_); 282 return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'}); 184 283 } 185 284
Note:
See TracChangeset
for help on using the changeset viewer.