| 97 | | my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_; |
|---|
| 98 | | |
|---|
| 99 | | $self->{'opt_f'} = $opt_freq if defined $opt_freq; |
|---|
| 100 | | $self->{'opt_u'} = $opt_factor if defined $opt_factor; |
|---|
| 101 | | $self->{'opt_t'} = $opt_top if defined $opt_top; |
|---|
| 102 | | |
|---|
| | 100 | my ($self, $inputref, $filter_by_encoding)=@_; |
|---|
| | 101 | my $languages; |
|---|
| | 102 | @$languages = (); |
|---|
| | 103 | |
|---|
| | 104 | # filter language filenames by encoding |
|---|
| | 105 | if(defined $filter_by_encoding) { |
|---|
| | 106 | # make sure to normalize language and filtering encoding so we are not |
|---|
| | 107 | # stuck comparing hyphens with underscores in such things as iso-8859-1 |
|---|
| | 108 | my $normalized_filter = $filter_by_encoding; |
|---|
| | 109 | $normalized_filter =~ s/[\W\_]//g; |
|---|
| | 110 | |
|---|
| | 111 | foreach my $lang (@{$self->{'languages'}}) { |
|---|
| | 112 | my $normalized_lang = $lang; |
|---|
| | 113 | $normalized_lang =~ s/[\W\_]//g; |
|---|
| | 114 | |
|---|
| | 115 | if($normalized_lang =~ m/$normalized_filter/i) { |
|---|
| | 116 | push (@$languages, $lang); |
|---|
| | 117 | } |
|---|
| | 118 | } |
|---|
| | 119 | } |
|---|
| | 120 | |
|---|
| | 121 | # if the filter_by_encoding wasn't in the list of language model filenames |
|---|
| | 122 | # or if we're not filtering, then work with all language model filenames |
|---|
| | 123 | if(scalar @$languages == 0) { |
|---|
| | 124 | $languages = $self->{'languages'}; |
|---|
| | 125 | } |
|---|
| | 126 | |
|---|
| | 208 | } |
|---|
| | 209 | |
|---|
| | 210 | |
|---|
| | 211 | # Given the known encoding for a file's contents, performs a textcat |
|---|
| | 212 | # filtering on the languages for the given encoding. Results are stored |
|---|
| | 213 | # in the cache TWICE: once under $filename|$filter_by_encoding, and |
|---|
| | 214 | # once under the usual $filename, so that subsequent calls to either |
|---|
| | 215 | # this method or classify_contents using the same filename will not |
|---|
| | 216 | # perform textcat again. |
|---|
| | 217 | sub classify_contents_for_encoding { |
|---|
| | 218 | my ($self, $contents_ref, $filename, $filter_by_encoding)=@_; |
|---|
| | 219 | |
|---|
| | 220 | if (!defined $filecontents_cache{"$filename|$filter_by_encoding"}) |
|---|
| | 221 | { |
|---|
| | 222 | if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { |
|---|
| | 223 | $self->clear_filecontents_cache(); |
|---|
| | 224 | } |
|---|
| | 225 | |
|---|
| | 226 | $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding); |
|---|
| | 227 | # store this in cache again under $filename entry, so that subsequent |
|---|
| | 228 | # calls to classify_contents will find it in the cache already |
|---|
| | 229 | $filecontents_cache{$filename} = $self->classify($contents_ref, $filter_by_encoding); |
|---|
| | 230 | } |
|---|
| | 231 | return $filecontents_cache{$filename}; |
|---|
| | 232 | } |
|---|
| | 233 | |
|---|
| | 234 | |
|---|
| | 235 | # This method returns the most frequently occurring encoding |
|---|
| | 236 | # but only if any encoding occurs more than once in the given results. |
|---|
| | 237 | # Otherwise, "" is returned. |
|---|
| | 238 | sub most_frequent_encoding { |
|---|
| | 239 | my ($self, $results) = @_; |
|---|
| | 240 | my $best_encoding = ""; |
|---|
| | 241 | |
|---|
| | 242 | # guessed_encodings is a hashmap of Encoding -> Frequency pairs |
|---|
| | 243 | my %guessed_encodings = (); |
|---|
| | 244 | foreach my $result (@$results) { |
|---|
| | 245 | # Get the encoding portion of a language-model filename like en-iso8859_1 |
|---|
| | 246 | my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/); |
|---|
| | 247 | if(!defined($guessed_encodings{$encoding})) { |
|---|
| | 248 | $guessed_encodings{$encoding} = 0; |
|---|
| | 249 | } |
|---|
| | 250 | $guessed_encodings{$encoding}++; |
|---|
| | 251 | } |
|---|
| | 252 | |
|---|
| | 253 | $guessed_encodings{""}=-1; # for default best_encoding of "" |
|---|
| | 254 | |
|---|
| | 255 | foreach my $enc (keys %guessed_encodings) { |
|---|
| | 256 | if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) { |
|---|
| | 257 | $best_encoding = $enc; |
|---|
| | 258 | } |
|---|
| | 259 | } |
|---|
| | 260 | |
|---|
| | 261 | # If best_encoding's frequency == 1, then the frequency for all encodings will |
|---|
| | 262 | # be 1 since the sum total of all frequencies is num_results: if any encoding |
|---|
| | 263 | # has frequency > 1 (it's possibly the best_encoding), one or more of the others |
|---|
| | 264 | # would have been at 0 frequency to compensate. |
|---|
| | 265 | return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : ""; |
|---|
| | 266 | } |
|---|
| | 267 | |
|---|
| | 268 | |
|---|
| | 269 | # set some of the specific member variables |
|---|
| | 270 | sub set_opts { |
|---|
| | 271 | my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; |
|---|
| | 272 | |
|---|
| | 273 | $self->{'opt_f'} = $opt_freq if defined $opt_freq; |
|---|
| | 274 | $self->{'opt_u'} = $opt_factor if defined $opt_factor; |
|---|
| | 275 | $self->{'opt_t'} = $opt_top if defined $opt_top; |
|---|
| | 276 | |
|---|
| | 277 | $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; |
|---|
| | 278 | } |
|---|
| | 279 | |
|---|
| | 280 | sub get_opts { |
|---|
| | 281 | my $self = shift (@_); |
|---|
| | 282 | return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'}); |
|---|