Changeset 16554
- Timestamp:
- 2008-07-25T16:51:50+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/textcat.pm
r15894 r16554 43 43 my $non_word_characters = '0-9\s'; 44 44 45 # caching related 46 my %cache = (); # map of cached text-strings each to array of char-encodings 47 my $MAX_CACHE_SIZE = 1000; 48 45 49 sub new { 46 50 my $class = shift (@_); 51 my ($tmp_f, $tmp_t, $tmp_u) = @_; 52 47 53 my $self = {}; 48 54 … … 70 76 71 77 $self->{'languages'} = \@languages; 78 79 $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f; 80 $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t; 81 $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u; 82 $self->{'max_cache_size'} = $MAX_CACHE_SIZE; 83 72 84 return bless $self, $class; 73 85 } … … 82 94 83 95 sub classify { 84 my ($self, $inputref)=@_; 96 my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_; 97 98 $self->{'opt_f'} = $opt_freq if defined $opt_freq; 99 $self->{'opt_u'} = $opt_factor if defined $opt_factor; 100 $self->{'opt_t'} = $opt_top if defined $opt_top; 101 85 102 my %results = (); 86 my $maxp = $ opt_t;103 my $maxp = $self->{'opt_t'}; 87 104 88 105 # create ngrams for input. … … 108 125 109 126 my @answers=(shift(@results)); 110 while (@results && $results{$results[0]} < ($ opt_u*$a)) {127 while (@results && $results{$results[0]} < ($self->{'opt_u'} *$a)) { 111 128 @answers=(@answers,shift(@results)); 112 129 } 113 130 114 131 return \@answers; 132 } 133 134 # Same as above, but caches textcat results for subsequent use. 135 # The cache is a map of the string to the corresponding array of results 136 # returned by textcat of the possible filename-encodings for that string. 137 # Use this method for short strings (such as filenames) rather than huge text 138 # files. The cache will be cleared when the max_cache_size is reached, which 139 # is MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 140 # can also be cleared by a call to clear_cache. 141 sub classify_cached { 142 my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 143 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 144 145 # if not already in the cache, work it out and put it there 146 if (!defined $cache{$$inputref}) 147 { 148 if (scalar (keys %cache) >= $self->{'max_cache_size'}) { 149 $self->clear_cache(); 150 } 151 ## print STDERR "$$inputref is not yet in the cache\n"; 152 $cache{$$inputref} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top); 153 } else { 154 ## print STDERR "$$inputref is already in the cache\n"; 155 } 156 157 ## print STDERR "Count of elements in cache is now: ".scalar (keys %cache)."\n"; 158 159 # return cached array of encodings for the given string 160 return $cache{$$inputref}; 161 } 162 163 # Clears the cache (a map of strings to the textcat results for each string). 164 sub clear_cache { 165 my $self = shift (@_); 166 167 %cache = undef; # does this suffice to release memory? 168 %cache = (); 115 169 } 116 170 … … 138 192 } 139 193 140 map { if ($ngram->{$_} <= $ opt_f) { delete $ngram->{$_}; }194 map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; } 141 195 } keys %$ngram; 142 196 … … 145 199 # times slower..., although it would be somewhat nicer (unique result) 146 200 my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram; 147 splice(@sorted,$ opt_t) if (@sorted > $opt_t);201 splice(@sorted,$self->{'opt_t'}) if (@sorted > $self->{'opt_t'}); 148 202 return \@sorted; 149 203 }
Note:
See TracChangeset
for help on using the changeset viewer.