Changeset 16554

Show
Ignore:
Timestamp:
25.07.2008 16:51:50 (11 years ago)
Author:
ak19
Message:

Added subroutines classify_cached and clear_cache. The first of these is called by BasePlugin?'s filepath_to_utf8 subroutine. Textcat now caches results when called with classify_cached() which can be useful for small strings (like filenames) so that textcat need not be performed repeatedly on the same strings.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/textcat.pm

    r15894 r16554  
    4343my $non_word_characters = '0-9\s'; 
    4444 
     45# caching related 
     46my %cache = (); # map of cached text-strings each to array of char-encodings 
     47my $MAX_CACHE_SIZE = 1000; 
     48 
    4549sub new { 
    4650    my $class = shift (@_); 
     51    my ($tmp_f, $tmp_t, $tmp_u) = @_; 
     52 
    4753    my $self = {}; 
    4854 
     
    7076 
    7177    $self->{'languages'} = \@languages; 
     78 
     79    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f; 
     80    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t; 
     81    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u; 
     82    $self->{'max_cache_size'} = $MAX_CACHE_SIZE; 
     83 
    7284    return bless $self, $class; 
    7385} 
     
    8294 
    8395sub classify { 
    84     my ($self, $inputref)=@_; 
     96    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_; 
     97     
     98    $self->{'opt_f'} = $opt_freq if defined $opt_freq; 
     99    $self->{'opt_u'} = $opt_factor if defined $opt_factor; 
     100    $self->{'opt_t'} = $opt_top if defined $opt_top; 
     101     
    85102    my %results = (); 
    86     my $maxp = $opt_t; 
     103    my $maxp = $self->{'opt_t'}; 
    87104 
    88105    # create ngrams for input. 
     
    108125   
    109126    my @answers=(shift(@results)); 
    110     while (@results && $results{$results[0]} < ($opt_u *$a)) { 
     127    while (@results && $results{$results[0]} < ($self->{'opt_u'} *$a)) { 
    111128    @answers=(@answers,shift(@results)); 
    112129    } 
    113130 
    114131    return \@answers; 
     132} 
     133 
     134# Same as above, but caches textcat results for subsequent use. 
     135# The cache is a map of the string to the corresponding array of results  
     136# returned by textcat of the possible filename-encodings for that string. 
     137# Use this method for short strings (such as filenames) rather than huge text 
     138# files. The cache will be cleared when the max_cache_size is reached, which 
     139# is MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 
     140# can also be cleared by a call to clear_cache. 
     141sub classify_cached { 
     142    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
     143    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
     144     
     145    # if not already in the cache, work it out and put it there 
     146    if (!defined $cache{$$inputref})  
     147    { 
     148    if (scalar (keys %cache) >= $self->{'max_cache_size'}) { 
     149        $self->clear_cache(); 
     150    } 
     151##  print STDERR "$$inputref is not yet in the cache\n"; 
     152    $cache{$$inputref} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top); 
     153    } else { 
     154##  print STDERR "$$inputref is already in the cache\n"; 
     155    } 
     156 
     157##    print STDERR "Count of elements in cache is now: ".scalar (keys %cache)."\n"; 
     158 
     159    # return cached array of encodings for the given string 
     160    return $cache{$$inputref};  
     161} 
     162 
     163# Clears the cache (a map of strings to the textcat results for each string). 
     164sub clear_cache { 
     165    my $self = shift (@_); 
     166 
     167    %cache = undef; # does this suffice to release memory? 
     168    %cache = (); 
    115169} 
    116170 
     
    138192    } 
    139193 
    140     map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; } 
     194    map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; } 
    141195      } keys %$ngram; 
    142196   
     
    145199    # times slower..., although it would be somewhat nicer (unique result) 
    146200    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram; 
    147     splice(@sorted,$opt_t) if (@sorted > $opt_t);  
     201    splice(@sorted,$self->{'opt_t'}) if (@sorted > $self->{'opt_t'});  
    148202    return \@sorted; 
    149203}