Changeset 16674

Show
Ignore:
Timestamp:
07.08.2008 14:01:27 (11 years ago)
Author:
ak19
Message:

Added caching for textcat results on filecontents as well: a second map now stores the mapping from filenames to the cached textcat results on the contents of those files.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/textcat.pm

    r16554 r16674  
    4444 
    4545# caching related 
    46 my %cache = (); # map of cached text-strings each to array of char-encodings 
     46my %filename_cache = (); # map of cached filename strings, each to the array of char-encodings for the string itself 
     47my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files 
    4748my $MAX_CACHE_SIZE = 1000; 
    4849 
     
    132133} 
    133134 
    134 # Same as above, but caches textcat results for subsequent use. 
    135 # The cache is a map of the string to the corresponding array of results  
    136 # returned by textcat of the possible filename-encodings for that string. 
    137 # Use this method for short strings (such as filenames) rather than huge text 
    138 # files. The cache will be cleared when the max_cache_size is reached, which 
    139 # is MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 
    140 # can also be cleared by a call to clear_cache. 
    141 sub classify_cached { 
    142     my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
     135# Same as above, but caches textcat results on filenames for subsequent use. 
     136# The cache is a map of the filename to the corresponding filename_encodings 
     137# (an array of results returned by textcat of the possible filename-encodings  
     138# for the indexing filename string itself).  
     139# Callers should pass only the tailname of the file: no path and no 
     140# extension (no digits), so that the cached textcat results are reused as often as possible. 
     141# Textcat is performed on $filename_ref and the results associated with $filename_ref. 
     142# The cache will be cleared when the max_cache_size is reached, which is 
     143# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 
     144# can also be cleared by a call to clear_filename_cache. 
     145sub classify_cached_filename { 
     146    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
    143147    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
    144148     
    145149    # if not already in the cache, work it out and put it there 
    146     if (!defined $cache{$$inputref})  
     150    if (!defined $filename_cache{$$filename_ref})  
    147151    { 
    148     if (scalar (keys %cache) >= $self->{'max_cache_size'}) { 
    149         $self->clear_cache(); 
    150     } 
    151 ##  print STDERR "$$inputref is not yet in the cache\n"; 
    152     $cache{$$inputref} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top); 
    153     } else { 
    154 ##  print STDERR "$$inputref is already in the cache\n"; 
    155     } 
    156  
    157 ##    print STDERR "Count of elements in cache is now: ".scalar (keys %cache)."\n"; 
     152    if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) { 
     153        $self->clear_filename_cache(); 
     154    } 
     155    $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top); 
     156    }  
    158157 
    159158    # return cached array of encodings for the given string 
    160     return $cache{$$inputref};  
    161 } 
    162  
    163 # Clears the cache (a map of strings to the textcat results for each string). 
    164 sub clear_cache { 
     159    return $filename_cache{$$filename_ref};  
     160} 
     161 
     162# Same as above, but caches textcat results on filecontents for subsequent use. 
     163# The cache is a map of the filename to the array of possible encodings that 
     164# textcat returned for the *contents* of that file. 
     165# Textcat is performed on $contents_ref and the results associated with $filename. 
     166# The cache will be cleared when the max_cache_size is reached, which is 
     167# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 
     168# can also be cleared by a call to clear_filecontents_cache. 
     169sub classify_cached_filecontents { 
     170    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
     171    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
     172  
     173    # if not already in the cache, work it out and put it there 
     174    if (!defined $filecontents_cache{$filename})  
     175    {       
     176    if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 
     177        $self->clear_filecontents_cache(); 
     178    } 
     179    $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top); 
     180    } 
     181 
     182    # return cached array of content encodings for the given filename 
     183    return $filecontents_cache{$filename}; 
     184} 
     185 
     186# Clears the filename cache (a map of strings to the textcat results for each string). 
     187sub clear_filename_cache { 
    165188    my $self = shift (@_); 
    166189 
    167     %cache = undef; # does this suffice to release memory? 
    168     %cache = (); 
     190    %filename_cache = undef; # does this suffice to release memory? 
     191    %filename_cache = (); 
     192} 
     193 
     194# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file). 
     195sub clear_filecontents_cache { 
     196    my $self = shift (@_); 
     197 
     198    %filecontents_cache = undef; # does this suffice to release memory? 
     199    %filecontents_cache = (); 
    169200} 
    170201