Changeset 16674
- Timestamp: 2008-08-07T14:01:27+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/textcat.pm
r16554 r16674 44 44 45 45 # caching related 46 my %cache = (); # map of cached text-strings each to array of char-encodings 46 my %filename_cache = (); # map of cached text-strings each to array of char-encodings for the strings themselves 47 my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files 47 48 my $MAX_CACHE_SIZE = 1000; 48 49 … … 132 133 } 133 134 134 # Same as above, but caches textcat results for subsequent use. 135 # The cache is a map of the string to the corresponding array of results 136 # returned by textcat of the possible filename-encodings for that string. 137 # Use this method for short strings (such as filenames) rather than huge text 138 # files. The cache will be cleared when the max_cache_size is reached, which 139 # is MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 140 # can also be cleared by a call to clear_cache. 141 sub classify_cached { 142 my ($self, $inputref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 135 # Same as above, but caches textcat results on filenames for subsequent use. 136 # The cache is a map of the filename to the corresponding filename_encodings 137 # (an array of results returned by textcat of the possible filename-encodings 138 # for the indexing filename string itself). 139 # Need to make sure that the filename is only the tailname: no path and no 140 # extension (no digits), in order to make optimum use of cached textcat. 141 # Textcat is performed on $filename_ref and the results associated with $filename_ref. 142 # The cache will be cleared when the max_cache_size is reached, which is 143 # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 144 # can also be cleared by a call to clear_filename_cache. 
145 sub classify_cached_filename { 146 my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 143 147 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 144 148 145 149 # if not already in the cache, work it out and put it there 146 if (!defined $ cache{$$inputref})150 if (!defined $filename_cache{$$filename_ref}) 147 151 { 148 if (scalar (keys %cache) >= $self->{'max_cache_size'}) { 149 $self->clear_cache(); 150 } 151 ## print STDERR "$$inputref is not yet in the cache\n"; 152 $cache{$$inputref} = $self->classify($inputref, $opt_freq, $opt_factor, $opt_top); 153 } else { 154 ## print STDERR "$$inputref is already in the cache\n"; 155 } 156 157 ## print STDERR "Count of elements in cache is now: ".scalar (keys %cache)."\n"; 152 if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) { 153 $self->clear_filename_cache(); 154 } 155 $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top); 156 } 158 157 159 158 # return cached array of encodings for the given string 160 return $cache{$$inputref}; 161 } 162 163 # Clears the cache (a map of strings to the textcat results for each string). 164 sub clear_cache { 159 return $filename_cache{$$filename_ref}; 160 } 161 162 # Same as above, but caches textcat results on filecontents for subsequent use. 163 # The cache is a map of the filename to an array of possible filename_encodings 164 # for the *contents* of the file returned by textcat. 165 # Textcat is performed on $contents_ref and the results associated with $filename. 166 # The cache will be cleared when the max_cache_size is reached, which is 167 # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 168 # can also be cleared by a call to clear_filecontents_cache. 
169 sub classify_cached_filecontents { 170 my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 171 $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 172 173 # if not already in the cache, work it out and put it there 174 if (!defined $filecontents_cache{$filename}) 175 { 176 if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 177 $self->clear_filecontents_cache(); 178 } 179 $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top); 180 } 181 182 # return cached array of content encodings for the given filename 183 return $filecontents_cache{$filename}; 184 } 185 186 # Clears the filename cache (a map of strings to the textcat results for each string). 187 sub clear_filename_cache { 165 188 my $self = shift (@_); 166 189 167 %cache = undef; # does this suffice to release memory? 168 %cache = (); 190 %filename_cache = undef; # does this suffice to release memory? 191 %filename_cache = (); 192 } 193 194 # Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file). 195 sub clear_filecontents_cache { 196 my $self = shift (@_); 197 198 %filecontents_cache = undef; # does this suffice to release memory? 199 %filecontents_cache = (); 169 200 } 170 201
Note: See TracChangeset for help on using the changeset viewer.