Changeset 17214


Ignore:
Timestamp:
2008-09-08T13:44:20+12:00 (16 years ago)
Author:
ak19
Message:

Significant changes: 1. Textcat can be restricted to a given encoding when the encoding of a file's contents is known. In this case we are after the language of the contents. 2. Code for working out most frequently occurring encoding has been moved into here from ReadTextFile.pm. 3. Subroutines renamed and unnecessary parameters passed to the various versions of classify() have been moved into set and get methods instead.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/textcat.pm

    r16674 r17214  
    8787
    8888
    89 
    9089# CLASSIFICATION
    9190#
     
    9392#   Input:  text string
    9493#   Output: array of language names
    95 
     94# $languages is the set of language models to consider (to textcat on)
     95# Can be set to filter out language models that don't belong to the given encoding
     96# in order to obtain a list of the probable languages for that known encoding.
     97# $filter_by_encoding indicates what encoding to narrow the search for languages down to.
     98# This is for when we already know the encoding, but we're still looking for the language.
    9699sub classify {
    97     my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_;
    98    
    99     $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    100     $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    101     $self->{'opt_t'} = $opt_top if defined $opt_top;
    102    
     100    my ($self, $inputref, $filter_by_encoding)=@_;
     101    my $languages;
     102    @$languages = ();
     103
     104    # filter language filenames by encoding
     105    if(defined $filter_by_encoding) {
     106    # make sure to normalize language and filtering encoding so we are not
     107    # stuck comparing hyphens with underscores in such things as iso-8859-1
     108    my $normalized_filter = $filter_by_encoding;
     109    $normalized_filter =~ s/[\W\_]//g;
     110
     111    foreach my $lang (@{$self->{'languages'}}) {
     112        my $normalized_lang = $lang;
     113        $normalized_lang =~ s/[\W\_]//g;
     114
     115        if($normalized_lang =~ m/$normalized_filter/i) {
     116        push (@$languages, $lang);
     117        }
     118    }
     119    }
     120
     121    # if the filter_by_encoding wasn't in the list of language model filenames
     122    # or if we're not filtering, then work with all language model filenames
     123    if(scalar @$languages == 0) {
     124    $languages = $self->{'languages'};
     125    }
     126
    103127    my %results = ();
    104128    my $maxp = $self->{'opt_t'};
     
    107131    my $unknown = $self->create_lm($inputref);
    108132
    109     foreach my $language (@{$self->{'languages'}}) {
    110    
     133    foreach my $language (@$languages) {   
    111134    # compare language model with input ngrams list
    112135    my ($i,$p)=(0,0);
     
    133156}
    134157
    135 # Same as above, but caches textcat results on filenames for subsequent use.
     158
     159# Same as below, but caches textcat results on filenames for subsequent use.
    136160# The cache is a map of the filename to the corresponding filename_encodings
    137161# (an array of results returned by textcat of the possible filename-encodings
     
    144168# can also be cleared by a call to clear_filename_cache.
    145169sub classify_cached_filename {
    146     my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
    147     $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
     170    my ($self, $filename_ref)=@_;
    148171   
    149172    # if not already in the cache, work it out and put it there
     
    153176        $self->clear_filename_cache();
    154177    }
    155     $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
     178    $filename_cache{$$filename_ref} = $self->classify($filename_ref);
    156179    }
    157180
     
    161184
    162185# Same as above, but caches textcat results on filecontents for subsequent use.
     186# Textcat on a file's contents to work out its possible encodings. Uses the cache.
    163187# The cache is a map of the filename to an array of possible filename_encodings
    164188# for the *contents* of the file returned by textcat.
     
    167191# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
    168192# can also be cleared by a call to clear_filecontents_cache.
    169 sub classify_cached_filecontents {
    170     my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
    171     $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
    172  
     193sub classify_contents {
     194    my ($self, $contents_ref, $filename)=@_;
     195     
    173196    # if not already in the cache, work it out and put it there
    174     if (!defined $filecontents_cache{$filename}) 
     197    if (!defined $filecontents_cache{$filename})
    175198    {     
    176199    if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
    177200        $self->clear_filecontents_cache();
    178201    }
    179     $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
    180     }
    181 
     202   
     203    # Finally, we can perform the textcat classification of language and encoding
     204    $filecontents_cache{$filename} = $self->classify($contents_ref);
     205    }
    182206    # return cached array of content encodings for the given filename
    183207    return $filecontents_cache{$filename};
     208}
     209
     210
     211# Given the known encoding for a file's contents, performs a textcat
     212# filtering on the languages for the given encoding. Results are stored
     213# in the cache TWICE: once under $filename|$filter_by_encoding, and
     214# once under the usual $filename, so that subsequent calls to either
     215# this method or classify_contents using the same filename will not
     216# perform textcat again.
     217sub classify_contents_for_encoding {
     218    my ($self, $contents_ref, $filename, $filter_by_encoding)=@_;
     219
     220    if (!defined $filecontents_cache{"$filename|$filter_by_encoding"})
     221    {     
     222    if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
     223        $self->clear_filecontents_cache();
     224    }   
     225
     226    $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding);
     227    # store this in cache again under $filename entry, so that subsequent
     228    # calls to classify_contents will find it in the cache already
     229    $filecontents_cache{$filename} = $self->classify($contents_ref, $filter_by_encoding);
     230    }
     231    return $filecontents_cache{$filename};
     232}
     233   
     234
     235# This method returns the most frequently occurring encoding
     236# but only if any encoding occurs more than once in the given results.
     237# Otherwise, "" is returned.
     238sub most_frequent_encoding {
     239    my ($self, $results) = @_;
     240    my $best_encoding = "";
     241
     242    # guessed_encodings is a hashmap of Encoding -> Frequency pairs
     243    my %guessed_encodings = ();
     244    foreach my $result (@$results) {
     245    # Get the encoding portion of a language-model filename like en-iso8859_1
     246    my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
     247    if(!defined($guessed_encodings{$encoding})) {
     248        $guessed_encodings{$encoding} = 0;
     249    }
     250    $guessed_encodings{$encoding}++;
     251    }
     252   
     253    $guessed_encodings{""}=-1; # for default best_encoding of ""
     254
     255    foreach my $enc (keys %guessed_encodings) {
     256    if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
     257        $best_encoding = $enc;
     258    }
     259    }
     260
     261    # If best_encoding's frequency == 1, then the frequency for all encodings will
     262    # be 1 since the sum total of all frequencies is num_results: if any encoding
     263    # has frequency > 1 (it's possibly the best_encoding), one or more of the others
     264    # would have been at 0 frequency to compensate.
     265    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
     266}
     267
     268
     269# set some of the specific member variables
     270sub set_opts {
     271    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
     272         
     273    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
     274    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
     275    $self->{'opt_t'} = $opt_top if defined $opt_top;
     276
     277    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
     278}
     279
     280sub get_opts {
     281    my $self = shift (@_);
     282    return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'});
    184283}
    185284
Note: See TracChangeset for help on using the changeset viewer.