greenstone.org greenstone wiki greenstone trac planet greenstone

Changeset 17214

Show
Ignore:
Timestamp:
2008-09-08 13:44:20 (3 months ago)
Author:
ak19
Message:

Significant changes: 1. Textcat can be restricted to a given encoding when the encoding of a file's contents is known. In this case we are after the language of the contents. 2. Code for working out most frequently occurring encoding has been moved into here from ReadTextFile?.pm. 3. Subroutines renamed and unnecessary parameters passed to the various versions of classify() have been moved into set and get methods instead.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • gsdl/trunk/perllib/textcat.pm

    r16674 r17214  
    8787 
    8888 
    89  
    9089# CLASSIFICATION 
    9190# 
     
    9392#   Input:  text string 
    9493#   Output: array of language names 
    95  
     94# $languages is the set of language models to consider (to textcat on) 
     95# Can be set to filter out language models that don't belong to the given encoding  
     96# in order to obtain a list of the probable languages for that known encoding. 
     97# $filter_by_encoding indicates what encoding to narrow the search for languages down to. 
     98# This is for when we already know the encoding, but we're still looking for the language. 
    9699sub classify { 
    97     my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_; 
    98      
    99     $self->{'opt_f'} = $opt_freq if defined $opt_freq; 
    100     $self->{'opt_u'} = $opt_factor if defined $opt_factor; 
    101     $self->{'opt_t'} = $opt_top if defined $opt_top; 
    102      
     100    my ($self, $inputref, $filter_by_encoding)=@_; 
     101    my $languages; 
     102    @$languages = (); 
     103 
     104    # filter language filenames by encoding  
     105    if(defined $filter_by_encoding) { 
     106        # make sure to normalize language and filtering encoding so we are not 
     107        # stuck comparing hyphens with underscores in such things as iso-8859-1 
     108        my $normalized_filter = $filter_by_encoding;  
     109        $normalized_filter =~ s/[\W\_]//g; 
     110 
     111        foreach my $lang (@{$self->{'languages'}}) { 
     112            my $normalized_lang = $lang;  
     113            $normalized_lang =~ s/[\W\_]//g; 
     114 
     115            if($normalized_lang =~ m/$normalized_filter/i) { 
     116                push (@$languages, $lang); 
     117            } 
     118        } 
     119    } 
     120 
     121    # if the filter_by_encoding wasn't in the list of language model filenames 
     122    # or if we're not filtering, then work with all language model filenames 
     123    if(scalar @$languages == 0) { 
     124        $languages = $self->{'languages'}; 
     125    } 
     126 
    103127    my %results = (); 
    104128    my $maxp = $self->{'opt_t'}; 
     
    107131    my $unknown = $self->create_lm($inputref); 
    108132 
    109     foreach my $language (@{$self->{'languages'}}) { 
    110      
     133    foreach my $language (@$languages) {         
    111134        # compare language model with input ngrams list 
    112135        my ($i,$p)=(0,0); 
     
    133156} 
    134157 
    135 # Same as above, but caches textcat results on filenames for subsequent use. 
     158 
     159# Same as below, but caches textcat results on filenames for subsequent use. 
    136160# The cache is a map of the filename to the corresponding filename_encodings 
    137161# (an array of results returned by textcat of the possible filename-encodings  
     
    144168# can also be cleared by a call to clear_filename_cache. 
    145169sub classify_cached_filename { 
    146     my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
    147     $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
     170    my ($self, $filename_ref)=@_; 
    148171     
    149172    # if not already in the cache, work it out and put it there 
     
    153176            $self->clear_filename_cache(); 
    154177        } 
    155         $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top); 
     178        $filename_cache{$$filename_ref} = $self->classify($filename_ref); 
    156179    }  
    157180 
     
    161184 
    162185# Same as above, but caches textcat results on filecontents for subsequent use. 
     186# Textcat on a file's contents to work out its possible encodings. Uses the cache. 
    163187# The cache is a map of the filename to an array of possible filename_encodings 
    164188# for the *contents* of the file returned by textcat. 
     
    167191# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache 
    168192# can also be cleared by a call to clear_filecontents_cache. 
    169 sub classify_cached_filecontents { 
    170     my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
    171     $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
    172   
     193sub classify_contents { 
     194    my ($self, $contents_ref, $filename)=@_; 
     195      
    173196    # if not already in the cache, work it out and put it there 
    174     if (!defined $filecontents_cache{$filename})  
     197    if (!defined $filecontents_cache{$filename}) 
    175198    {       
    176199        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 
    177200            $self->clear_filecontents_cache(); 
    178201        } 
    179         $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top); 
    180     } 
    181  
     202         
     203        # Finally, we can perform the textcat classification of language and encoding 
     204        $filecontents_cache{$filename} = $self->classify($contents_ref); 
     205    } 
    182206    # return cached array of content encodings for the given filename 
    183207    return $filecontents_cache{$filename}; 
     208} 
     209 
     210 
     211# Given the known encoding for a file's contents, performs a textcat  
     212# filtering on the languages for the given encoding. Results are stored 
     213# in the cache TWICE: once under $filename|$filter_by_encoding, and 
     214# once under the usual $filename, so that subsequent calls to either 
     215# this method or classify_contents using the same filename will not 
     216# perform textcat again.  
     217sub classify_contents_for_encoding { 
     218    my ($self, $contents_ref, $filename, $filter_by_encoding)=@_; 
     219 
     220    if (!defined $filecontents_cache{"$filename|$filter_by_encoding"}) 
     221    {       
     222        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) { 
     223            $self->clear_filecontents_cache(); 
     224        }        
     225 
     226        $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding); 
     227        # store this in cache again under $filename entry, so that subsequent  
     228        # calls to classify_contents will find it in the cache already 
     229        $filecontents_cache{$filename} = $self->classify($contents_ref, $filter_by_encoding); 
     230    } 
     231    return $filecontents_cache{$filename}; 
     232} 
     233    
     234 
     235# This method returns the most frequently occurring encoding  
     236# but only if any encoding occurs more than once in the given results. 
     237# Otherwise, "" is returned. 
     238sub most_frequent_encoding { 
     239    my ($self, $results) = @_; 
     240    my $best_encoding = ""; 
     241 
     242    # guessed_encodings is a hashmap of Encoding -> Frequency pairs 
     243    my %guessed_encodings = (); 
     244    foreach my $result (@$results) { 
     245        # Get the encoding portion of a language-model filename like en-iso8859_1 
     246        my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/); 
     247        if(!defined($guessed_encodings{$encoding})) { 
     248            $guessed_encodings{$encoding} = 0; 
     249        } 
     250        $guessed_encodings{$encoding}++; 
     251    } 
     252     
     253    $guessed_encodings{""}=-1; # for default best_encoding of "" 
     254 
     255    foreach my $enc (keys %guessed_encodings) { 
     256        if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) { 
     257            $best_encoding = $enc; 
     258        } 
     259    } 
     260 
     261    # If best_encoding's frequency == 1, then the frequency for all encodings will  
     262    # be 1 since the sum total of all frequencies is num_results: if any encoding 
     263    # has frequency > 1 (it's possibly the best_encoding), one or more of the others 
     264    # would have been at 0 frequency to compensate. 
     265    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : ""; 
     266} 
     267 
     268 
     269# set some of the specific member variables 
     270sub set_opts { 
     271    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_; 
     272          
     273    $self->{'opt_f'} = $opt_freq if defined $opt_freq; 
     274    $self->{'opt_u'} = $opt_factor if defined $opt_factor; 
     275    $self->{'opt_t'} = $opt_top if defined $opt_top; 
     276 
     277    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache; 
     278} 
     279 
     280sub get_opts { 
     281    my $self = shift (@_); 
     282    return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'}); 
    184283} 
    185284