Context Navigation

← Previous Changeset
Next Changeset →

Changeset 17214

Timestamp:

2008-09-08T13:44:20+12:00 (16 years ago)

Author:

ak19

Message:

Significant changes: 1. Textcat can be restricted to a given encoding when the encoding of a file's contents is known. In this case we are after the language of the contents. 2. Code for working out most frequently occurring encoding has been moved into here from ReadTextFile.pm. 3. Subroutines renamed and unnecessary parameters passed to the various versions of classify() have been moved into set and get methods instead.

File:

: 1 edited

gsdl/trunk/perllib/textcat.pm (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/textcat.pm

-              r16674
+              r17214
 # CLASSIFICATION
+#
 …
 #   Input:  text string
 #   Output: array of language names
+# $languages is the set of language models that textcat compares the input against.
+# When $filter_by_encoding is given, $languages is narrowed down to the language
+# models whose names contain that encoding, in order to obtain a list of the
+# probable languages for that known encoding. If no model name matches, or if no
+# filter is given, all language models are considered.
+# $filter_by_encoding indicates what encoding to narrow the search for languages down to.
+# This is for when we already know the encoding, but we're still looking for the language.
 sub classify {
+    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top)=@_;
+    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
+    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
+    $self->{'opt_t'} = $opt_top if defined $opt_top;
+    my ($self, $inputref, $filter_by_encoding)=@_;
+    my $languages;
+    @$languages = ();
+    # filter language filenames by encoding
+    if(defined $filter_by_encoding) {
+    # make sure to normalize language and filtering encoding so we are not
+    # stuck comparing hyphens with underscores in such things as iso-8859-1
+    my $normalized_filter = $filter_by_encoding;
+    $normalized_filter =~ s/[\W\_]//g;
+    foreach my $lang (@{$self->{'languages'}}) {
+        my $normalized_lang = $lang;
+        $normalized_lang =~ s/[\W\_]//g;
+        if($normalized_lang =~ m/$normalized_filter/i) {
+        push (@$languages, $lang);
+        }
+    }
+    }
+    # if the filter_by_encoding wasn't in the list of language model filenames
+    # or if we're not filtering, then work with all language model filenames
+    if(scalar @$languages == 0) {
+    $languages = $self->{'languages'};
+    }
     my %results = ();
     my $maxp = $self->{'opt_t'};
 …
     my $unknown = $self->create_lm($inputref);
+    foreach my $language (@{$self->{'languages'}}) {
+    foreach my $language (@$languages) {
     # compare language model with input ngrams list
     my ($i,$p)=(0,0);
 …
+}
+# Same as above, but caches textcat results on filenames for subsequent use.
+# Same as below, but caches textcat results on filenames for subsequent use.
 # The cache is a map of the filename to the corresponding filename_encodings
 # (an array of results returned by textcat of the possible filename-encodings
 …
 # can also be cleared by a call to clear_filename_cache.
 sub classify_cached_filename {
+    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
+    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
+    my ($self, $filename_ref)=@_;
     # if not already in the cache, work it out and put it there
 …
         $self->clear_filename_cache();
+    }
     $filename_cache{$$filename_ref} = $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
+    $filename_cache{$$filename_ref} = $self->classify($filename_ref);
+    }
 …
 # Same as above, but caches textcat results on filecontents for subsequent use.
+# Textcat on a file's contents to work out its possible encodings. Uses the cache.
 # The cache is a map of the filename to an array of possible filename_encodings
 # for the *contents* of the file returned by textcat.
 …
 # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
 # can also be cleared by a call to clear_filecontents_cache.
+sub classify_cached_filecontents {
+    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
+    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
+sub classify_contents {
+    my ($self, $contents_ref, $filename)=@_;
     # if not already in the cache, work it out and put it there
     if (!defined $filecontents_cache{$filename})
+    if (!defined $filecontents_cache{$filename})
+    {
     if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
         $self->clear_filecontents_cache();
+    }
+    $filecontents_cache{$filename} = $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
+    }
+    # Finally, we can perform the textcat classification of language and encoding
+    $filecontents_cache{$filename} = $self->classify($contents_ref);
+    }
     # return cached array of content encodings for the given filename
     return $filecontents_cache{$filename};
+}
+# Given the known encoding for a file's contents, performs a textcat
+# filtering on the languages for the given encoding.
+#
+#   Input:  $contents_ref       - passed through to classify() as the text to classify
+#           $filename           - used to build the cache keys
+#           $filter_by_encoding - the known encoding, passed through to classify()
+#   Output: whatever classify() returned for this filename and encoding
+#
+# The result is computed ONCE and stored in the cache under two keys:
+# "$filename|$filter_by_encoding" and the plain $filename, so that subsequent
+# calls to either this method or classify_contents using the same filename
+# will not perform textcat again. Both cache entries hold the same value.
+sub classify_contents_for_encoding {
+    my ($self, $contents_ref, $filename, $filter_by_encoding)=@_;
+
+    my $cache_key = "$filename|$filter_by_encoding";
+
+    if (!defined $filecontents_cache{$cache_key})
+    {
+        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
+            $self->clear_filecontents_cache();
+        }
+
+        # textcat is run a single time; the one result fills both cache slots
+        my $results = $self->classify($contents_ref, $filter_by_encoding);
+        $filecontents_cache{$cache_key} = $results;
+
+        # store this in cache again under $filename entry, so that subsequent
+        # calls to classify_contents will find it in the cache already
+        $filecontents_cache{$filename} = $results;
+    }
+
+    # Return the entry for THIS encoding: the plain $filename entry may have
+    # been overwritten since by a call that used a different encoding filter.
+    return $filecontents_cache{$cache_key};
+}
+# This method returns the most frequently occurring encoding
+# but only if any encoding occurs more than once in the given results.
+# Otherwise, "" is returned.
+#
+#   Input:  $results - reference to an array of language-model names of the
+#                      form language-encoding, like en-iso8859_1
+#   Output: the encoding named by the largest number of results, or "" when
+#           no encoding is named more than once
+#
+# Results that are not of the form language-encoding are ignored. When several
+# encodings share the highest frequency, the one that appears first in
+# $results is returned, so the answer does not depend on hash ordering.
+sub most_frequent_encoding {
+    my ($self, $results) = @_;
+
+    # %frequency_of is a hashmap of Encoding -> Frequency pairs;
+    # @seen_order lists the encodings in order of first appearance
+    my %frequency_of = ();
+    my @seen_order = ();
+
+    foreach my $result (@$results) {
+        # Get the encoding portion of a language-model filename like en-iso8859_1
+        my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
+
+        # no encoding portion could be extracted: this result casts no vote
+        next unless defined $encoding;
+
+        push (@seen_order, $encoding) unless defined $frequency_of{$encoding};
+        $frequency_of{$encoding}++;
+    }
+
+    # Starting the bar at 1 means only an encoding that occurs more than once
+    # can replace the default of "". The strict comparison keeps the earliest
+    # encoding when frequencies are equal.
+    my $best_encoding = "";
+    my $best_frequency = 1;
+    foreach my $enc (@seen_order) {
+        if ($frequency_of{$enc} > $best_frequency) {
+            $best_encoding = $enc;
+            $best_frequency = $frequency_of{$enc};
+        }
+    }
+
+    return $best_encoding;
+}
+# Setter for the tunable member variables. Arguments are positional:
+# (opt_f, opt_u, opt_t, max_cache_size). An argument that is undefined
+# leaves the corresponding member variable as it was.
+sub set_opts {
+    my $self = shift (@_);
+    my ($freq, $factor, $top, $cache_limit) = @_;
+
+    defined $freq        and $self->{'opt_f'} = $freq;
+    defined $factor      and $self->{'opt_u'} = $factor;
+    defined $top         and $self->{'opt_t'} = $top;
+    defined $cache_limit and $self->{'max_cache_size'} = $cache_limit;
+}
+# Getter for the tunable member variables. Returns the list
+# (opt_f, opt_u, opt_t, max_cache_size), i.e. the same order in which
+# set_opts accepts them.
+sub get_opts {
+    my ($self) = @_;
+    return @{$self}{'opt_f', 'opt_u', 'opt_t', 'max_cache_size'};
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 17214

Legend:

gsdl/trunk/perllib/textcat.pm

Download in other formats: