Ignore:
Timestamp:
2008-09-08T13:41:13+12:00 (16 years ago)
Author:
ak19
Message:

Significant changes to subroutine get_language_encoding to better work out the encoding and language of a file's contents. In those cases where the encoding of a text's contents is already known, performs a textcat restricted on the known encoding. Corresponding changes have been made to textcat.pm

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r17099 r17213  
    296296}
    297297
     298
    298299# Uses textcat to work out the encoding and language of the text in
    299300# $filename. All html tags are removed before processing.
     
    307308    my $best_encoding = "";
    308309   
     310
    309311    # read in file
    310312    if (!open (FILE, $filename)) {
     
    356358    elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {           
    357359        $best_encoding = $1;
    358 #       print STDERR "**** meta tag found, encoding is: $best_encoding\n";
    359360    }
    360361    if ($best_encoding) { # we extracted an encoding
     
    371372    }
    372373
    373     # get the language/encoding
    374     $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
    375 #    my $results = $self->{'textcat'}->classify(\$text);
    376     my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename);
    377 
    378     # if textcat returns 3 or less possibilities we'll use the
    379     # first one in the list - otherwise use the defaults
    380     if (scalar @$results > 3) {
    381     if ($unicode_format) { # in case the first had a BOM
    382         $best_encoding=$unicode_format;
    383     } else {
    384         my %guessed_encodings = ();
    385         foreach my $result (@$results) {
    386         $result =~ /([^\-]+)$/;
    387         my $enc=$1;
    388         if (!defined($guessed_encodings{$enc})) {
    389             $guessed_encodings{$enc}=0;
     374    # don't need to do textcat if we know the encoding now AND don't need to extract language
     375    if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
     376    $best_language = $self->{'default_language'};
     377    }
     378
     379    else { # need to use textcat to get either the language, or get both language and encoding
     380    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
     381   
     382    if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
     383        my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
     384       
     385        my $language;
     386        ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0);
     387
     388        if (!defined $language || scalar @$results > 3) {
     389        # if there were too many results even when restricting results by encoding,
     390        # or if there were no results, use default language with the known encoding
     391        $best_language = $self->use_default_language($filename);
     392        }
     393        else { # fewer than 3 results means textcat is more certain, use the first result
     394        $best_language = $language;     
     395        }
     396    }
     397    else { # don't know encoding or language yet, therefore we use textcat
     398        my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
     399       
     400        # if textcat returns 3 or less possibilities we'll use the first one in the list
     401        if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
     402        my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/;
     403
     404        $language = $self->use_default_language($filename) unless defined $language;
     405        $encoding = $self->use_default_encoding($filename) unless defined $encoding;
     406
     407        $best_language = $language;
     408        $best_encoding = $encoding;
     409        }
     410        else { # if (scalar @$results > 3) {
     411        if ($unicode_format) { # in case the first had a BOM
     412            $best_encoding=$unicode_format;
    390413        }
    391         $guessed_encodings{$enc}++;
    392         }
    393 
    394         $guessed_encodings{""}=-1; # for default best_encoding of ""
    395         foreach my $enc (keys %guessed_encodings) {
    396         if ($guessed_encodings{$enc} >
    397             $guessed_encodings{$best_encoding}){
    398             $best_encoding=$enc;
     414        else {
     415            # Find the most frequent encoding in the textcat results returned
     416            # Returns "" if there's no encoding more frequent than another
     417            $best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
     418        }
     419       
     420        if ($best_encoding eq "") { # encoding still not set, use defaults
     421            $best_language = $self->use_default_language($filename);
     422            $best_encoding = $self->use_default_encoding($filename);
     423        }
     424        elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
     425            $best_language = $self->use_default_language($filename);
     426        }
     427        else { # textcat again using the most frequent encoding or the $unicode_format set above
     428            $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
     429            my $language;
     430            ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0);
     431            if (!defined $language || scalar @$results > 3) {
     432            # if no result or too many results, use default language for the encoding previously found
     433            $best_language = $self->use_default_language($filename);
     434            }
     435            else { # fewer than 3 results, use the language of the first result
     436            $best_language = $language;
     437            }
    399438        }
    400439        }
    401440    }
    402 
    403     if ($self->{'input_encoding'} ne 'auto') {
    404         if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
    405         gsprintf($outhandle,
    406              "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
    407              $filename, $self->{'default_language'});
    408         }       
    409         $best_language = $self->{'default_language'};
    410         if (!$found_html_encoding) {
    411         $best_encoding = $self->{'input_encoding'};
    412         }
    413 
    414     } else {
    415         if ($self->{'verbosity'}>2) {
    416         gsprintf($outhandle,
    417              "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
    418              $filename, $self->{'default_language'});
    419         }
    420         $best_language = $self->{'default_language'};
    421     }
    422     } else { # <= 3 suggestions
    423     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    424     if (!defined $language) {
    425         if ($self->{'verbosity'}>2) {
    426         gsprintf($outhandle,
    427             "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
    428             $filename, $self->{'default_language'});
    429         }
    430         $language = $self->{'default_language'};
    431     }
    432     if (!defined $encoding) {
    433         if ($self->{'verbosity'}>2) {
    434         gsprintf($outhandle,
    435             "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
    436             $filename, $self->{'default_encoding'});
    437         }
    438         $encoding = $self->{'default_encoding'};
    439     }
    440     $best_language = $language;
    441     if (! $best_encoding ) { # may already be set... eg from html meta tag
    442         $best_encoding = $encoding;
    443     }
    444     }
     441    }
     442
     443    if($best_encoding eq "" || $best_language eq "") {
     444    print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
     445    $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
     446    $best_language = $self->use_default_language($filename) if $best_language eq "";
     447    }
     448#    print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
     449
    445450
    446451    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
     
    480485
    481486
     487sub use_default_language {
     488    my $self = shift (@_);
     489    my ($filename) = @_;
     490
     491    if ($self->{'verbosity'}>2) {
     492    gsprintf($self->{'outhandle'},
     493         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
     494         $filename, $self->{'default_language'});
     495    }
     496    return $self->{'default_language'};
     497}
     498
     499sub use_default_encoding {
     500    my $self = shift (@_);
     501    my ($filename) = @_;
     502
     503    if ($self->{'verbosity'}>2) {
     504    gsprintf($self->{'outhandle'},
     505         "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
     506         $filename, $self->{'default_encoding'});
     507    }
     508    return $self->{'default_encoding'};
     509}
    482510
    483511# Overridden by exploding plugins (eg. ISISPlug)
Note: See TracChangeset for help on using the changeset viewer.