Changeset 17213

Show
Ignore:
Timestamp:
08.09.2008 13:41:13 (11 years ago)
Author:
ak19
Message:

Significant changes to subroutine get_language_encoding to better work out the encoding and language of a file's contents. In those cases where the encoding of a text's contents is already known, performs a textcat restricted on the known encoding. Corresponding changes have been made to textcat.pm

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r17099 r17213  
    296296} 
    297297 
     298 
    298299# Uses textcat to work out the encoding and language of the text in  
    299300# $filename. All html tags are removed before processing. 
     
    307308    my $best_encoding = ""; 
    308309     
     310 
    309311    # read in file 
    310312    if (!open (FILE, $filename)) { 
     
    356358    elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {             
    357359        $best_encoding = $1; 
    358 #       print STDERR "**** meta tag found, encoding is: $best_encoding\n"; 
    359360    } 
    360361    if ($best_encoding) { # we extracted an encoding 
     
    371372    } 
    372373 
    373     # get the language/encoding 
    374     $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 
    375 #    my $results = $self->{'textcat'}->classify(\$text); 
    376     my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename); 
    377  
    378     # if textcat returns 3 or less possibilities we'll use the  
    379     # first one in the list - otherwise use the defaults 
    380     if (scalar @$results > 3) { 
    381     if ($unicode_format) { # in case the first had a BOM 
    382         $best_encoding=$unicode_format; 
    383     } else { 
    384         my %guessed_encodings = (); 
    385         foreach my $result (@$results) { 
    386         $result =~ /([^\-]+)$/; 
    387         my $enc=$1; 
    388         if (!defined($guessed_encodings{$enc})) { 
    389             $guessed_encodings{$enc}=0; 
     374    # don't need to do textcat if we know the encoding now AND don't need to extract language 
     375    if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file 
     376    $best_language = $self->{'default_language'}; 
     377    } 
     378 
     379    else { # need to use textcat to get either the language, or get both language and encoding 
     380    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 
     381     
     382    if($found_html_encoding) { # know encoding, find language by limiting search to known encoding 
     383        my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 
     384         
     385        my $language; 
     386        ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0); 
     387 
     388        if (!defined $language || scalar @$results > 3) { 
     389        # if there were too many results even when restricting results by encoding, 
     390        # or if there were no results, use default language with the known encoding 
     391        $best_language = $self->use_default_language($filename); 
     392        }  
     393        else { # fewer than 3 results means textcat is more certain, use the first result 
     394        $best_language = $language;      
     395        }  
     396    } 
     397    else { # don't know encoding or language yet, therefore we use textcat 
     398        my $results = $self->{'textcat'}->classify_contents(\$text, $filename); 
     399         
     400        # if textcat returns 3 or less possibilities we'll use the first one in the list 
     401        if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding 
     402        my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/; 
     403 
     404        $language = $self->use_default_language($filename) unless defined $language; 
     405        $encoding = $self->use_default_encoding($filename) unless defined $encoding; 
     406 
     407        $best_language = $language; 
     408        $best_encoding = $encoding; 
     409        } 
     410        else { # if (scalar @$results > 3) { 
     411        if ($unicode_format) { # in case the first had a BOM 
     412            $best_encoding=$unicode_format; 
    390413        } 
    391         $guessed_encodings{$enc}++; 
    392         } 
    393  
    394         $guessed_encodings{""}=-1; # for default best_encoding of "" 
    395         foreach my $enc (keys %guessed_encodings) { 
    396         if ($guessed_encodings{$enc} > 
    397             $guessed_encodings{$best_encoding}){ 
    398             $best_encoding=$enc; 
     414        else { 
     415            # Find the most frequent encoding in the textcat results returned 
     416            # Returns "" if there's no encoding more frequent than another 
     417            $best_encoding = $self->{'textcat'}->most_frequent_encoding($results); 
     418        } 
     419         
     420        if ($best_encoding eq "") { # encoding still not set, use defaults 
     421            $best_language = $self->use_default_language($filename); 
     422            $best_encoding = $self->use_default_encoding($filename); 
     423        } 
     424        elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language 
     425            $best_language = $self->use_default_language($filename); 
     426        } 
     427        else { # textcat again using the most frequent encoding or the $unicode_format set above 
     428            $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 
     429            my $language; 
     430            ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0); 
     431            if (!defined $language || scalar @$results > 3) {  
     432            # if no result or too many results, use default language for the encoding previously found 
     433            $best_language = $self->use_default_language($filename); 
     434            } 
     435            else { # fewer than 3 results, use the language of the first result 
     436            $best_language = $language; 
     437            } 
    399438        } 
    400439        } 
    401440    } 
    402  
    403     if ($self->{'input_encoding'} ne 'auto') { 
    404         if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) { 
    405         gsprintf($outhandle, 
    406              "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    407              $filename, $self->{'default_language'}); 
    408         }        
    409         $best_language = $self->{'default_language'}; 
    410         if (!$found_html_encoding) { 
    411         $best_encoding = $self->{'input_encoding'}; 
    412         } 
    413  
    414     } else { 
    415         if ($self->{'verbosity'}>2) { 
    416         gsprintf($outhandle, 
    417              "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    418              $filename, $self->{'default_language'}); 
    419         } 
    420         $best_language = $self->{'default_language'}; 
    421     } 
    422     } else { # <= 3 suggestions 
    423     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 
    424     if (!defined $language) { 
    425         if ($self->{'verbosity'}>2) { 
    426         gsprintf($outhandle, 
    427             "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    428             $filename, $self->{'default_language'}); 
    429         } 
    430         $language = $self->{'default_language'}; 
    431     } 
    432     if (!defined $encoding) { 
    433         if ($self->{'verbosity'}>2) { 
    434         gsprintf($outhandle, 
    435             "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 
    436             $filename, $self->{'default_encoding'}); 
    437         } 
    438         $encoding = $self->{'default_encoding'}; 
    439     } 
    440     $best_language = $language; 
    441     if (! $best_encoding ) { # may already be set... eg from html meta tag 
    442         $best_encoding = $encoding; 
    443     } 
    444     } 
     441    } 
     442 
     443    if($best_encoding eq "" || $best_language eq "") { 
     444    print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n"; 
     445    $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq ""; 
     446    $best_language = $self->use_default_language($filename) if $best_language eq ""; 
     447    } 
     448#    print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n"; 
     449 
    445450 
    446451    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) { 
     
    480485 
    481486 
     487sub use_default_language { 
     488    my $self = shift (@_); 
     489    my ($filename) = @_; 
     490 
     491    if ($self->{'verbosity'}>2) { 
     492    gsprintf($self->{'outhandle'}, 
     493         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
     494         $filename, $self->{'default_language'}); 
     495    } 
     496    return $self->{'default_language'}; 
     497} 
     498 
     499sub use_default_encoding { 
     500    my $self = shift (@_); 
     501    my ($filename) = @_; 
     502 
     503    if ($self->{'verbosity'}>2) { 
     504    gsprintf($self->{'outhandle'}, 
     505         "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 
     506         $filename, $self->{'default_encoding'}); 
     507    } 
     508    return $self->{'default_encoding'}; 
     509} 
    482510 
    483511# Overridden by exploding plugins (eg. ISISPlug)