greenstone.org greenstone wiki greenstone trac planet greenstone

Changeset 17213

Show
Ignore:
Timestamp:
2008-09-08 13:41:13 (3 months ago)
Author:
ak19
Message:

Significant changes to subroutine get_language_encoding to better work out the encoding and language of a file's contents. In those cases where the encoding of a text's contents is already known, performs a textcat restricted on the known encoding. Corresponding changes have been made to textcat.pm

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r17099 r17213  
    296296} 
    297297 
     298 
    298299# Uses textcat to work out the encoding and language of the text in  
    299300# $filename. All html tags are removed before processing. 
     
    307308    my $best_encoding = ""; 
    308309     
     310 
    309311    # read in file 
    310312    if (!open (FILE, $filename)) { 
     
    356358        elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {                 
    357359            $best_encoding = $1; 
    358 #           print STDERR "**** meta tag found, encoding is: $best_encoding\n"; 
    359360        } 
    360361        if ($best_encoding) { # we extracted an encoding 
     
    371372    } 
    372373 
    373     # get the language/encoding 
    374     $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 
    375 #    my $results = $self->{'textcat'}->classify(\$text); 
    376     my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename); 
    377  
    378     # if textcat returns 3 or less possibilities we'll use the  
    379     # first one in the list - otherwise use the defaults 
    380     if (scalar @$results > 3) { 
    381         if ($unicode_format) { # in case the first had a BOM 
    382             $best_encoding=$unicode_format; 
    383         } else { 
    384             my %guessed_encodings = (); 
    385             foreach my $result (@$results) { 
    386                 $result =~ /([^\-]+)$/; 
    387                 my $enc=$1; 
    388                 if (!defined($guessed_encodings{$enc})) { 
    389                     $guessed_encodings{$enc}=0; 
     374    # don't need to do textcat if we know the encoding now AND don't need to extract language 
     375    if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file 
     376        $best_language = $self->{'default_language'}; 
     377    } 
     378 
     379    else { # need to use textcat to get either the language, or get both language and encoding 
     380        $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 
     381         
     382        if($found_html_encoding) { # know encoding, find language by limiting search to known encoding 
     383            my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 
     384             
     385            my $language; 
     386            ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0); 
     387 
     388            if (!defined $language || scalar @$results > 3) { 
     389                # if there were too many results even when restricting results by encoding, 
     390                # or if there were no results, use default language with the known encoding 
     391                $best_language = $self->use_default_language($filename); 
     392            }  
     393            else { # fewer than 3 results means textcat is more certain, use the first result 
     394                $best_language = $language;              
     395            }  
     396        } 
     397        else { # don't know encoding or language yet, therefore we use textcat 
     398            my $results = $self->{'textcat'}->classify_contents(\$text, $filename); 
     399             
     400            # if textcat returns 3 or less possibilities we'll use the first one in the list 
     401            if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding 
     402                my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/; 
     403 
     404                $language = $self->use_default_language($filename) unless defined $language; 
     405                $encoding = $self->use_default_encoding($filename) unless defined $encoding; 
     406 
     407                $best_language = $language; 
     408                $best_encoding = $encoding; 
     409            } 
     410            else { # if (scalar @$results > 3) { 
     411                if ($unicode_format) { # in case the first had a BOM 
     412                    $best_encoding=$unicode_format; 
    390413                } 
    391                 $guessed_encodings{$enc}++; 
    392             } 
    393  
    394             $guessed_encodings{""}=-1; # for default best_encoding of "" 
    395             foreach my $enc (keys %guessed_encodings) { 
    396                 if ($guessed_encodings{$enc} > 
    397                     $guessed_encodings{$best_encoding}){ 
    398                     $best_encoding=$enc; 
     414                else { 
     415                    # Find the most frequent encoding in the textcat results returned 
     416                    # Returns "" if there's no encoding more frequent than another 
     417                    $best_encoding = $self->{'textcat'}->most_frequent_encoding($results); 
     418                } 
     419                 
     420                if ($best_encoding eq "") { # encoding still not set, use defaults 
     421                    $best_language = $self->use_default_language($filename); 
     422                    $best_encoding = $self->use_default_encoding($filename); 
     423                } 
     424                elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language 
     425                    $best_language = $self->use_default_language($filename); 
     426                } 
     427                else { # textcat again using the most frequent encoding or the $unicode_format set above 
     428                    $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding); 
     429                    my $language; 
     430                    ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0); 
     431                    if (!defined $language || scalar @$results > 3) {  
     432                        # if no result or too many results, use default language for the encoding previously found 
     433                        $best_language = $self->use_default_language($filename); 
     434                    } 
     435                    else { # fewer than 3 results, use the language of the first result 
     436                        $best_language = $language; 
     437                    } 
    399438                } 
    400439            } 
    401440        } 
    402  
    403         if ($self->{'input_encoding'} ne 'auto') { 
    404             if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) { 
    405                 gsprintf($outhandle, 
    406                          "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    407                          $filename, $self->{'default_language'}); 
    408             }            
    409             $best_language = $self->{'default_language'}; 
    410             if (!$found_html_encoding) { 
    411                 $best_encoding = $self->{'input_encoding'}; 
    412             } 
    413  
    414         } else { 
    415             if ($self->{'verbosity'}>2) { 
    416                 gsprintf($outhandle, 
    417                          "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    418                          $filename, $self->{'default_language'}); 
    419             } 
    420             $best_language = $self->{'default_language'}; 
    421         } 
    422     } else { # <= 3 suggestions 
    423         my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 
    424         if (!defined $language) { 
    425             if ($self->{'verbosity'}>2) { 
    426                 gsprintf($outhandle, 
    427                     "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
    428                     $filename, $self->{'default_language'}); 
    429             } 
    430             $language = $self->{'default_language'}; 
    431         } 
    432         if (!defined $encoding) { 
    433             if ($self->{'verbosity'}>2) { 
    434                 gsprintf($outhandle, 
    435                     "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 
    436                     $filename, $self->{'default_encoding'}); 
    437             } 
    438             $encoding = $self->{'default_encoding'}; 
    439         } 
    440         $best_language = $language; 
    441         if (! $best_encoding ) { # may already be set... eg from html meta tag 
    442             $best_encoding = $encoding; 
    443         } 
    444     } 
     441    } 
     442 
     443    if($best_encoding eq "" || $best_language eq "") { 
     444        print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n"; 
     445        $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq ""; 
     446        $best_language = $self->use_default_language($filename) if $best_language eq ""; 
     447    } 
     448#    print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n"; 
     449 
    445450 
    446451    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) { 
     
    480485 
    481486 
     487sub use_default_language { 
     488    my $self = shift (@_); 
     489    my ($filename) = @_; 
     490 
     491    if ($self->{'verbosity'}>2) { 
     492        gsprintf($self->{'outhandle'}, 
     493                 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n", 
     494                 $filename, $self->{'default_language'}); 
     495    } 
     496    return $self->{'default_language'}; 
     497} 
     498 
     499sub use_default_encoding { 
     500    my $self = shift (@_); 
     501    my ($filename) = @_; 
     502 
     503    if ($self->{'verbosity'}>2) { 
     504        gsprintf($self->{'outhandle'}, 
     505                 "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n", 
     506                 $filename, $self->{'default_encoding'}); 
     507    } 
     508    return $self->{'default_encoding'}; 
     509} 
    482510 
    483511# Overridden by exploding plugins (eg. ISISPlug)