Changeset 16557

Show
Ignore:
Timestamp:
25.07.2008 18:31:50 (11 years ago)
Author:
ak19
Message:

Auto filename encoding now has several additional settings; these are handled by the subroutine filepath_to_utf8, which has changed accordingly. Some additional helper subroutines have been added. This version of BasePlugin.pm is intermediate but working: it still contains many debug output statements (most of them commented out), which I want to retain until I have tested the changes on Windows.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16520 r16557  
    3535use encodings; 
    3636use unicode; 
     37use textcat; 
    3738use doc; 
    3839eval "require diagnostics"; # some perl distros (eg mac) don't have this 
     
    5354      { 'name' => "unicode", 
    5455    'desc' => "{BasePlugin.encoding.unicode}" } ]; 
     56       
    5557 
    5658my $e = $encodings::encodings; 
     
    6668our $encoding_plus_auto_list =  
    6769    [ { 'name' => "auto", 
    68     'desc' => "{BasePlugin.filename_encoding.auto}" } ]; 
     70    'desc' => "{BasePlugin.filename_encoding.auto}" }, 
     71       { 'name' => "auto-language-analysis", 
     72    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat 
     73      { 'name' => "auto-filesystem-encoding", 
     74    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale 
     75      { 'name' => "auto-fl", 
     76    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat 
     77      { 'name' => "auto-lf", 
     78    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # textcat followed by locale  
    6979 
    7080push(@{$encoding_plus_auto_list},@{$encoding_list}); 
     
    376386    my $filemeta = $file; 
    377387 
    378     my $filename_encoding = $self->{'filename_encoding'}; 
    379     if ($filename_encoding eq "auto") { 
    380     # we check the locale first 
    381     if (!defined $self->{'filesystem_encoding'}) { 
    382         $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 
    383         $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'}; 
    384     } 
    385     if ($self->{'filesystem_encoding'} ne "undefined") { 
    386         $filename_encoding = $self->{'filesystem_encoding'}; 
    387     } else { 
    388         # try the encoding of the document, if available 
    389         if (defined $file_encoding) { 
    390         $filename_encoding = $file_encoding; 
    391         } else { 
    392         # use utf8 
    393         $filename_encoding = "utf8"; 
    394         } 
    395     } 
     388    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting 
     389 
     390##  print STDERR "**** User chose filename encoding setting: $filename_encoding\n"; 
     391     
     392    # Whenever filename-encoding is set to any of the auto settings, we 
     393    # check if the filename is already in UTF8. If it is, then we're done. 
     394    if($filename_encoding =~ m/auto/) { 
     395    if(&unicode::check_is_utf8($filemeta))  
     396    { 
     397##      print STDERR "**** It is already UTF8\n"; 
     398        $filename_encoding = "utf8"; 
     399        return $filemeta; 
     400    }  
     401    } 
     402     
     403    # Auto setting, but filename is not utf8 
     404    if ($filename_encoding eq "auto")  
     405    { 
     406    # try textcat 
     407    $filename_encoding = $self->textcat_encoding($filemeta); 
    396408     
    397     } 
    398  
    399     if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) { 
    400     $filemeta = unicode::unicode2utf8( 
    401       unicode::convert2unicode($filename_encoding, \$filemeta) 
     409    # check the locale next 
     410    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 
     411     
     412 
     413    # now try the encoding of the document, if available 
     414    if ($filename_encoding eq "undefined" && defined $file_encoding) { 
     415        $filename_encoding = $file_encoding; 
     416    } 
     417 
     418    } 
     419 
     420    elsif ($filename_encoding eq "auto-language-analysis")  
     421    {    
     422    $filename_encoding = $self->textcat_encoding($filemeta); 
     423 
     424    # now try the encoding of the document, if available 
     425    if ($filename_encoding eq "undefined" && defined $file_encoding) { 
     426        $filename_encoding = $file_encoding; 
     427    }  
     428    } 
     429 
     430    elsif ($filename_encoding eq "auto-filesystem-encoding")  
     431    { 
     432    # try locale 
     433    $filename_encoding = $self->locale_encoding(); 
     434    } 
     435 
     436    elsif ($filename_encoding eq "auto-fl")  
     437    { 
     438    # filesystem-encoding (locale) then language-analysis (textcat) 
     439    $filename_encoding = $self->locale_encoding(); 
     440     
     441    # try textcat 
     442    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined"; 
     443         
     444    # else assume filename encoding is encoding of file content, if that's available 
     445    if ($filename_encoding eq "undefined" && defined $file_encoding) { 
     446        $filename_encoding = $file_encoding; 
     447    } 
     448    } 
     449     
     450    elsif ($filename_encoding eq "auto-lf")  
     451    { 
     452    # language-analysis (textcat) then filesystem-encoding (locale) 
     453    $filename_encoding = $self->textcat_encoding($filemeta); 
     454     
     455    # guess filename encoding from encoding of file content, if available 
     456    if ($filename_encoding eq "undefined" && defined $file_encoding) { 
     457        $filename_encoding = $file_encoding; 
     458    } 
     459 
     460    # try locale 
     461    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 
     462    } 
     463 
     464##    print STDERR "**** filename_encoding selected: $filename_encoding \n"; 
     465         
     466    # if still undefined, use utf8 as fallback 
     467    if ($filename_encoding eq "undefined") { 
     468    $filename_encoding = "utf8"; 
     469    } 
     470 
     471    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when 
     472    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to 
     473    # be always utf8 (in which case the filename's encoding is also set as utf8 even  
     474    # though the filename need not be if it originates from another system)--in such 
     475    # cases attempt to make the filename utf8 to match. 
     476    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) { 
     477##  print STDERR "**** BEFORE utf8 conversion: $filemeta\n"; 
     478    &unicode::ensure_utf8(\$filemeta); 
     479##  print STDERR "**** AFTER utf8 conversion: $filemeta\n"; 
     480    } 
     481 
     482 
     483    # convert non-unicode encodings to utf8 
     484    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) { 
     485    $filemeta = &unicode::unicode2utf8( 
     486      &unicode::convert2unicode($filename_encoding, \$filemeta) 
    402487    ); 
    403488    } 
     489 
     490    print "*** filename encoding found: $filename_encoding\n"; 
     491    print "*** utf8 encoded filename: $filemeta\n"; 
    404492 
    405493    return $filemeta; 
     
    424512} 
    425513 
    426  
     514sub locale_encoding { 
     515    my $self = shift(@_); 
     516     
     517    if (!defined $self->{'filesystem_encoding'}) { 
     518    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 
     519    $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'}; 
     520    } 
     521 
     522    print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 
     523    return $self->{'filesystem_encoding'}; # can be the string "undefined" 
     524} 
     525 
     526sub textcat_encoding { 
     527    my $self = shift(@_); 
     528    my ($filemeta) = @_; 
     529 
     530    # analyse filenames without extensions and digits (and trimmed of surrounding  
     531    # whitespace), so that irrelevant chars don't confuse textcat 
     532    my $strictfilemeta = $filemeta; 
     533    $strictfilemeta =~ s/\.[^\.]+$//g; 
     534    $strictfilemeta =~ s/\d//g; 
     535    $strictfilemeta =~ s/^\s*//g; 
     536    $strictfilemeta =~ s/\s*$//g; 
     537     
     538##    print STDERR "**** strict filename is |$strictfilemeta|\n"; 
     539    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta); 
     540    if(!defined $filename_encoding) { 
     541    $filename_encoding = "undefined"; 
     542    } 
     543 
     544##    print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n"; 
     545    return $filename_encoding; # can be the string "undefined" 
     546} 
     547 
     548# performs textcat 
     549sub encoding_from_language_analysis { 
     550    my $self = shift(@_); 
     551    my ($text) = @_; 
     552 
     553    my $outhandle = $self->{'outhandle'}; 
     554    my $best_encoding = undef; 
     555    
     556    # get the language/encoding of the file using textcat 
     557    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'}); 
     558    #my $results = $self->{'textcat'}->classify(\$text); 
     559    my $results = $self->{'textcat'}->classify_cached(\$text); 
     560 
     561 
     562    if (scalar @$results < 0) {  
     563    print STDERR "**** Textcat returned 0 results\n"; 
     564    return undef; 
     565    } 
     566     
     567    print STDERR "**** TEXTCAT RESULTS for $text: "; 
     568    print STDERR join(",", @$results); 
     569    print STDERR "\n"; 
     570 
     571    # We have some results, we choose the first 
     572    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 
     573     
     574    $best_encoding = $encoding; 
     575    if (!defined $best_encoding) { 
     576##  print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n"; 
     577    return undef; 
     578    }  
     579         
     580    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 
     581    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) 
     582##  print STDERR "*** Filename turns out to be UTF8\n"; 
     583    $best_encoding = 'utf8'; 
     584    } 
     585     
     586     
     587    # check for equivalents where textcat doesn't have some encodings... 
     588    # eg MS versions of standard encodings 
     589    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) { 
     590##  print STDERR "**** best_encoding is ISO_8859: $best_encoding\n"; 
     591 
     592    my $iso = $1; # which variant of the iso standard? 
     593    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 
     594    if ($text =~ /[\x80-\x9f]/) { 
     595##      print STDERR "**** best_encoding is some windows value: $best_encoding\n"; 
     596        # Western Europe 
     597        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 
     598        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe 
     599        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic 
     600        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic 
     601        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek 
     602        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 
     603        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 
     604##      print STDERR "**** best_encoding windows value: $best_encoding\n"; 
     605    } 
     606    } 
     607     
     608    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ && 
     609    !defined $encodings::encodings->{$best_encoding})  
     610    { 
     611    if ($self->{'verbosity'}) {  
     612        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef"); 
     613    } 
     614##  print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n";   
     615    $best_encoding = undef; 
     616    } 
     617##    print STDERR "**** language: $language\n" if defined $language; 
     618##    print STDERR "**** encoding: $best_encoding\n" if defined $encoding; 
     619     
     620    return $best_encoding; 
     621} 
     622 
     623# uses locale 
    427624sub get_filesystem_encoding { 
    428625