Changeset 16557


Ignore:
Timestamp:
07/25/08 18:31:50 (12 years ago)
Author:
ak19
Message:

Auto filename encoding has several additional settings now, these are handled by subroutine filepath_to_utf8 which has changed accordingly. Some additional helper subroutines added. This file BasePlugin.pm is an intermediate but working version (still has many debug output statements even when most are commented out, but as I want to test the changes out on Windows first, I want to retain the debug statements).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasePlugin.pm

    r16520 r16557  
    3535use encodings;
    3636use unicode;
     37use textcat;
    3738use doc;
    3839eval "require diagnostics"; # some perl distros (eg mac) don't have this
     
    5354      { 'name' => "unicode",
    5455    'desc' => "{BasePlugin.encoding.unicode}" } ];
     56     
    5557
    5658my $e = $encodings::encodings;
     
    6668our $encoding_plus_auto_list =
    6769    [ { 'name' => "auto",
    68     'desc' => "{BasePlugin.filename_encoding.auto}" } ];
     70    'desc' => "{BasePlugin.filename_encoding.auto}" },
     71       { 'name' => "auto-language-analysis",
     72    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
     73      { 'name' => "auto-filesystem-encoding",
     74    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
     75      { 'name' => "auto-fl",
     76    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
     77      { 'name' => "auto-lf",
     78    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
    6979
    7080push(@{$encoding_plus_auto_list},@{$encoding_list});
     
    376386    my $filemeta = $file;
    377387
    378     my $filename_encoding = $self->{'filename_encoding'};
    379     if ($filename_encoding eq "auto") {
    380     # we check the locale first
    381     if (!defined $self->{'filesystem_encoding'}) {
    382         $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
    383         $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
    384     }
    385     if ($self->{'filesystem_encoding'} ne "undefined") {
    386         $filename_encoding = $self->{'filesystem_encoding'};
    387     } else {
    388         # try the encoding of the document, if available
    389         if (defined $file_encoding) {
    390         $filename_encoding = $file_encoding;
    391         } else {
    392         # use utf8
    393         $filename_encoding = "utf8";
    394         }
    395     }
     388    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
     389
     390##  print STDERR "**** User chose filename encoding setting: $filename_encoding\n";
     391   
     392    # Whenever filename-encoding is set to any of the auto settings, we
     393    # check if the filename is already in UTF8. If it is, then we're done.
     394    if($filename_encoding =~ m/auto/) {
     395    if(&unicode::check_is_utf8($filemeta))
     396    {
     397##      print STDERR "**** It is already UTF8\n";
     398        $filename_encoding = "utf8";
     399        return $filemeta;
     400    }
     401    }
     402   
     403    # Auto setting, but filename is not utf8
     404    if ($filename_encoding eq "auto")
     405    {
     406    # try textcat
     407    $filename_encoding = $self->textcat_encoding($filemeta);
    396408   
    397     }
    398 
    399     if ($filename_encoding !~ /(?:ascii|utf8|unicode)/) {
    400     $filemeta = unicode::unicode2utf8(
    401       unicode::convert2unicode($filename_encoding, \$filemeta)
     409    # check the locale next
     410    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
     411   
     412
     413    # now try the encoding of the document, if available
     414    if ($filename_encoding eq "undefined" && defined $file_encoding) {
     415        $filename_encoding = $file_encoding;
     416    }
     417
     418    }
     419
     420    elsif ($filename_encoding eq "auto-language-analysis")
     421    {   
     422    $filename_encoding = $self->textcat_encoding($filemeta);
     423
     424    # now try the encoding of the document, if available
     425    if ($filename_encoding eq "undefined" && defined $file_encoding) {
     426        $filename_encoding = $file_encoding;
     427    }
     428    }
     429
     430    elsif ($filename_encoding eq "auto-filesystem-encoding")
     431    {
     432    # try locale
     433    $filename_encoding = $self->locale_encoding();
     434    }
     435
     436    elsif ($filename_encoding eq "auto-fl")
     437    {
     438    # filesystem-encoding (locale) then language-analysis (textcat)
     439    $filename_encoding = $self->locale_encoding();
     440   
     441    # try textcat
     442    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
     443       
     444    # else assume filename encoding is encoding of file content, if that's available
     445    if ($filename_encoding eq "undefined" && defined $file_encoding) {
     446        $filename_encoding = $file_encoding;
     447    }
     448    }
     449   
     450    elsif ($filename_encoding eq "auto-lf")
     451    {
     452    # language-analysis (textcat) then filesystem-encoding (locale)
     453    $filename_encoding = $self->textcat_encoding($filemeta);
     454   
     455    # guess filename encoding from encoding of file content, if available
     456    if ($filename_encoding eq "undefined" && defined $file_encoding) {
     457        $filename_encoding = $file_encoding;
     458    }
     459
     460    # try locale
     461    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
     462    }
     463
     464##    print STDERR "**** filename_encoding selected: $filename_encoding \n";
     465       
     466    # if still undefined, use utf8 as fallback
     467    if ($filename_encoding eq "undefined") {
     468    $filename_encoding = "utf8";
     469    }
     470
     471    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
     472    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
     473    # be always utf8 (in which case the filename's encoding is also set as utf8 even
     474    # though the filename need not be if it originates from another system)--in such
     475    # cases attempt to make the filename utf8 to match.
     476    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
     477##  print STDERR "**** BEFORE utf8 conversion: $filemeta\n";
     478    &unicode::ensure_utf8(\$filemeta);
     479##  print STDERR "**** AFTER utf8 conversion: $filemeta\n";
     480    }
     481
     482
     483    # convert non-unicode encodings to utf8
     484    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
     485    $filemeta = &unicode::unicode2utf8(
     486      &unicode::convert2unicode($filename_encoding, \$filemeta)
    402487    );
    403488    }
     489
     490    print "*** filename encoding found: $filename_encoding\n";
     491    print "*** utf8 encoded filename: $filemeta\n";
    404492
    405493    return $filemeta;
     
    424512}
    425513
    426 
     514sub locale_encoding {
     515    my $self = shift(@_);
     516   
     517    if (!defined $self->{'filesystem_encoding'}) {
     518    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
     519    $self->{'filesystem_encoding'} = "undefined" if !defined $self->{'filesystem_encoding'};
     520    }
     521
     522    print "filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
     523    return $self->{'filesystem_encoding'}; # can be the string "undefined"
     524}
     525
     526sub textcat_encoding {
     527    my $self = shift(@_);
     528    my ($filemeta) = @_;
     529
     530    # analyse filenames without extensions and digits (and trimmed of surrounding
     531    # whitespace), so that irrelevant chars don't confuse textcat
     532    my $strictfilemeta = $filemeta;
     533    $strictfilemeta =~ s/\.[^\.]+$//g;
     534    $strictfilemeta =~ s/\d//g;
     535    $strictfilemeta =~ s/^\s*//g;
     536    $strictfilemeta =~ s/\s*$//g;
     537   
     538##    print STDERR "**** strict filename is |$strictfilemeta|\n";
     539    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
     540    if(!defined $filename_encoding) {
     541    $filename_encoding = "undefined";
     542    }
     543
     544##    print STDERR "**** textcat found filename encoding: " . $file_textcat_encoding_map{$strictfilemeta} . "\n";
     545    return $filename_encoding; # can be the string "undefined"
     546}
     547
     548# performs textcat
     549sub encoding_from_language_analysis {
     550    my $self = shift(@_);
     551    my ($text) = @_;
     552
     553    my $outhandle = $self->{'outhandle'};
     554    my $best_encoding = undef;
     555   
     556    # get the language/encoding of the file using textcat
     557    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
     558    #my $results = $self->{'textcat'}->classify(\$text);
     559    my $results = $self->{'textcat'}->classify_cached(\$text);
     560
     561
     562    if (scalar @$results < 0) {
     563    print STDERR "**** Textcat returned 0 results\n";
     564    return undef;
     565    }
     566   
     567    print STDERR "**** TEXTCAT RESULTS for $text: ";
     568    print STDERR join(",", @$results);
     569    print STDERR "\n";
     570
     571    # We have some results, we choose the first
     572    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
     573   
     574    $best_encoding = $encoding;
     575    if (!defined $best_encoding) {
     576##  print STDERR "**** Textcat cannot determine encoding of filename: it's undefined.\n";
     577    return undef;
     578    }
     579       
     580    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
     581    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
     582##  print STDERR "*** Filename turns out to be UTF8\n";
     583    $best_encoding = 'utf8';
     584    }
     585   
     586   
     587    # check for equivalents where textcat doesn't have some encodings...
     588    # eg MS versions of standard encodings
     589    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
     590##  print STDERR "**** best_encoding is ISO_8859: $best_encoding\n";
     591
     592    my $iso = $1; # which variant of the iso standard?
     593    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
     594    if ($text =~ /[\x80-\x9f]/) {
     595##      print STDERR "**** best_encoding is some windows value: $best_encoding\n";
     596        # Western Europe
     597        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
     598        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
     599        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
     600        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
     601        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
     602        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
     603        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
     604##      print STDERR "**** best_encoding windows value: $best_encoding\n";
     605    }
     606    }
     607   
     608    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
     609    !defined $encodings::encodings->{$best_encoding})
     610    {
     611    if ($self->{'verbosity'}) {
     612        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
     613    }
     614##  print STDERR "***** unsupported encoding: $best_encoding. Setting it to undefined.\n"; 
     615    $best_encoding = undef;
     616    }
     617##    print STDERR "**** language: $language\n" if defined $language;
     618##    print STDERR "**** encoding: $best_encoding\n" if defined $encoding;
     619   
     620    return $best_encoding;
     621}
     622
     623# uses locale
    427624sub get_filesystem_encoding {
    428625
Note: See TracChangeset for help on using the changeset viewer.