Ignore:
Timestamp:
2017-03-06T13:48:15+13:00 (7 years ago)
Author:
kjdon
Message:

baseplugin now inherits from EncodingUtil, and all its encoding methods have been moved to the new plugin

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r31445 r31457  
    4242use FileUtils;
    4343
    44 use PrintInfo;
     44use EncodingUtil;
    4545
    4646BEGIN {
    47     @BasePlugin::ISA = ( 'PrintInfo' );
     47    @BasePlugin::ISA = ( 'EncodingUtil' );
    4848}
    4949
     
    5959    'hiddengli' => "yes" } ];
    6060
    61 our $encoding_list =
    62     [ { 'name' => "ascii",
    63     'desc' => "{BasePlugin.encoding.ascii}" },
    64       { 'name' => "utf8",
    65     'desc' => "{BasePlugin.encoding.utf8}" },
    66       { 'name' => "unicode",
    67     'desc' => "{BasePlugin.encoding.unicode}" } ];
    68 
    69 
    70 my $e = $encodings::encodings;
    71 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
    72 {
    73     my $hashEncode =
    74     {'name' => $enc,
    75      'desc' => $e->{$enc}->{'name'}};
    76    
    77     push(@{$encoding_list},$hashEncode);
    78 }
    79 
    80 our $encoding_plus_auto_list =
    81     [ { 'name' => "auto",
    82     'desc' => "{BasePlugin.filename_encoding.auto}" },
    83       { 'name' => "auto-language-analysis",
    84     'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
    85       { 'name' => "auto-filesystem-encoding",
    86     'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
    87       { 'name' => "auto-fl",
    88     'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
    89       { 'name' => "auto-lf",
    90     'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
    91 
    92 push(@{$encoding_plus_auto_list},@{$encoding_list});
     61# here went encoding list stuff
    9362
    9463our $oidtype_list =
     
    160129    'type' => "flag",
    161130    'reqd' => "no" },
    162       { 'name' => "filename_encoding",
    163     'desc' => "{BasePlugin.filename_encoding}",
    164     'type' => "enum",
    165     'deft' => "auto",
    166     'list' => $encoding_plus_auto_list,
    167     'reqd' => "no" },
     131#      { 'name' => "filename_encoding",
     132#   'desc' => "{BasePlugin.filename_encoding}",
     133#   'type' => "enum",
     134#   'deft' => "auto",
     135#   'list' => $encoding_plus_auto_list,
     136#   'reqd' => "no" },
    168137      { 'name' => "smart_block",
    169138        'desc' => "{common.deprecated}. {BasePlugin.smart_block}",
     
    185154        'desc'     => "{BasePlugin.desc}",
    186155        'abstract' => "yes",
    187         'inherits' => "no",
     156        'inherits' => "yes",
    188157        'args'     => $arguments };
    189158
     
    197166    push(@{$hashArgOptLists->{"OptList"}},$options);
    198167
    199     my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
     168    my $self = new EncodingUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
    200169   
    201170    if ($self->{'info_only'}) {
     
    203172        return bless $self, $class;
    204173    }
    205 
    206174    if ($self->{'smart_block'}) {
    207175    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n";
     
    284252    my $self = shift (@_);
    285253    my ($verbosity, $outhandle, $failhandle) = @_;
    286 
    287     # verbosity is passed through from the processor
    288     $self->{'verbosity'} = $verbosity;
    289 
    290     # as are the outhandle and failhandle
    291     $self->{'outhandle'} = $outhandle if defined $outhandle;
    292     $self->{'failhandle'} = $failhandle;
    293 #    $self->SUPER::init(@_);
     254   
     255    $self->SUPER::init(@_);
    294256   
    295257    # set process_exp and block_exp to defaults unless they were
     
    550512    return undef;
    551513}
    552 
    553 
    554 # just converts path as is to utf8.
    555 sub filepath_to_utf8 {
    556     my $self = shift (@_); 
    557     my ($file, $file_encoding) = @_;
    558     my $filemeta = $file;
    559 
    560     my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
    561 
    562     # Whenever filename-encoding is set to any of the auto settings, we
    563     # check if the filename is already in UTF8. If it is, then we're done.
    564     if($filename_encoding =~ m/auto/) {
    565     if(&unicode::check_is_utf8($filemeta))
    566     {
    567         $filename_encoding = "utf8";
    568         return $filemeta;
    569     }
    570     }
    571    
    572     # Auto setting, but filename is not utf8
    573     if ($filename_encoding eq "auto")
    574     {
    575     # try textcat
    576     $filename_encoding = $self->textcat_encoding($filemeta);
    577    
    578     # check the locale next
    579     $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
    580    
    581    
    582     # now try the encoding of the document, if available
    583     if ($filename_encoding eq "undefined" && defined $file_encoding) {
    584         $filename_encoding = $file_encoding;
    585     }
    586 
    587     }
    588 
    589     elsif ($filename_encoding eq "auto-language-analysis")
    590     {   
    591     $filename_encoding = $self->textcat_encoding($filemeta);
    592 
    593     # now try the encoding of the document, if available
    594     if ($filename_encoding eq "undefined" && defined $file_encoding) {
    595         $filename_encoding = $file_encoding;
    596     }
    597     }
    598 
    599     elsif ($filename_encoding eq "auto-filesystem-encoding")
    600     {   
    601     # try locale
    602     $filename_encoding = $self->locale_encoding();
    603     }
    604 
    605     elsif ($filename_encoding eq "auto-fl")
    606     {
    607     # filesystem-encoding (locale) then language-analysis (textcat)
    608     $filename_encoding = $self->locale_encoding();
    609    
    610     # try textcat
    611     $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
    612    
    613     # else assume filename encoding is encoding of file content, if that's available
    614     if ($filename_encoding eq "undefined" && defined $file_encoding) {
    615         $filename_encoding = $file_encoding;
    616     }
    617     }
    618    
    619     elsif ($filename_encoding eq "auto-lf")
    620     {
    621     # language-analysis (textcat) then filesystem-encoding (locale)
    622     $filename_encoding = $self->textcat_encoding($filemeta);
    623    
    624     # guess filename encoding from encoding of file content, if available
    625     if ($filename_encoding eq "undefined" && defined $file_encoding) {
    626         $filename_encoding = $file_encoding;
    627     }
    628 
    629     # try locale
    630     $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
    631     }
    632    
    633     # if still undefined, use utf8 as fallback
    634     if ($filename_encoding eq "undefined") {
    635     $filename_encoding = "utf8";
    636     }
    637 
    638     #print STDERR "**** UTF8 encoding the filename $filemeta ";
    639    
    640     # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
    641     # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
    642     # be always utf8 (in which case the filename's encoding is also set as utf8 even
    643     # though the filename need not be if it originates from another system)--in such
    644     # cases attempt to make the filename utf8 to match.
    645     if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
    646     &unicode::ensure_utf8(\$filemeta);
    647     }
    648 
    649     # convert non-unicode encodings to utf8
    650     if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
    651     $filemeta = &unicode::unicode2utf8(
    652                        &unicode::convert2unicode($filename_encoding, \$filemeta)
    653                        );
    654     }
    655 
    656     #print STDERR " from encoding $filename_encoding -> $filemeta\n";
    657     return $filemeta;
    658 }
    659 
    660 # gets the filename with no path, converts to utf8, and then dm safes it.
    661 # filename_encoding set by user
    662 sub filename_to_utf8_metadata
    663 {
    664     my $self = shift (@_); 
    665     my ($file, $file_encoding) = @_;
    666 
    667     my $outhandle = $self->{'outhandle'};
    668 
    669     print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
    670     my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
    671     print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
    672 
    673     my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
    674     $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
    675 
    676     return $filemeta;
    677 }
    678 
    679 sub locale_encoding {
    680     my $self = shift(@_);
    681    
    682     if (!defined $self->{'filesystem_encoding'}) {
    683     $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
    684     }
    685 
    686     #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
    687     return $self->{'filesystem_encoding'}; # can be the string "undefined"
    688 }
    689 
    690 sub textcat_encoding {
    691     my $self = shift(@_);
    692     my ($filemeta) = @_;
    693 
    694     # analyse filenames without extensions and digits (and trimmed of
    695     # surrounding whitespace), so that irrelevant chars don't confuse
    696     # textcat
    697     my $strictfilemeta = $filemeta;
    698     $strictfilemeta =~ s/\.[^\.]+$//g;
    699     $strictfilemeta =~ s/\d//g;
    700     $strictfilemeta =~ s/^\s*//g;
    701     $strictfilemeta =~ s/\s*$//g;
    702    
    703     my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
    704     if(!defined $filename_encoding) {
    705     $filename_encoding = "undefined";
    706     }
    707 
    708     return $filename_encoding; # can be the string "undefined"
    709 }
    710 
    711 # performs textcat
    712 sub encoding_from_language_analysis {
    713     my $self = shift(@_);
    714     my ($text) = @_;
    715 
    716     my $outhandle = $self->{'outhandle'};
    717     my $best_encoding = undef;
    718    
    719     # get the language/encoding of the textstring using textcat
    720     require textcat;  # Only load the textcat module if it is required
    721     $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
    722     my $results = $self->{'textcat'}->classify_cached_filename(\$text);
    723 
    724 
    725     if (scalar @$results < 0) {
    726     return undef;
    727     }
    728    
    729     # We have some results, we choose the first
    730     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    731    
    732     $best_encoding = $encoding;
    733     if (!defined $best_encoding) {
    734     return undef;
    735     }
    736    
    737     if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
    738     # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
    739     $best_encoding = 'utf8';
    740     }
    741    
    742    
    743     # check for equivalents where textcat doesn't have some encodings...
    744     # eg MS versions of standard encodings
    745     if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
    746     my $iso = $1; # which variant of the iso standard?
    747     # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
    748     if ($text =~ /[\x80-\x9f]/) {
    749         # Western Europe
    750         if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
    751         elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
    752         elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
    753         elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
    754         elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
    755         elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
    756         elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
    757     }
    758     }
    759    
    760     if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
    761     !defined $encodings::encodings->{$best_encoding})
    762     {
    763     if ($self->{'verbosity'}) {
    764         gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
    765     }
    766     $best_encoding = undef;
    767     }
    768    
    769     return $best_encoding;
    770 }
    771 
    772 # uses locale
    773 sub get_filesystem_encoding
    774 {
    775 
    776     my $self = shift(@_);
    777 
    778     my $outhandle = $self->{'outhandle'};
    779     my $filesystem_encoding = undef;
    780 
    781     eval {
    782     # Works for Windows as well, returning the DOS code page in use
    783     use POSIX qw(locale_h);
    784    
    785     # With only one parameter, setlocale retrieves the
    786     # current value
    787     my $current_locale = setlocale(LC_CTYPE);
    788    
    789     my $char_encoding = undef;
    790     if ($current_locale =~ m/\./) {
    791         ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
    792         $char_encoding = lc($char_encoding);
    793     }
    794     else {
    795         if ($current_locale =~ m/^(posix|c)$/i) {
    796         $char_encoding = "ascii";
    797         }
    798     }
    799 
    800     if (defined $char_encoding) {
    801         if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
    802         $char_encoding = "$1\_$2\_$3";
    803         }
    804 
    805         $char_encoding =~ s/-/_/g;
    806         $char_encoding =~ s/^utf_8$/utf8/;
    807        
    808         if ($char_encoding =~ m/^\d+$/) {
    809         if (defined $encodings::encodings->{"windows_$char_encoding"}) {
    810             $char_encoding = "windows_$char_encoding";
    811         }
    812         elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
    813             $char_encoding = "dos_$char_encoding";
    814         }
    815         }
    816        
    817         if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
    818         || (defined $encodings::encodings->{$char_encoding})) {
    819         $filesystem_encoding = $char_encoding;
    820         }
    821         else {
    822         print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
    823         }
    824     }
    825    
    826 
    827     };
    828     if ($@) {
    829     print $outhandle "$@\n";
    830     print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
    831    
    832     }
    833 
    834     return $filesystem_encoding;
    835 }
    836 
    837 
    838 sub guess_filesystem_encoding
    839 {
    840    my $self = shift (@_);
    841     # Look to file system to provide a character encoding
    842    my $deduced_filename_encoding = "";
    843     # If Windows NTFS, then -- assuming we work with long file names got through
    844     # Win32::GetLongFilePath() -- then the underlying file system is UTF16
    845 
    846     if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
    847         # Can do better than working with the DOS character encoding returned by locale     
    848         $deduced_filename_encoding = "unicode";
    849     }
    850     else {
    851         # Unix of some form or other
    852 
    853         # See if we can determine the file system encoding through locale
    854         $deduced_filename_encoding = $self->locale_encoding();
    855        
    856     }
    857         print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
    858         return $deduced_filename_encoding;
    859 }
    860 
    861 
    862 
    863 sub deduce_filename_encoding
    864 {
    865     my $self = shift (@_); 
    866     my ($file,$metadata,$plugin_filename_encoding) = @_;
    867 
    868     my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
    869     my $deduced_filename_encoding = undef;
    870    
    871     # Start by looking for manually assigned metadata
    872     if (defined $gs_filename_encoding) {
    873     if (ref ($gs_filename_encoding) eq "ARRAY") {
    874         my $outhandle = $self->{'outhandle'};
    875        
    876         $deduced_filename_encoding = $gs_filename_encoding->[0];
    877        
    878         my $num_vals = scalar(@$gs_filename_encoding);
    879         if ($num_vals>1) {
    880         print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
    881         print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
    882         }
    883     }
    884     else {
    885         $deduced_filename_encoding = $gs_filename_encoding;
    886     }
    887     }
    888    
    889     if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
    890     # Look to see if plugin specifies this value
    891 
    892     if (defined $plugin_filename_encoding) {
    893         # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
    894         if ($plugin_filename_encoding =~ m/^auto-.*$/) {
    895         my $outhandle = $self->{'outhandle'};
    896         print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
    897         print $outhandle "         default to 'auto'\n";
    898         $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
    899         }
    900        
    901         if ($plugin_filename_encoding ne "auto") {
    902         # We've been given a specific filenamne encoding
    903         # => so use it!
    904         $deduced_filename_encoding = $plugin_filename_encoding;
    905         }
    906     }
    907     }
    908    
    909     if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
    910 
    911     # Look to file system to provide a character encoding
    912 
    913     # If Windows NTFS, then -- assuming we work with long file names got through
    914     # Win32::GetLongFilePath() -- then the underlying file system is UTF16
    915 
    916     if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
    917         # Can do better than working with the DOS character encoding returned by locale     
    918         $deduced_filename_encoding = "unicode";
    919     }
    920     else {
    921         # Unix of some form or other
    922 
    923         # See if we can determine the file system encoding through locale
    924         $deduced_filename_encoding = $self->locale_encoding();
    925        
    926         # if locale shows us filesystem is utf8, check to see filename is consistent
    927         # => if not, then we have an "alien" filename on our hands
    928 
    929         if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
    930         if (!&unicode::check_is_utf8($file)) {
    931             # "alien" filename, so revert
    932             $deduced_filename_encoding = undef;
    933         }
    934         }
    935     }
    936     }
    937    
    938 #    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
    939 #       # Last chance, apply textcat to deduce filename encoding
    940 #       $deduced_filename_encoding = $self->textcat_encoding($file);
    941 #    }
    942 
    943     if ($self->{'verbosity'}>3) {
    944     my $outhandle = $self->{'outhandle'};
    945 
    946     if (defined $deduced_filename_encoding) {
    947         print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
    948     }
    949     else {
    950         print $outhandle "  No filename encoding deduced\n";
    951     }
    952     }
    953    
    954     return $deduced_filename_encoding;
    955 }
    956 
    957514
    958515
     
    1258815}
    1259816
    1260 # write_file -- used by ConvertToPlug, for example in post processing
    1261 #
    1262 # where should this go, is here the best place??
    1263 sub utf8_write_file {
    1264     my $self = shift (@_);
    1265     my ($textref, $filename) = @_;
    1266    
    1267     if (!open (FILE, ">:utf8", $filename)) {
    1268     gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
    1269     die "\n";
    1270     }
    1271     print FILE $$textref;
    1272    
    1273     close FILE;
    1274 }
    1275817
    1276818
Note: See TracChangeset for help on using the changeset viewer.