Changeset 31457

Show
Ignore:
Timestamp:
06.03.2017 13:48:15 (3 years ago)
Author:
kjdon
Message:

baseplugin now inherits from EncodingUtil?, and all its encoding methods have been moved to the new plugin

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r31445 r31457  
    4242use FileUtils; 
    4343 
    44 use PrintInfo; 
     44use EncodingUtil; 
    4545 
    4646BEGIN { 
    47     @BasePlugin::ISA = ( 'PrintInfo' ); 
     47    @BasePlugin::ISA = ( 'EncodingUtil' ); 
    4848} 
    4949 
     
    5959    'hiddengli' => "yes" } ]; 
    6060 
    61 our $encoding_list = 
    62     [ { 'name' => "ascii", 
    63     'desc' => "{BasePlugin.encoding.ascii}" }, 
    64       { 'name' => "utf8", 
    65     'desc' => "{BasePlugin.encoding.utf8}" }, 
    66       { 'name' => "unicode", 
    67     'desc' => "{BasePlugin.encoding.unicode}" } ]; 
    68  
    69  
    70 my $e = $encodings::encodings; 
    71 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))  
    72 { 
    73     my $hashEncode = 
    74     {'name' => $enc, 
    75      'desc' => $e->{$enc}->{'name'}}; 
    76      
    77     push(@{$encoding_list},$hashEncode); 
    78 } 
    79  
    80 our $encoding_plus_auto_list =  
    81     [ { 'name' => "auto", 
    82     'desc' => "{BasePlugin.filename_encoding.auto}" }, 
    83       { 'name' => "auto-language-analysis", 
    84     'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat 
    85       { 'name' => "auto-filesystem-encoding", 
    86     'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale 
    87       { 'name' => "auto-fl", 
    88     'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat 
    89       { 'name' => "auto-lf", 
    90     'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale  
    91  
    92 push(@{$encoding_plus_auto_list},@{$encoding_list}); 
     61# here went encoding list stuff 
    9362 
    9463our $oidtype_list =  
     
    160129    'type' => "flag", 
    161130    'reqd' => "no" }, 
    162       { 'name' => "filename_encoding", 
    163     'desc' => "{BasePlugin.filename_encoding}", 
    164     'type' => "enum", 
    165     'deft' => "auto", 
    166     'list' => $encoding_plus_auto_list, 
    167     'reqd' => "no" }, 
     131#      { 'name' => "filename_encoding", 
     132#   'desc' => "{BasePlugin.filename_encoding}", 
     133#   'type' => "enum", 
     134#   'deft' => "auto", 
     135#   'list' => $encoding_plus_auto_list, 
     136#   'reqd' => "no" }, 
    168137      { 'name' => "smart_block", 
    169138        'desc' => "{common.deprecated}. {BasePlugin.smart_block}", 
     
    185154        'desc'     => "{BasePlugin.desc}", 
    186155        'abstract' => "yes", 
    187         'inherits' => "no", 
     156        'inherits' => "yes", 
    188157        'args'     => $arguments }; 
    189158 
     
    197166    push(@{$hashArgOptLists->{"OptList"}},$options); 
    198167 
    199     my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary); 
     168    my $self = new EncodingUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary); 
    200169     
    201170    if ($self->{'info_only'}) { 
     
    203172        return bless $self, $class; 
    204173    } 
    205  
    206174    if ($self->{'smart_block'}) { 
    207175    print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n"; 
     
    284252    my $self = shift (@_); 
    285253    my ($verbosity, $outhandle, $failhandle) = @_; 
    286  
    287     # verbosity is passed through from the processor 
    288     $self->{'verbosity'} = $verbosity; 
    289  
    290     # as are the outhandle and failhandle 
    291     $self->{'outhandle'} = $outhandle if defined $outhandle; 
    292     $self->{'failhandle'} = $failhandle; 
    293 #    $self->SUPER::init(@_); 
     254     
     255    $self->SUPER::init(@_); 
    294256     
    295257    # set process_exp and block_exp to defaults unless they were 
     
    550512    return undef; 
    551513} 
    552  
    553  
    554 # just converts path as is to utf8. 
    555 sub filepath_to_utf8 { 
    556     my $self = shift (@_);   
    557     my ($file, $file_encoding) = @_; 
    558     my $filemeta = $file; 
    559  
    560     my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting 
    561  
    562     # Whenever filename-encoding is set to any of the auto settings, we 
    563     # check if the filename is already in UTF8. If it is, then we're done. 
    564     if($filename_encoding =~ m/auto/) { 
    565     if(&unicode::check_is_utf8($filemeta))  
    566     { 
    567         $filename_encoding = "utf8"; 
    568         return $filemeta; 
    569     }  
    570     } 
    571      
    572     # Auto setting, but filename is not utf8 
    573     if ($filename_encoding eq "auto")  
    574     { 
    575     # try textcat 
    576     $filename_encoding = $self->textcat_encoding($filemeta); 
    577      
    578     # check the locale next 
    579     $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 
    580      
    581      
    582     # now try the encoding of the document, if available 
    583     if ($filename_encoding eq "undefined" && defined $file_encoding) { 
    584         $filename_encoding = $file_encoding; 
    585     } 
    586  
    587     } 
    588  
    589     elsif ($filename_encoding eq "auto-language-analysis")  
    590     {    
    591     $filename_encoding = $self->textcat_encoding($filemeta); 
    592  
    593     # now try the encoding of the document, if available 
    594     if ($filename_encoding eq "undefined" && defined $file_encoding) { 
    595         $filename_encoding = $file_encoding; 
    596     }  
    597     } 
    598  
    599     elsif ($filename_encoding eq "auto-filesystem-encoding")  
    600     {    
    601     # try locale 
    602     $filename_encoding = $self->locale_encoding(); 
    603     } 
    604  
    605     elsif ($filename_encoding eq "auto-fl")  
    606     { 
    607     # filesystem-encoding (locale) then language-analysis (textcat) 
    608     $filename_encoding = $self->locale_encoding(); 
    609      
    610     # try textcat 
    611     $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined"; 
    612      
    613     # else assume filename encoding is encoding of file content, if that's available 
    614     if ($filename_encoding eq "undefined" && defined $file_encoding) { 
    615         $filename_encoding = $file_encoding; 
    616     } 
    617     } 
    618      
    619     elsif ($filename_encoding eq "auto-lf")  
    620     { 
    621     # language-analysis (textcat) then filesystem-encoding (locale) 
    622     $filename_encoding = $self->textcat_encoding($filemeta); 
    623      
    624     # guess filename encoding from encoding of file content, if available 
    625     if ($filename_encoding eq "undefined" && defined $file_encoding) { 
    626         $filename_encoding = $file_encoding; 
    627     } 
    628  
    629     # try locale 
    630     $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 
    631     } 
    632      
    633     # if still undefined, use utf8 as fallback 
    634     if ($filename_encoding eq "undefined") { 
    635     $filename_encoding = "utf8"; 
    636     } 
    637  
    638     #print STDERR "**** UTF8 encoding the filename $filemeta "; 
    639      
    640     # if the filename encoding is set to utf8 but it isn't utf8 already--such as when 
    641     # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to 
    642     # be always utf8 (in which case the filename's encoding is also set as utf8 even  
    643     # though the filename need not be if it originates from another system)--in such 
    644     # cases attempt to make the filename utf8 to match. 
    645     if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) { 
    646     &unicode::ensure_utf8(\$filemeta); 
    647     } 
    648  
    649     # convert non-unicode encodings to utf8 
    650     if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) { 
    651     $filemeta = &unicode::unicode2utf8( 
    652                        &unicode::convert2unicode($filename_encoding, \$filemeta) 
    653                        ); 
    654     } 
    655  
    656     #print STDERR " from encoding $filename_encoding -> $filemeta\n"; 
    657     return $filemeta; 
    658 } 
    659  
    660 # gets the filename with no path, converts to utf8, and then dm safes it. 
    661 # filename_encoding set by user 
    662 sub filename_to_utf8_metadata 
    663 { 
    664     my $self = shift (@_);   
    665     my ($file, $file_encoding) = @_; 
    666  
    667     my $outhandle = $self->{'outhandle'}; 
    668  
    669     print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n"; 
    670     my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0); 
    671     print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n"; 
    672  
    673     my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end) 
    674     $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding); 
    675  
    676     return $filemeta; 
    677 } 
    678  
    679 sub locale_encoding { 
    680     my $self = shift(@_); 
    681      
    682     if (!defined $self->{'filesystem_encoding'}) { 
    683     $self->{'filesystem_encoding'} = $self->get_filesystem_encoding(); 
    684     } 
    685  
    686     #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n"; 
    687     return $self->{'filesystem_encoding'}; # can be the string "undefined" 
    688 } 
    689  
    690 sub textcat_encoding { 
    691     my $self = shift(@_); 
    692     my ($filemeta) = @_; 
    693  
    694     # analyse filenames without extensions and digits (and trimmed of 
    695     # surrounding whitespace), so that irrelevant chars don't confuse 
    696     # textcat 
    697     my $strictfilemeta = $filemeta; 
    698     $strictfilemeta =~ s/\.[^\.]+$//g; 
    699     $strictfilemeta =~ s/\d//g; 
    700     $strictfilemeta =~ s/^\s*//g; 
    701     $strictfilemeta =~ s/\s*$//g; 
    702      
    703     my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta); 
    704     if(!defined $filename_encoding) { 
    705     $filename_encoding = "undefined"; 
    706     } 
    707  
    708     return $filename_encoding; # can be the string "undefined" 
    709 } 
    710  
    711 # performs textcat 
    712 sub encoding_from_language_analysis { 
    713     my $self = shift(@_); 
    714     my ($text) = @_; 
    715  
    716     my $outhandle = $self->{'outhandle'}; 
    717     my $best_encoding = undef; 
    718      
    719     # get the language/encoding of the textstring using textcat 
    720     require textcat;  # Only load the textcat module if it is required 
    721     $self->{'textcat'} = new textcat() unless defined($self->{'textcat'}); 
    722     my $results = $self->{'textcat'}->classify_cached_filename(\$text); 
    723  
    724  
    725     if (scalar @$results < 0) {  
    726     return undef; 
    727     } 
    728      
    729     # We have some results, we choose the first 
    730     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 
    731      
    732     $best_encoding = $encoding; 
    733     if (!defined $best_encoding) { 
    734     return undef; 
    735     }  
    736      
    737     if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 
    738     # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) 
    739     $best_encoding = 'utf8'; 
    740     } 
    741      
    742      
    743     # check for equivalents where textcat doesn't have some encodings... 
    744     # eg MS versions of standard encodings 
    745     if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) { 
    746     my $iso = $1; # which variant of the iso standard? 
    747     # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 
    748     if ($text =~ /[\x80-\x9f]/) { 
    749         # Western Europe 
    750         if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 
    751         elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe 
    752         elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic 
    753         elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic 
    754         elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek 
    755         elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 
    756         elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 
    757     } 
    758     } 
    759      
    760     if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ && 
    761     !defined $encodings::encodings->{$best_encoding})  
    762     { 
    763     if ($self->{'verbosity'}) {  
    764         gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef"); 
    765     } 
    766     $best_encoding = undef; 
    767     } 
    768      
    769     return $best_encoding; 
    770 } 
    771  
    772 # uses locale 
    773 sub get_filesystem_encoding  
    774 { 
    775  
    776     my $self = shift(@_); 
    777  
    778     my $outhandle = $self->{'outhandle'}; 
    779     my $filesystem_encoding = undef; 
    780  
    781     eval { 
    782     # Works for Windows as well, returning the DOS code page in use  
    783     use POSIX qw(locale_h); 
    784      
    785     # With only one parameter, setlocale retrieves the  
    786     # current value 
    787     my $current_locale = setlocale(LC_CTYPE); 
    788      
    789     my $char_encoding = undef; 
    790     if ($current_locale =~ m/\./) { 
    791         ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/); 
    792         $char_encoding = lc($char_encoding); 
    793     } 
    794     else { 
    795         if ($current_locale =~ m/^(posix|c)$/i) { 
    796         $char_encoding = "ascii"; 
    797         } 
    798     } 
    799  
    800     if (defined $char_encoding) { 
    801         if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) { 
    802         $char_encoding = "$1\_$2\_$3"; 
    803         } 
    804  
    805         $char_encoding =~ s/-/_/g; 
    806         $char_encoding =~ s/^utf_8$/utf8/; 
    807          
    808         if ($char_encoding =~ m/^\d+$/) { 
    809         if (defined $encodings::encodings->{"windows_$char_encoding"}) { 
    810             $char_encoding = "windows_$char_encoding"; 
    811         } 
    812         elsif (defined $encodings::encodings->{"dos_$char_encoding"}) { 
    813             $char_encoding = "dos_$char_encoding"; 
    814         } 
    815         } 
    816          
    817         if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)  
    818         || (defined $encodings::encodings->{$char_encoding})) { 
    819         $filesystem_encoding = $char_encoding; 
    820         } 
    821         else { 
    822         print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n"; 
    823         } 
    824     } 
    825      
    826  
    827     }; 
    828     if ($@) { 
    829     print $outhandle "$@\n"; 
    830     print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n"; 
    831      
    832     } 
    833  
    834     return $filesystem_encoding; 
    835 } 
    836  
    837  
    838 sub guess_filesystem_encoding 
    839 { 
    840    my $self = shift (@_);  
    841     # Look to file system to provide a character encoding 
    842    my $deduced_filename_encoding = ""; 
    843     # If Windows NTFS, then -- assuming we work with long file names got through 
    844     # Win32::GetLongFilePath() -- then the underlying file system is UTF16 
    845  
    846     if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) { 
    847         # Can do better than working with the DOS character encoding returned by locale      
    848         $deduced_filename_encoding = "unicode"; 
    849     } 
    850     else { 
    851         # Unix of some form or other 
    852  
    853         # See if we can determine the file system encoding through locale 
    854         $deduced_filename_encoding = $self->locale_encoding(); 
    855          
    856     } 
    857         print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n"; 
    858         return $deduced_filename_encoding; 
    859 } 
    860  
    861  
    862  
    863 sub deduce_filename_encoding 
    864 { 
    865     my $self = shift (@_);   
    866     my ($file,$metadata,$plugin_filename_encoding) = @_; 
    867  
    868     my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"}; 
    869     my $deduced_filename_encoding = undef; 
    870      
    871     # Start by looking for manually assigned metadata 
    872     if (defined $gs_filename_encoding) { 
    873     if (ref ($gs_filename_encoding) eq "ARRAY") { 
    874         my $outhandle = $self->{'outhandle'}; 
    875          
    876         $deduced_filename_encoding = $gs_filename_encoding->[0]; 
    877          
    878         my $num_vals = scalar(@$gs_filename_encoding); 
    879         if ($num_vals>1) { 
    880         print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n"; 
    881         print $outhandle "         Selecting first value: $deduced_filename_encoding\n"; 
    882         } 
    883     }  
    884     else { 
    885         $deduced_filename_encoding = $gs_filename_encoding; 
    886     } 
    887     } 
    888      
    889     if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
    890     # Look to see if plugin specifies this value 
    891  
    892     if (defined $plugin_filename_encoding) { 
    893         # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options) 
    894         if ($plugin_filename_encoding =~ m/^auto-.*$/) { 
    895         my $outhandle = $self->{'outhandle'}; 
    896         print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n"; 
    897         print $outhandle "         default to 'auto'\n"; 
    898         $self->{'filename_encoding'} = $plugin_filename_encoding = "auto"; 
    899         } 
    900          
    901         if ($plugin_filename_encoding ne "auto") { 
    902         # We've been given a specific filenamne encoding 
    903         # => so use it! 
    904         $deduced_filename_encoding = $plugin_filename_encoding; 
    905         } 
    906     } 
    907     } 
    908      
    909     if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
    910  
    911     # Look to file system to provide a character encoding 
    912  
    913     # If Windows NTFS, then -- assuming we work with long file names got through 
    914     # Win32::GetLongFilePath() -- then the underlying file system is UTF16 
    915  
    916     if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) { 
    917         # Can do better than working with the DOS character encoding returned by locale      
    918         $deduced_filename_encoding = "unicode"; 
    919     } 
    920     else { 
    921         # Unix of some form or other 
    922  
    923         # See if we can determine the file system encoding through locale 
    924         $deduced_filename_encoding = $self->locale_encoding(); 
    925          
    926         # if locale shows us filesystem is utf8, check to see filename is consistent 
    927         # => if not, then we have an "alien" filename on our hands 
    928  
    929         if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) { 
    930         if (!&unicode::check_is_utf8($file)) { 
    931             # "alien" filename, so revert 
    932             $deduced_filename_encoding = undef; 
    933         } 
    934         } 
    935     } 
    936     } 
    937      
    938 #    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
    939 #       # Last chance, apply textcat to deduce filename encoding 
    940 #       $deduced_filename_encoding = $self->textcat_encoding($file); 
    941 #    } 
    942  
    943     if ($self->{'verbosity'}>3) { 
    944     my $outhandle = $self->{'outhandle'}; 
    945  
    946     if (defined $deduced_filename_encoding) { 
    947         print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n"; 
    948     } 
    949     else { 
    950         print $outhandle "  No filename encoding deduced\n"; 
    951     } 
    952     } 
    953      
    954     return $deduced_filename_encoding; 
    955 } 
    956  
    957514 
    958515 
     
    1258815} 
    1259816 
    1260 # write_file -- used by ConvertToPlug, for example in post processing 
    1261 # 
    1262 # where should this go, is here the best place?? 
    1263 sub utf8_write_file { 
    1264     my $self = shift (@_); 
    1265     my ($textref, $filename) = @_; 
    1266      
    1267     if (!open (FILE, ">:utf8", $filename)) { 
    1268     gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename); 
    1269     die "\n"; 
    1270     } 
    1271     print FILE $$textref; 
    1272      
    1273     close FILE; 
    1274 } 
    1275817 
    1276818