Changeset 31457 for main/trunk/greenstone2
- Timestamp:
- 2017-03-06T13:48:15+13:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r31445 r31457 42 42 use FileUtils; 43 43 44 use PrintInfo;44 use EncodingUtil; 45 45 46 46 BEGIN { 47 @BasePlugin::ISA = ( ' PrintInfo' );47 @BasePlugin::ISA = ( 'EncodingUtil' ); 48 48 } 49 49 … … 59 59 'hiddengli' => "yes" } ]; 60 60 61 our $encoding_list = 62 [ { 'name' => "ascii", 63 'desc' => "{BasePlugin.encoding.ascii}" }, 64 { 'name' => "utf8", 65 'desc' => "{BasePlugin.encoding.utf8}" }, 66 { 'name' => "unicode", 67 'desc' => "{BasePlugin.encoding.unicode}" } ]; 68 69 70 my $e = $encodings::encodings; 71 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) 72 { 73 my $hashEncode = 74 {'name' => $enc, 75 'desc' => $e->{$enc}->{'name'}}; 76 77 push(@{$encoding_list},$hashEncode); 78 } 79 80 our $encoding_plus_auto_list = 81 [ { 'name' => "auto", 82 'desc' => "{BasePlugin.filename_encoding.auto}" }, 83 { 'name' => "auto-language-analysis", 84 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat 85 { 'name' => "auto-filesystem-encoding", 86 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale 87 { 'name' => "auto-fl", 88 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat 89 { 'name' => "auto-lf", 90 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale 91 92 push(@{$encoding_plus_auto_list},@{$encoding_list}); 61 # here went encoding list stuff 93 62 94 63 our $oidtype_list = … … 160 129 'type' => "flag", 161 130 'reqd' => "no" }, 162 { 'name' => "filename_encoding",163 'desc' => "{BasePlugin.filename_encoding}",164 'type' => "enum",165 'deft' => "auto",166 'list' => $encoding_plus_auto_list,167 'reqd' => "no" },131 # { 'name' => "filename_encoding", 132 # 'desc' => "{BasePlugin.filename_encoding}", 133 # 'type' => "enum", 134 # 'deft' => "auto", 135 # 'list' => $encoding_plus_auto_list, 136 # 'reqd' => "no" }, 168 137 { 'name' => "smart_block", 169 138 'desc' => "{common.deprecated}. {BasePlugin.smart_block}", … … 185 154 'desc' => "{BasePlugin.desc}", 186 155 'abstract' => "yes", 187 'inherits' => " no",156 'inherits' => "yes", 188 157 'args' => $arguments }; 189 158 … … 197 166 push(@{$hashArgOptLists->{"OptList"}},$options); 198 167 199 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);168 my $self = new EncodingUtil($pluginlist, $inputargs, $hashArgOptLists,$auxiliary); 200 169 201 170 if ($self->{'info_only'}) { … … 203 172 return bless $self, $class; 204 173 } 205 206 174 if ($self->{'smart_block'}) { 207 175 print STDERR "WARNING: -smart_block option has been deprecated and is no longer useful\n"; … … 284 252 my $self = shift (@_); 285 253 my ($verbosity, $outhandle, $failhandle) = @_; 286 287 # verbosity is passed through from the processor 288 $self->{'verbosity'} = $verbosity; 289 290 # as are the outhandle and failhandle 291 $self->{'outhandle'} = $outhandle if defined $outhandle; 292 $self->{'failhandle'} = $failhandle; 293 # $self->SUPER::init(@_); 254 255 $self->SUPER::init(@_); 294 256 295 257 # set process_exp and block_exp to defaults unless they were … … 550 512 return undef; 551 513 } 552 553 554 # just converts path as is to utf8.555 sub filepath_to_utf8 {556 my $self = shift (@_);557 my ($file, $file_encoding) = @_;558 my $filemeta = $file;559 560 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting561 562 # Whenever filename-encoding is set to any of the auto settings, we563 # check if the filename is already in UTF8. If it is, then we're done.564 if($filename_encoding =~ m/auto/) {565 if(&unicode::check_is_utf8($filemeta))566 {567 $filename_encoding = "utf8";568 return $filemeta;569 }570 }571 572 # Auto setting, but filename is not utf8573 if ($filename_encoding eq "auto")574 {575 # try textcat576 $filename_encoding = $self->textcat_encoding($filemeta);577 578 # check the locale next579 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";580 581 582 # now try the encoding of the document, if available583 if ($filename_encoding eq "undefined" && defined $file_encoding) {584 $filename_encoding = $file_encoding;585 }586 587 }588 589 elsif ($filename_encoding eq "auto-language-analysis")590 {591 $filename_encoding = $self->textcat_encoding($filemeta);592 593 # now try the encoding of the document, if available594 if ($filename_encoding eq "undefined" && defined $file_encoding) {595 $filename_encoding = $file_encoding;596 }597 }598 599 elsif ($filename_encoding eq "auto-filesystem-encoding")600 {601 # try locale602 $filename_encoding = $self->locale_encoding();603 }604 605 elsif ($filename_encoding eq "auto-fl")606 {607 # filesystem-encoding (locale) then language-analysis (textcat)608 $filename_encoding = $self->locale_encoding();609 610 # try textcat611 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";612 613 # else assume filename encoding is encoding of file content, if that's available614 if ($filename_encoding eq "undefined" && defined $file_encoding) {615 $filename_encoding = $file_encoding;616 }617 }618 619 elsif ($filename_encoding eq "auto-lf")620 {621 # language-analysis (textcat) then filesystem-encoding (locale)622 $filename_encoding = $self->textcat_encoding($filemeta);623 624 # guess filename encoding from encoding of file content, if available625 if ($filename_encoding eq "undefined" && defined $file_encoding) {626 $filename_encoding = $file_encoding;627 }628 629 # try locale630 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";631 }632 633 # if still undefined, use utf8 as fallback634 if ($filename_encoding eq "undefined") {635 $filename_encoding = "utf8";636 }637 638 #print STDERR "**** UTF8 encoding the filename $filemeta ";639 640 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when641 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to642 # be always utf8 (in which case the filename's encoding is also set as utf8 even643 # though the filename need not be if it originates from another system)--in such644 # cases attempt to make the filename utf8 to match.645 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {646 &unicode::ensure_utf8(\$filemeta);647 }648 649 # convert non-unicode encodings to utf8650 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {651 $filemeta = &unicode::unicode2utf8(652 &unicode::convert2unicode($filename_encoding, \$filemeta)653 );654 }655 656 #print STDERR " from encoding $filename_encoding -> $filemeta\n";657 return $filemeta;658 }659 660 # gets the filename with no path, converts to utf8, and then dm safes it.661 # filename_encoding set by user662 sub filename_to_utf8_metadata663 {664 my $self = shift (@_);665 my ($file, $file_encoding) = @_;666 667 my $outhandle = $self->{'outhandle'};668 669 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";670 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);671 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";672 673 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)674 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);675 676 return $filemeta;677 }678 679 sub locale_encoding {680 my $self = shift(@_);681 682 if (!defined $self->{'filesystem_encoding'}) {683 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();684 }685 686 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";687 return $self->{'filesystem_encoding'}; # can be the string "undefined"688 }689 690 sub textcat_encoding {691 my $self = shift(@_);692 my ($filemeta) = @_;693 694 # analyse filenames without extensions and digits (and trimmed of695 # surrounding whitespace), so that irrelevant chars don't confuse696 # textcat697 my $strictfilemeta = $filemeta;698 $strictfilemeta =~ s/\.[^\.]+$//g;699 $strictfilemeta =~ s/\d//g;700 $strictfilemeta =~ s/^\s*//g;701 $strictfilemeta =~ s/\s*$//g;702 703 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);704 if(!defined $filename_encoding) {705 $filename_encoding = "undefined";706 }707 708 return $filename_encoding; # can be the string "undefined"709 }710 711 # performs textcat712 sub encoding_from_language_analysis {713 my $self = shift(@_);714 my ($text) = @_;715 716 my $outhandle = $self->{'outhandle'};717 my $best_encoding = undef;718 719 # get the language/encoding of the textstring using textcat720 require textcat; # Only load the textcat module if it is required721 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});722 my $results = $self->{'textcat'}->classify_cached_filename(\$text);723 724 725 if (scalar @$results < 0) {726 return undef;727 }728 729 # We have some results, we choose the first730 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;731 732 $best_encoding = $encoding;733 if (!defined $best_encoding) {734 return undef;735 }736 737 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {738 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)739 $best_encoding = 'utf8';740 }741 742 743 # check for equivalents where textcat doesn't have some encodings...744 # eg MS versions of standard encodings745 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {746 my $iso = $1; # which variant of the iso standard?747 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do748 if ($text =~ /[\x80-\x9f]/) {749 # Western Europe750 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }751 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe752 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic753 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic754 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek755 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew756 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish757 }758 }759 760 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&761 !defined $encodings::encodings->{$best_encoding})762 {763 if ($self->{'verbosity'}) {764 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");765 }766 $best_encoding = undef;767 }768 769 return $best_encoding;770 }771 772 # uses locale773 sub get_filesystem_encoding774 {775 776 my $self = shift(@_);777 778 my $outhandle = $self->{'outhandle'};779 my $filesystem_encoding = undef;780 781 eval {782 # Works for Windows as well, returning the DOS code page in use783 use POSIX qw(locale_h);784 785 # With only one parameter, setlocale retrieves the786 # current value787 my $current_locale = setlocale(LC_CTYPE);788 789 my $char_encoding = undef;790 if ($current_locale =~ m/\./) {791 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);792 $char_encoding = lc($char_encoding);793 }794 else {795 if ($current_locale =~ m/^(posix|c)$/i) {796 $char_encoding = "ascii";797 }798 }799 800 if (defined $char_encoding) {801 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {802 $char_encoding = "$1\_$2\_$3";803 }804 805 $char_encoding =~ s/-/_/g;806 $char_encoding =~ s/^utf_8$/utf8/;807 808 if ($char_encoding =~ m/^\d+$/) {809 if (defined $encodings::encodings->{"windows_$char_encoding"}) {810 $char_encoding = "windows_$char_encoding";811 }812 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {813 $char_encoding = "dos_$char_encoding";814 }815 }816 817 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)818 || (defined $encodings::encodings->{$char_encoding})) {819 $filesystem_encoding = $char_encoding;820 }821 else {822 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";823 }824 }825 826 827 };828 if ($@) {829 print $outhandle "$@\n";830 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";831 832 }833 834 return $filesystem_encoding;835 }836 837 838 sub guess_filesystem_encoding839 {840 my $self = shift (@_);841 # Look to file system to provide a character encoding842 my $deduced_filename_encoding = "";843 # If Windows NTFS, then -- assuming we work with long file names got through844 # Win32::GetLongFilePath() -- then the underlying file system is UTF16845 846 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {847 # Can do better than working with the DOS character encoding returned by locale848 $deduced_filename_encoding = "unicode";849 }850 else {851 # Unix of some form or other852 853 # See if we can determine the file system encoding through locale854 $deduced_filename_encoding = $self->locale_encoding();855 856 }857 print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";858 return $deduced_filename_encoding;859 }860 861 862 863 sub deduce_filename_encoding864 {865 my $self = shift (@_);866 my ($file,$metadata,$plugin_filename_encoding) = @_;867 868 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};869 my $deduced_filename_encoding = undef;870 871 # Start by looking for manually assigned metadata872 if (defined $gs_filename_encoding) {873 if (ref ($gs_filename_encoding) eq "ARRAY") {874 my $outhandle = $self->{'outhandle'};875 876 $deduced_filename_encoding = $gs_filename_encoding->[0];877 878 my $num_vals = scalar(@$gs_filename_encoding);879 if ($num_vals>1) {880 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";881 print $outhandle " Selecting first value: $deduced_filename_encoding\n";882 }883 }884 else {885 $deduced_filename_encoding = $gs_filename_encoding;886 }887 }888 889 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {890 # Look to see if plugin specifies this value891 892 if (defined $plugin_filename_encoding) {893 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)894 if ($plugin_filename_encoding =~ m/^auto-.*$/) {895 my $outhandle = $self->{'outhandle'};896 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";897 print $outhandle " default to 'auto'\n";898 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";899 }900 901 if ($plugin_filename_encoding ne "auto") {902 # We've been given a specific filenamne encoding903 # => so use it!904 $deduced_filename_encoding = $plugin_filename_encoding;905 }906 }907 }908 909 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {910 911 # Look to file system to provide a character encoding912 913 # If Windows NTFS, then -- assuming we work with long file names got through914 # Win32::GetLongFilePath() -- then the underlying file system is UTF16915 916 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {917 # Can do better than working with the DOS character encoding returned by locale918 $deduced_filename_encoding = "unicode";919 }920 else {921 # Unix of some form or other922 923 # See if we can determine the file system encoding through locale924 $deduced_filename_encoding = $self->locale_encoding();925 926 # if locale shows us filesystem is utf8, check to see filename is consistent927 # => if not, then we have an "alien" filename on our hands928 929 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {930 if (!&unicode::check_is_utf8($file)) {931 # "alien" filename, so revert932 $deduced_filename_encoding = undef;933 }934 }935 }936 }937 938 # if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {939 # # Last chance, apply textcat to deduce filename encoding940 # $deduced_filename_encoding = $self->textcat_encoding($file);941 # }942 943 if ($self->{'verbosity'}>3) {944 my $outhandle = $self->{'outhandle'};945 946 if (defined $deduced_filename_encoding) {947 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";948 }949 else {950 print $outhandle " No filename encoding deduced\n";951 }952 }953 954 return $deduced_filename_encoding;955 }956 957 514 958 515 … … 1258 815 } 1259 816 1260 # write_file -- used by ConvertToPlug, for example in post processing1261 #1262 # where should this go, is here the best place??1263 sub utf8_write_file {1264 my $self = shift (@_);1265 my ($textref, $filename) = @_;1266 1267 if (!open (FILE, ">:utf8", $filename)) {1268 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);1269 die "\n";1270 }1271 print FILE $$textref;1272 1273 close FILE;1274 }1275 817 1276 818
Note:
See TracChangeset
for help on using the changeset viewer.