root/main/trunk/greenstone2/perllib/plugins/EncodingUtil.pm @ 31487

Revision 31487, 20.5 KB (checked in by ak19, 3 years ago)

Important import statement for the recent commits related to encoding.

Line 
1###########################################################################
2#
3# EncodingUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package EncodingUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34
35use PrintInfo;
36use Encode;
37use Unicode::Normalize 'normalize';
38
39BEGIN {
40    @EncodingUtil::ISA = ( 'PrintInfo' );
41}
42
43our $encoding_list =
44    [ { 'name' => "ascii",
45    'desc' => "{BasePlugin.encoding.ascii}" },
46      { 'name' => "utf8",
47    'desc' => "{BasePlugin.encoding.utf8}" },
48      { 'name' => "unicode",
49    'desc' => "{BasePlugin.encoding.unicode}" } ];
50
51
52my $e = $encodings::encodings;
53foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
54{
55    my $hashEncode =
56    {'name' => $enc,
57     'desc' => $e->{$enc}->{'name'}};
58   
59    push(@{$encoding_list},$hashEncode);
60}
61
62our $encoding_plus_auto_list =
63    [ { 'name' => "auto",
64    'desc' => "{BasePlugin.filename_encoding.auto}" },
65      { 'name' => "auto-language-analysis",
66    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
67      { 'name' => "auto-filesystem-encoding",
68    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
69      { 'name' => "auto-fl",
70    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
71      { 'name' => "auto-lf",
72    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
73
74push(@{$encoding_plus_auto_list},@{$encoding_list});
75
76my $arguments =
77    [  { 'name' => "block_exp",
78     'desc' => "{BasePlugin.block_exp}",
79     'type' => "regexp",
80     'deft' => "",
81     'reqd' => "no" },
82       { 'name' => "no_blocking",
83     'desc' => "{BasePlugin.no_blocking}",
84     'type' => "flag",
85     'reqd' => "no"},
86       { 'name' => "filename_encoding",
87     'desc' => "{BasePlugin.filename_encoding}",
88     'type' => "enum",
89     'deft' => "auto",
90     'list' => $encoding_plus_auto_list,
91     'reqd' => "no" }
92    ];
93
94my $options = { 'name'     => "EncodingUtil",
95        'desc'     => "{EncodingUtil.desc}",
96        'abstract' => "yes",
97        'inherits' => "no",
98        'args'     => $arguments };
99
100
101sub new {
102
103    my ($class) = shift (@_);
104    my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
105    push(@$pluginlist, $class);
106
107    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
108    push(@{$hashArgOptLists->{"OptList"}},$options);
109
110    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
111
112    return bless $self, $class;
113 
114}
115
116sub init {   
117    my $self = shift (@_);
118    my ($verbosity, $outhandle, $failhandle) = @_;
119
120    print STDERR "guess encoding = ".$self->guess_filesystem_encoding()."\n";
121    print STDERR "get encoding = ".$self->get_filesystem_encoding()."\n";
122
123    # verbosity is passed through from the processor
124    $self->{'verbosity'} = $verbosity;
125
126    # as are the outhandle and failhandle
127    $self->{'outhandle'} = $outhandle if defined $outhandle;
128    $self->{'failhandle'} = $failhandle;
129
130}
131
132# converts raw filesystem filename to perl unicode format
133sub raw_filename_to_unicode {
134    my $self = shift (@_);
135    my ($file) = @_;
136
137    my $unicode_file = "";
138    ### need it in perl unicode, not raw filesystem
139    my $filename_encoding =  $self->guess_filesystem_encoding(); 
140       
141    # copied this from set_Source_metadata in BasePlugin
142    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
143    # Use filename_encoding to map raw filename to a Perl unicode-aware string
144    $unicode_file = decode($filename_encoding,$file);       
145    }
146    else {
147    # otherwise generate %xx encoded version of filename for char > 127
148    $unicode_file = &unicode::raw_filename_to_url_encoded($file);
149    }
150    return $unicode_file;
151
152}
153# just converts path as is to utf8.
154sub filepath_to_utf8 {
155    my $self = shift (@_); 
156    my ($file, $file_encoding) = @_;
157    my $filemeta = $file;
158
159    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
160
161    # Whenever filename-encoding is set to any of the auto settings, we
162    # check if the filename is already in UTF8. If it is, then we're done.
163    if($filename_encoding =~ m/auto/) {
164    if(&unicode::check_is_utf8($filemeta))
165    {
166        $filename_encoding = "utf8";
167        return $filemeta;
168    }
169    }
170   
171    # Auto setting, but filename is not utf8
172    if ($filename_encoding eq "auto")
173    {
174    # try textcat
175    $filename_encoding = $self->textcat_encoding($filemeta);
176   
177    # check the locale next
178    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
179   
180   
181    # now try the encoding of the document, if available
182    if ($filename_encoding eq "undefined" && defined $file_encoding) {
183        $filename_encoding = $file_encoding;
184    }
185
186    }
187
188    elsif ($filename_encoding eq "auto-language-analysis")
189    {   
190    $filename_encoding = $self->textcat_encoding($filemeta);
191
192    # now try the encoding of the document, if available
193    if ($filename_encoding eq "undefined" && defined $file_encoding) {
194        $filename_encoding = $file_encoding;
195    }
196    }
197
198    elsif ($filename_encoding eq "auto-filesystem-encoding")
199    {   
200    # try locale
201    $filename_encoding = $self->locale_encoding();
202    }
203
204    elsif ($filename_encoding eq "auto-fl")
205    {
206    # filesystem-encoding (locale) then language-analysis (textcat)
207    $filename_encoding = $self->locale_encoding();
208   
209    # try textcat
210    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
211   
212    # else assume filename encoding is encoding of file content, if that's available
213    if ($filename_encoding eq "undefined" && defined $file_encoding) {
214        $filename_encoding = $file_encoding;
215    }
216    }
217   
218    elsif ($filename_encoding eq "auto-lf")
219    {
220    # language-analysis (textcat) then filesystem-encoding (locale)
221    $filename_encoding = $self->textcat_encoding($filemeta);
222   
223    # guess filename encoding from encoding of file content, if available
224    if ($filename_encoding eq "undefined" && defined $file_encoding) {
225        $filename_encoding = $file_encoding;
226    }
227
228    # try locale
229    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
230    }
231   
232    # if still undefined, use utf8 as fallback
233    if ($filename_encoding eq "undefined") {
234    $filename_encoding = "utf8";
235    }
236
237    #print STDERR "**** UTF8 encoding the filename $filemeta ";
238   
239    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
240    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
241    # be always utf8 (in which case the filename's encoding is also set as utf8 even
242    # though the filename need not be if it originates from another system)--in such
243    # cases attempt to make the filename utf8 to match.
244    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
245    &unicode::ensure_utf8(\$filemeta);
246    }
247
248    # convert non-unicode encodings to utf8
249    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
250    $filemeta = &unicode::unicode2utf8(
251                       &unicode::convert2unicode($filename_encoding, \$filemeta)
252                       );
253    }
254
255    #print STDERR " from encoding $filename_encoding -> $filemeta\n";
256    return $filemeta;
257}
258
259# gets the filename with no path, converts to utf8, and then dm safes it.
260# filename_encoding set by user
261sub filename_to_utf8_metadata
262{
263    my $self = shift (@_); 
264    my ($file, $file_encoding) = @_;
265
266    my $outhandle = $self->{'outhandle'};
267
268    print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
269    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
270    print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
271
272    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
273    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
274
275    return $filemeta;
276}
277
278sub locale_encoding {
279    my $self = shift(@_);
280   
281    if (!defined $self->{'filesystem_encoding'}) {
282    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
283    }
284
285    #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
286    return $self->{'filesystem_encoding'}; # can be the string "undefined"
287}
288
289
290sub textcat_encoding {
291    my $self = shift(@_);
292    my ($filemeta) = @_;
293
294    # analyse filenames without extensions and digits (and trimmed of
295    # surrounding whitespace), so that irrelevant chars don't confuse
296    # textcat
297    my $strictfilemeta = $filemeta;
298    $strictfilemeta =~ s/\.[^\.]+$//g;
299    $strictfilemeta =~ s/\d//g;
300    $strictfilemeta =~ s/^\s*//g;
301    $strictfilemeta =~ s/\s*$//g;
302   
303    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
304    if(!defined $filename_encoding) {
305    $filename_encoding = "undefined";
306    }
307
308    return $filename_encoding; # can be the string "undefined"
309}
310
311# performs textcat
312sub encoding_from_language_analysis {
313    my $self = shift(@_);
314    my ($text) = @_;
315
316    my $outhandle = $self->{'outhandle'};
317    my $best_encoding = undef;
318   
319    # get the language/encoding of the textstring using textcat
320    require textcat;  # Only load the textcat module if it is required
321    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
322    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
323
324
325    if (scalar @$results < 0) {
326    return undef;
327    }
328   
329    # We have some results, we choose the first
330    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
331   
332    $best_encoding = $encoding;
333    if (!defined $best_encoding) {
334    return undef;
335    }
336   
337    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
338    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
339    $best_encoding = 'utf8';
340    }
341   
342   
343    # check for equivalents where textcat doesn't have some encodings...
344    # eg MS versions of standard encodings
345    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
346    my $iso = $1; # which variant of the iso standard?
347    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
348    if ($text =~ /[\x80-\x9f]/) {
349        # Western Europe
350        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
351        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
352        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
353        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
354        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
355        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
356        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
357    }
358    }
359   
360    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
361    !defined $encodings::encodings->{$best_encoding})
362    {
363    if ($self->{'verbosity'}) {
364        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
365    }
366    $best_encoding = undef;
367    }
368   
369    return $best_encoding;
370}
371
372
373
374sub deduce_filename_encoding
375{
376    my $self = shift (@_); 
377    my ($file,$metadata,$plugin_filename_encoding) = @_;
378
379    my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
380    my $deduced_filename_encoding = undef;
381   
382    # Start by looking for manually assigned metadata
383    if (defined $gs_filename_encoding) {
384    if (ref ($gs_filename_encoding) eq "ARRAY") {
385        my $outhandle = $self->{'outhandle'};
386       
387        $deduced_filename_encoding = $gs_filename_encoding->[0];
388       
389        my $num_vals = scalar(@$gs_filename_encoding);
390        if ($num_vals>1) {
391        print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
392        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
393        }
394    }
395    else {
396        $deduced_filename_encoding = $gs_filename_encoding;
397    }
398    }
399   
400    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
401    # Look to see if plugin specifies this value
402
403    if (defined $plugin_filename_encoding) {
404        # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
405        if ($plugin_filename_encoding =~ m/^auto-.*$/) {
406        my $outhandle = $self->{'outhandle'};
407        print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
408        print $outhandle "         default to 'auto'\n";
409        $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
410        }
411       
412        if ($plugin_filename_encoding ne "auto") {
413        # We've been given a specific filenamne encoding
414        # => so use it!
415        $deduced_filename_encoding = $plugin_filename_encoding;
416        }
417    }
418    }
419   
420    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
421
422    # Look to file system to provide a character encoding
423
424    # If Windows NTFS, then -- assuming we work with long file names got through
425    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
426
427    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
428        # Can do better than working with the DOS character encoding returned by locale     
429        $deduced_filename_encoding = "unicode";
430    }
431    else {
432        # Unix of some form or other
433
434        # See if we can determine the file system encoding through locale
435        $deduced_filename_encoding = $self->locale_encoding();
436       
437        # if locale shows us filesystem is utf8, check to see filename is consistent
438        # => if not, then we have an "alien" filename on our hands
439
440        if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
441        if (!&unicode::check_is_utf8($file)) {
442            # "alien" filename, so revert
443            $deduced_filename_encoding = undef;
444        }
445        }
446    }
447    }
448   
449#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
450#       # Last chance, apply textcat to deduce filename encoding
451#       $deduced_filename_encoding = $self->textcat_encoding($file);
452#    }
453
454    if ($self->{'verbosity'}>3) {
455    my $outhandle = $self->{'outhandle'};
456
457    if (defined $deduced_filename_encoding) {
458        print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
459    }
460    else {
461        print $outhandle "  No filename encoding deduced\n";
462    }
463    }
464   
465    return $deduced_filename_encoding;
466}
467
468
469sub guess_filesystem_encoding
470{
471   my $self = shift (@_);
472    # Look to file system to provide a character encoding
473   my $deduced_filename_encoding = "";
474    # If Windows NTFS, then -- assuming we work with long file names got through
475    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
476
477    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
478        # Can do better than working with the DOS character encoding returned by locale     
479        $deduced_filename_encoding = "unicode";
480    }
481    else {
482        # Unix of some form or other
483
484        # See if we can determine the file system encoding through locale
485        $deduced_filename_encoding = $self->locale_encoding(); #utf8??
486       
487    }
488        print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
489        return $deduced_filename_encoding;
490}
491
492
493# uses locale
494sub get_filesystem_encoding
495{
496
497    my $self = shift(@_);
498
499    my $outhandle = $self->{'outhandle'};
500    my $filesystem_encoding = undef;
501
502    eval {
503    # Works for Windows as well, returning the DOS code page in use
504    use POSIX qw(locale_h);
505   
506    # With only one parameter, setlocale retrieves the
507    # current value
508    my $current_locale = setlocale(LC_CTYPE);
509   
510    my $char_encoding = undef;
511    if ($current_locale =~ m/\./) {
512        ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
513        $char_encoding = lc($char_encoding);
514    }
515    else {
516        if ($current_locale =~ m/^(posix|c)$/i) {
517        $char_encoding = "ascii";
518        }
519    }
520
521    if (defined $char_encoding) {
522        if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
523        $char_encoding = "$1\_$2\_$3";
524        }
525
526        $char_encoding =~ s/-/_/g;
527        $char_encoding =~ s/^utf_8$/utf8/;
528       
529        if ($char_encoding =~ m/^\d+$/) {
530        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
531            $char_encoding = "windows_$char_encoding";
532        }
533        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
534            $char_encoding = "dos_$char_encoding";
535        }
536        }
537       
538        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
539        || (defined $encodings::encodings->{$char_encoding})) {
540        $filesystem_encoding = $char_encoding;
541        }
542        else {
543        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
544        }
545    }
546   
547
548    };
549    if ($@) {
550    print $outhandle "$@\n";
551    print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
552   
553    }
554
555    return $filesystem_encoding;
556}
557
558
559
560# write_file -- used by ConvertToPlug, for example in post processing
561#
562# where should this go, is here the best place??
563sub utf8_write_file {
564    my $self = shift (@_);
565    my ($textref, $filename) = @_;
566   
567    if (!open (FILE, ">:utf8", $filename)) {
568    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
569    die "\n";
570    }
571    print FILE $$textref;
572   
573    close FILE;
574}
575
576sub block_raw_filename {
577
578    my $self = shift (@_);
579    my ($block_hash,$filename_full_path) = @_;
580
581    my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
582    return $self->block_filename($block_hash, $unicode_filename);
583}
584
585# block unicode string filename
586sub block_filename
587{
588    my $self = shift (@_);
589    my ($block_hash,$filename_full_path) = @_;
590    print STDERR "in block filename $filename_full_path\n";
591    print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
592     
593    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
594       # block hash contains long names, lets make sure that we were passed a long name
595       $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
596       # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
597       my $lower_filename_full_path = lc($filename_full_path);
598       $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
599   
600    }
601    elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
602    # we need to normalize the filenames
603        my $composed_filename_full_path = normalize('C', $filename_full_path);
604        print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
605        $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
606   }
607 
608    else {
609    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
610    }
611}
612
613
614# filename is raw filesystem name
615sub raw_file_is_blocked {
616     my $self = shift (@_);
617     my ($block_hash, $filename_full_path) = @_;
618
619     my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
620     return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
621}
622
623# filename must be perl unicode string
624sub file_is_blocked {
625    my $self = shift (@_);
626    my ($block_hash, $filename_full_path) = @_;
627
628    #
629    print STDERR "in file is blocked $filename_full_path\n";
630    print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
631    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
632    # convert to long filenames if needed
633    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
634    # all block paths are lowercased.
635    my $lower_filename = lc ($filename_full_path);
636    if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
637        $self->{'num_blocked'} ++;
638        return 1;
639    }
640    }
641    else {
642    if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
643        $self->{'num_blocked'} ++;
644        print STDERR "BLOCKED\n";
645        return 1;
646    }
647    }
648    # check Directory plugin's own block_exp
649    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
650    $self->{'num_blocked'} ++;
651    return 1; # blocked
652    }
653    print STDERR "NOT BLOCKED\n";
654    return 0;
655}
656
657
6581;
659
Note: See TracBrowser for help on using the browser.