root/main/trunk/greenstone2/perllib/plugins/EncodingUtil.pm @ 31478

Revision 31478, 20.5 KB (checked in by kjdon, 3 years ago)

blocking stuff moved to here

Line 
1###########################################################################
2#
3# EncodingUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package EncodingUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34
35use PrintInfo;
36use Encode;
37
38BEGIN {
39    @EncodingUtil::ISA = ( 'PrintInfo' );
40}
41
42our $encoding_list =
43    [ { 'name' => "ascii",
44    'desc' => "{BasePlugin.encoding.ascii}" },
45      { 'name' => "utf8",
46    'desc' => "{BasePlugin.encoding.utf8}" },
47      { 'name' => "unicode",
48    'desc' => "{BasePlugin.encoding.unicode}" } ];
49
50
51my $e = $encodings::encodings;
52foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
53{
54    my $hashEncode =
55    {'name' => $enc,
56     'desc' => $e->{$enc}->{'name'}};
57   
58    push(@{$encoding_list},$hashEncode);
59}
60
61our $encoding_plus_auto_list =
62    [ { 'name' => "auto",
63    'desc' => "{BasePlugin.filename_encoding.auto}" },
64      { 'name' => "auto-language-analysis",
65    'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
66      { 'name' => "auto-filesystem-encoding",
67    'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
68      { 'name' => "auto-fl",
69    'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
70      { 'name' => "auto-lf",
71    'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
72
73push(@{$encoding_plus_auto_list},@{$encoding_list});
74
75my $arguments =
76    [  { 'name' => "block_exp",
77     'desc' => "{BasePlugin.block_exp}",
78     'type' => "regexp",
79     'deft' => "",
80     'reqd' => "no" },
81       { 'name' => "no_blocking",
82     'desc' => "{BasePlugin.no_blocking}",
83     'type' => "flag",
84     'reqd' => "no"},
85       { 'name' => "filename_encoding",
86     'desc' => "{BasePlugin.filename_encoding}",
87     'type' => "enum",
88     'deft' => "auto",
89     'list' => $encoding_plus_auto_list,
90     'reqd' => "no" }
91    ];
92
93my $options = { 'name'     => "EncodingUtil",
94        'desc'     => "{EncodingUtil.desc}",
95        'abstract' => "yes",
96        'inherits' => "no",
97        'args'     => $arguments };
98
99
100sub new {
101
102    my ($class) = shift (@_);
103    my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
104    push(@$pluginlist, $class);
105
106    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107    push(@{$hashArgOptLists->{"OptList"}},$options);
108
109    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
110
111    return bless $self, $class;
112 
113}
114
115sub init {   
116    my $self = shift (@_);
117    my ($verbosity, $outhandle, $failhandle) = @_;
118
119    print STDERR "guess encoding = ".$self->guess_filesystem_encoding()."\n";
120    print STDERR "get encoding = ".$self->get_filesystem_encoding()."\n";
121
122    # verbosity is passed through from the processor
123    $self->{'verbosity'} = $verbosity;
124
125    # as are the outhandle and failhandle
126    $self->{'outhandle'} = $outhandle if defined $outhandle;
127    $self->{'failhandle'} = $failhandle;
128
129}
130
131# converts raw filesystem filename to perl unicode format
132sub raw_filename_to_unicode {
133    my $self = shift (@_);
134    my ($file) = @_;
135
136    my $unicode_file = "";
137    ### need it in perl unicode, not raw filesystem
138    my $filename_encoding =  $self->guess_filesystem_encoding(); 
139       
140    # copied this from set_Source_metadata in BasePlugin
141    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
142    # Use filename_encoding to map raw filename to a Perl unicode-aware string
143    $unicode_file = decode($filename_encoding,$file);       
144    }
145    else {
146    # otherwise generate %xx encoded version of filename for char > 127
147    $unicode_file = &unicode::raw_filename_to_url_encoded($file);
148    }
149    return $unicode_file;
150
151}
152# just converts path as is to utf8.
153sub filepath_to_utf8 {
154    my $self = shift (@_); 
155    my ($file, $file_encoding) = @_;
156    my $filemeta = $file;
157
158    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
159
160    # Whenever filename-encoding is set to any of the auto settings, we
161    # check if the filename is already in UTF8. If it is, then we're done.
162    if($filename_encoding =~ m/auto/) {
163    if(&unicode::check_is_utf8($filemeta))
164    {
165        $filename_encoding = "utf8";
166        return $filemeta;
167    }
168    }
169   
170    # Auto setting, but filename is not utf8
171    if ($filename_encoding eq "auto")
172    {
173    # try textcat
174    $filename_encoding = $self->textcat_encoding($filemeta);
175   
176    # check the locale next
177    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
178   
179   
180    # now try the encoding of the document, if available
181    if ($filename_encoding eq "undefined" && defined $file_encoding) {
182        $filename_encoding = $file_encoding;
183    }
184
185    }
186
187    elsif ($filename_encoding eq "auto-language-analysis")
188    {   
189    $filename_encoding = $self->textcat_encoding($filemeta);
190
191    # now try the encoding of the document, if available
192    if ($filename_encoding eq "undefined" && defined $file_encoding) {
193        $filename_encoding = $file_encoding;
194    }
195    }
196
197    elsif ($filename_encoding eq "auto-filesystem-encoding")
198    {   
199    # try locale
200    $filename_encoding = $self->locale_encoding();
201    }
202
203    elsif ($filename_encoding eq "auto-fl")
204    {
205    # filesystem-encoding (locale) then language-analysis (textcat)
206    $filename_encoding = $self->locale_encoding();
207   
208    # try textcat
209    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
210   
211    # else assume filename encoding is encoding of file content, if that's available
212    if ($filename_encoding eq "undefined" && defined $file_encoding) {
213        $filename_encoding = $file_encoding;
214    }
215    }
216   
217    elsif ($filename_encoding eq "auto-lf")
218    {
219    # language-analysis (textcat) then filesystem-encoding (locale)
220    $filename_encoding = $self->textcat_encoding($filemeta);
221   
222    # guess filename encoding from encoding of file content, if available
223    if ($filename_encoding eq "undefined" && defined $file_encoding) {
224        $filename_encoding = $file_encoding;
225    }
226
227    # try locale
228    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
229    }
230   
231    # if still undefined, use utf8 as fallback
232    if ($filename_encoding eq "undefined") {
233    $filename_encoding = "utf8";
234    }
235
236    #print STDERR "**** UTF8 encoding the filename $filemeta ";
237   
238    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
239    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
240    # be always utf8 (in which case the filename's encoding is also set as utf8 even
241    # though the filename need not be if it originates from another system)--in such
242    # cases attempt to make the filename utf8 to match.
243    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
244    &unicode::ensure_utf8(\$filemeta);
245    }
246
247    # convert non-unicode encodings to utf8
248    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
249    $filemeta = &unicode::unicode2utf8(
250                       &unicode::convert2unicode($filename_encoding, \$filemeta)
251                       );
252    }
253
254    #print STDERR " from encoding $filename_encoding -> $filemeta\n";
255    return $filemeta;
256}
257
258# gets the filename with no path, converts to utf8, and then dm safes it.
259# filename_encoding set by user
260sub filename_to_utf8_metadata
261{
262    my $self = shift (@_); 
263    my ($file, $file_encoding) = @_;
264
265    my $outhandle = $self->{'outhandle'};
266
267    print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
268    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
269    print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
270
271    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
272    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
273
274    return $filemeta;
275}
276
277sub locale_encoding {
278    my $self = shift(@_);
279   
280    if (!defined $self->{'filesystem_encoding'}) {
281    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
282    }
283
284    #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
285    return $self->{'filesystem_encoding'}; # can be the string "undefined"
286}
287
288
289sub textcat_encoding {
290    my $self = shift(@_);
291    my ($filemeta) = @_;
292
293    # analyse filenames without extensions and digits (and trimmed of
294    # surrounding whitespace), so that irrelevant chars don't confuse
295    # textcat
296    my $strictfilemeta = $filemeta;
297    $strictfilemeta =~ s/\.[^\.]+$//g;
298    $strictfilemeta =~ s/\d//g;
299    $strictfilemeta =~ s/^\s*//g;
300    $strictfilemeta =~ s/\s*$//g;
301   
302    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
303    if(!defined $filename_encoding) {
304    $filename_encoding = "undefined";
305    }
306
307    return $filename_encoding; # can be the string "undefined"
308}
309
310# performs textcat
311sub encoding_from_language_analysis {
312    my $self = shift(@_);
313    my ($text) = @_;
314
315    my $outhandle = $self->{'outhandle'};
316    my $best_encoding = undef;
317   
318    # get the language/encoding of the textstring using textcat
319    require textcat;  # Only load the textcat module if it is required
320    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
321    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
322
323
324    if (scalar @$results < 0) {
325    return undef;
326    }
327   
328    # We have some results, we choose the first
329    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
330   
331    $best_encoding = $encoding;
332    if (!defined $best_encoding) {
333    return undef;
334    }
335   
336    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
337    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
338    $best_encoding = 'utf8';
339    }
340   
341   
342    # check for equivalents where textcat doesn't have some encodings...
343    # eg MS versions of standard encodings
344    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
345    my $iso = $1; # which variant of the iso standard?
346    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
347    if ($text =~ /[\x80-\x9f]/) {
348        # Western Europe
349        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
350        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
351        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
352        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
353        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
354        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
355        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
356    }
357    }
358   
359    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
360    !defined $encodings::encodings->{$best_encoding})
361    {
362    if ($self->{'verbosity'}) {
363        gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
364    }
365    $best_encoding = undef;
366    }
367   
368    return $best_encoding;
369}
370
371
372
373sub deduce_filename_encoding
374{
375    my $self = shift (@_); 
376    my ($file,$metadata,$plugin_filename_encoding) = @_;
377
378    my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
379    my $deduced_filename_encoding = undef;
380   
381    # Start by looking for manually assigned metadata
382    if (defined $gs_filename_encoding) {
383    if (ref ($gs_filename_encoding) eq "ARRAY") {
384        my $outhandle = $self->{'outhandle'};
385       
386        $deduced_filename_encoding = $gs_filename_encoding->[0];
387       
388        my $num_vals = scalar(@$gs_filename_encoding);
389        if ($num_vals>1) {
390        print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
391        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
392        }
393    }
394    else {
395        $deduced_filename_encoding = $gs_filename_encoding;
396    }
397    }
398   
399    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
400    # Look to see if plugin specifies this value
401
402    if (defined $plugin_filename_encoding) {
403        # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
404        if ($plugin_filename_encoding =~ m/^auto-.*$/) {
405        my $outhandle = $self->{'outhandle'};
406        print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
407        print $outhandle "         default to 'auto'\n";
408        $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
409        }
410       
411        if ($plugin_filename_encoding ne "auto") {
412        # We've been given a specific filenamne encoding
413        # => so use it!
414        $deduced_filename_encoding = $plugin_filename_encoding;
415        }
416    }
417    }
418   
419    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
420
421    # Look to file system to provide a character encoding
422
423    # If Windows NTFS, then -- assuming we work with long file names got through
424    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
425
426    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
427        # Can do better than working with the DOS character encoding returned by locale     
428        $deduced_filename_encoding = "unicode";
429    }
430    else {
431        # Unix of some form or other
432
433        # See if we can determine the file system encoding through locale
434        $deduced_filename_encoding = $self->locale_encoding();
435       
436        # if locale shows us filesystem is utf8, check to see filename is consistent
437        # => if not, then we have an "alien" filename on our hands
438
439        if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
440        if (!&unicode::check_is_utf8($file)) {
441            # "alien" filename, so revert
442            $deduced_filename_encoding = undef;
443        }
444        }
445    }
446    }
447   
448#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
449#       # Last chance, apply textcat to deduce filename encoding
450#       $deduced_filename_encoding = $self->textcat_encoding($file);
451#    }
452
453    if ($self->{'verbosity'}>3) {
454    my $outhandle = $self->{'outhandle'};
455
456    if (defined $deduced_filename_encoding) {
457        print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
458    }
459    else {
460        print $outhandle "  No filename encoding deduced\n";
461    }
462    }
463   
464    return $deduced_filename_encoding;
465}
466
467
468sub guess_filesystem_encoding
469{
470   my $self = shift (@_);
471    # Look to file system to provide a character encoding
472   my $deduced_filename_encoding = "";
473    # If Windows NTFS, then -- assuming we work with long file names got through
474    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
475
476    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
477        # Can do better than working with the DOS character encoding returned by locale     
478        $deduced_filename_encoding = "unicode";
479    }
480    else {
481        # Unix of some form or other
482
483        # See if we can determine the file system encoding through locale
484        $deduced_filename_encoding = $self->locale_encoding(); #utf8??
485       
486    }
487        print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
488        return $deduced_filename_encoding;
489}
490
491
492# uses locale
493sub get_filesystem_encoding
494{
495
496    my $self = shift(@_);
497
498    my $outhandle = $self->{'outhandle'};
499    my $filesystem_encoding = undef;
500
501    eval {
502    # Works for Windows as well, returning the DOS code page in use
503    use POSIX qw(locale_h);
504   
505    # With only one parameter, setlocale retrieves the
506    # current value
507    my $current_locale = setlocale(LC_CTYPE);
508   
509    my $char_encoding = undef;
510    if ($current_locale =~ m/\./) {
511        ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
512        $char_encoding = lc($char_encoding);
513    }
514    else {
515        if ($current_locale =~ m/^(posix|c)$/i) {
516        $char_encoding = "ascii";
517        }
518    }
519
520    if (defined $char_encoding) {
521        if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
522        $char_encoding = "$1\_$2\_$3";
523        }
524
525        $char_encoding =~ s/-/_/g;
526        $char_encoding =~ s/^utf_8$/utf8/;
527       
528        if ($char_encoding =~ m/^\d+$/) {
529        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
530            $char_encoding = "windows_$char_encoding";
531        }
532        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
533            $char_encoding = "dos_$char_encoding";
534        }
535        }
536       
537        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
538        || (defined $encodings::encodings->{$char_encoding})) {
539        $filesystem_encoding = $char_encoding;
540        }
541        else {
542        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
543        }
544    }
545   
546
547    };
548    if ($@) {
549    print $outhandle "$@\n";
550    print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
551   
552    }
553
554    return $filesystem_encoding;
555}
556
557
558
559# write_file -- used by ConvertToPlug, for example in post processing
560#
561# where should this go, is here the best place??
562sub utf8_write_file {
563    my $self = shift (@_);
564    my ($textref, $filename) = @_;
565   
566    if (!open (FILE, ">:utf8", $filename)) {
567    gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
568    die "\n";
569    }
570    print FILE $$textref;
571   
572    close FILE;
573}
574
575sub block_raw_filename {
576
577    my $self = shift (@_);
578    my ($block_hash,$filename_full_path) = @_;
579
580    my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
581    return $self->block_filename($block_hash, $unicode_filename);
582}
583
584# block unicode string filename
585sub block_filename
586{
587    my $self = shift (@_);
588    my ($block_hash,$filename_full_path) = @_;
589    print STDERR "in block filename $filename_full_path\n";
590    print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
591     
592    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
593       # block hash contains long names, lets make sure that we were passed a long name
594       $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
595       # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
596       my $lower_filename_full_path = lc($filename_full_path);
597       $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
598   
599    }
600    elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
601    # we need to normalize the filenames
602        my $composed_filename_full_path = normalize('C', $filename_full_path);
603        print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
604        $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
605   }
606 
607    else {
608    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
609    }
610}
611
612
613# filename is raw filesystem name
614sub raw_file_is_blocked {
615     my $self = shift (@_);
616     my ($block_hash, $filename_full_path) = @_;
617
618     my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
619     return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
620}
621
622# filename must be perl unicode string
623sub file_is_blocked {
624    my $self = shift (@_);
625    my ($block_hash, $filename_full_path) = @_;
626
627    #
628    print STDERR "in file is blocked $filename_full_path\n";
629    print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
630    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
631    # convert to long filenames if needed
632    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
633    # all block paths are lowercased.
634    my $lower_filename = lc ($filename_full_path);
635    if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
636        $self->{'num_blocked'} ++;
637        return 1;
638    }
639    }
640    else {
641    if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
642        $self->{'num_blocked'} ++;
643        print STDERR "BLOCKED\n";
644        return 1;
645    }
646    }
647    # check Directory plugin's own block_exp
648    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
649    $self->{'num_blocked'} ++;
650    return 1; # blocked
651    }
652    print STDERR "NOT BLOCKED\n";
653    return 0;
654}
655
656
6571;
658
Note: See TracBrowser for help on using the browser.