root/main/trunk/greenstone2/perllib/plugins/CommonUtil.pm @ 32274

Revision 32274, 20.3 KB (checked in by ak19, 15 months ago)

Related to previous commit, forgot to commit with previous revision. A newly added string was referenced by CommonUntil?, and more recently by PDFPlugin, but was not defined in strings.properties. Corrected the reference in CommonUtil? too and committing it now.

Line 
1###########################################################################
2#
3# CommonUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, blocking stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CommonUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34use Unicode::Normalize 'normalize';
35
36use PrintInfo;
37use Encode;
38use Unicode::Normalize 'normalize';
39
40BEGIN {
41    @CommonUtil::ISA = ( 'PrintInfo' );
42}
43
44our $encoding_list =
45    [ { 'name' => "ascii",
46    'desc' => "{CommonUtil.encoding.ascii}" },
47      { 'name' => "utf8",
48    'desc' => "{CommonUtil.encoding.utf8}" },
49      { 'name' => "unicode",
50    'desc' => "{CommonUtil.encoding.unicode}" } ];
51
52
53my $e = $encodings::encodings;
54foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
55{
56    my $hashEncode =
57    {'name' => $enc,
58     'desc' => $e->{$enc}->{'name'}};
59   
60    push(@{$encoding_list},$hashEncode);
61}
62
63our $encoding_plus_auto_list =
64    [ { 'name' => "auto",
65    'desc' => "{CommonUtil.filename_encoding.auto}" },
66      { 'name' => "auto-language-analysis",
67    'desc' => "{CommonUtil.filename_encoding.auto_language_analysis}" }, # textcat
68      { 'name' => "auto-filesystem-encoding",
69    'desc' => "{CommonUtil.filename_encoding.auto_filesystem_encoding}" }, # locale
70      { 'name' => "auto-fl",
71    'desc' => "{CommonUtil.filename_encoding.auto_fl}" }, # locale followed by textcat
72      { 'name' => "auto-lf",
73    'desc' => "{CommonUtil.filename_encoding.auto_lf}" } ]; # texcat followed by locale
74
75push(@{$encoding_plus_auto_list},@{$encoding_list});
76
77my $arguments =
78    [  { 'name' => "block_exp",
79     'desc' => "{CommonUtil.block_exp}",
80     'type' => "regexp",
81     'deft' => "",
82     'reqd' => "no" },
83       { 'name' => "no_blocking",
84     'desc' => "{CommonUtil.no_blocking}",
85     'type' => "flag",
86     'reqd' => "no"},
87       { 'name' => "filename_encoding",
88     'desc' => "{CommonUtil.filename_encoding}",
89     'type' => "enum",
90     'deft' => "auto",
91     'list' => $encoding_plus_auto_list,
92     'reqd' => "no" }
93    ];
94
95my $options = { 'name'     => "CommonUtil",
96        'desc'     => "{CommonUtil.desc}",
97        'abstract' => "yes",
98        'inherits' => "no",
99        'args'     => $arguments };
100
101
102sub new {
103
104    my ($class) = shift (@_);
105    my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
106    push(@$pluginlist, $class);
107
108    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
109    push(@{$hashArgOptLists->{"OptList"}},$options);
110
111    my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
112
113    return bless $self, $class;
114 
115}
116
117sub init {   
118    my $self = shift (@_);
119    my ($verbosity, $outhandle, $failhandle) = @_;
120
121    # verbosity is passed through from the processor
122    $self->{'verbosity'} = $verbosity;
123
124    # as are the outhandle and failhandle
125    $self->{'outhandle'} = $outhandle if defined $outhandle;
126    $self->{'failhandle'} = $failhandle;
127
128}
129
130# converts raw filesystem filename to perl unicode format
131sub raw_filename_to_unicode {
132    my $self = shift (@_);
133    my ($file) = @_;
134
135    my $unicode_file = "";
136    ### need it in perl unicode, not raw filesystem
137    my $filename_encoding =  $self->guess_filesystem_encoding(); 
138       
139    # copied this from set_Source_metadata in BaseImporter
140    if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
141    # Use filename_encoding to map raw filename to a Perl unicode-aware string
142    $unicode_file = decode($filename_encoding,$file);       
143    }
144    else {
145    # otherwise generate %xx encoded version of filename for char > 127
146    $unicode_file = &unicode::raw_filename_to_url_encoded($file);
147    }
148    return $unicode_file;
149
150}
151# just converts path as is to utf8.
152sub filepath_to_utf8 {
153    my $self = shift (@_); 
154    my ($file, $file_encoding) = @_;
155    my $filemeta = $file;
156
157    my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
158
159    # Whenever filename-encoding is set to any of the auto settings, we
160    # check if the filename is already in UTF8. If it is, then we're done.
161    if($filename_encoding =~ m/auto/) {
162    if(&unicode::check_is_utf8($filemeta))
163    {
164        $filename_encoding = "utf8";
165        return $filemeta;
166    }
167    }
168   
169    # Auto setting, but filename is not utf8
170    if ($filename_encoding eq "auto")
171    {
172    # try textcat
173    $filename_encoding = $self->textcat_encoding($filemeta);
174   
175    # check the locale next
176    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
177   
178   
179    # now try the encoding of the document, if available
180    if ($filename_encoding eq "undefined" && defined $file_encoding) {
181        $filename_encoding = $file_encoding;
182    }
183
184    }
185
186    elsif ($filename_encoding eq "auto-language-analysis")
187    {   
188    $filename_encoding = $self->textcat_encoding($filemeta);
189
190    # now try the encoding of the document, if available
191    if ($filename_encoding eq "undefined" && defined $file_encoding) {
192        $filename_encoding = $file_encoding;
193    }
194    }
195
196    elsif ($filename_encoding eq "auto-filesystem-encoding")
197    {   
198    # try locale
199    $filename_encoding = $self->locale_encoding();
200    }
201
202    elsif ($filename_encoding eq "auto-fl")
203    {
204    # filesystem-encoding (locale) then language-analysis (textcat)
205    $filename_encoding = $self->locale_encoding();
206   
207    # try textcat
208    $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
209   
210    # else assume filename encoding is encoding of file content, if that's available
211    if ($filename_encoding eq "undefined" && defined $file_encoding) {
212        $filename_encoding = $file_encoding;
213    }
214    }
215   
216    elsif ($filename_encoding eq "auto-lf")
217    {
218    # language-analysis (textcat) then filesystem-encoding (locale)
219    $filename_encoding = $self->textcat_encoding($filemeta);
220   
221    # guess filename encoding from encoding of file content, if available
222    if ($filename_encoding eq "undefined" && defined $file_encoding) {
223        $filename_encoding = $file_encoding;
224    }
225
226    # try locale
227    $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
228    }
229   
230    # if still undefined, use utf8 as fallback
231    if ($filename_encoding eq "undefined") {
232    $filename_encoding = "utf8";
233    }
234
235    #print STDERR "**** UTF8 encoding the filename $filemeta ";
236   
237    # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
238    # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
239    # be always utf8 (in which case the filename's encoding is also set as utf8 even
240    # though the filename need not be if it originates from another system)--in such
241    # cases attempt to make the filename utf8 to match.
242    if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
243    &unicode::ensure_utf8(\$filemeta);
244    }
245
246    # convert non-unicode encodings to utf8
247    if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
248    $filemeta = &unicode::unicode2utf8(
249                       &unicode::convert2unicode($filename_encoding, \$filemeta)
250                       );
251    }
252
253    #print STDERR " from encoding $filename_encoding -> $filemeta\n";
254    return $filemeta;
255}
256
257# gets the filename with no path, converts to utf8, and then dm safes it.
258# filename_encoding set by user
259sub filename_to_utf8_metadata
260{
261    my $self = shift (@_); 
262    my ($file, $file_encoding) = @_;
263
264    my $outhandle = $self->{'outhandle'};
265
266    print $outhandle "****!!!!**** CommonUtil::filename_to_utf8_metadata now deprecated\n";
267    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
268    print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
269
270    my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
271    $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
272
273    return $filemeta;
274}
275
276sub locale_encoding {
277    my $self = shift(@_);
278   
279    if (!defined $self->{'filesystem_encoding'}) {
280    $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
281    }
282
283    #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
284    return $self->{'filesystem_encoding'}; # can be the string "undefined"
285}
286
287
288sub textcat_encoding {
289    my $self = shift(@_);
290    my ($filemeta) = @_;
291
292    # analyse filenames without extensions and digits (and trimmed of
293    # surrounding whitespace), so that irrelevant chars don't confuse
294    # textcat
295    my $strictfilemeta = $filemeta;
296    $strictfilemeta =~ s/\.[^\.]+$//g;
297    $strictfilemeta =~ s/\d//g;
298    $strictfilemeta =~ s/^\s*//g;
299    $strictfilemeta =~ s/\s*$//g;
300   
301    my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
302    if(!defined $filename_encoding) {
303    $filename_encoding = "undefined";
304    }
305
306    return $filename_encoding; # can be the string "undefined"
307}
308
309# performs textcat
310sub encoding_from_language_analysis {
311    my $self = shift(@_);
312    my ($text) = @_;
313
314    my $outhandle = $self->{'outhandle'};
315    my $best_encoding = undef;
316   
317    # get the language/encoding of the textstring using textcat
318    require textcat;  # Only load the textcat module if it is required
319    $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
320    my $results = $self->{'textcat'}->classify_cached_filename(\$text);
321
322
323    if (scalar @$results < 0) {
324    return undef;
325    }
326   
327    # We have some results, we choose the first
328    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
329   
330    $best_encoding = $encoding;
331    if (!defined $best_encoding) {
332    return undef;
333    }
334   
335    if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
336    # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
337    $best_encoding = 'utf8';
338    }
339   
340   
341    # check for equivalents where textcat doesn't have some encodings...
342    # eg MS versions of standard encodings
343    if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
344    my $iso = $1; # which variant of the iso standard?
345    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
346    if ($text =~ /[\x80-\x9f]/) {
347        # Western Europe
348        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
349        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
350        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
351        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
352        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
353        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
354        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
355    }
356    }
357   
358    if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
359    !defined $encodings::encodings->{$best_encoding})
360    {
361    if ($self->{'verbosity'}) {
362        gsprintf($outhandle, "CommonUtil: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
363    }
364    $best_encoding = undef;
365    }
366   
367    return $best_encoding;
368}
369
370
371
372sub deduce_filename_encoding
373{
374    my $self = shift (@_); 
375    my ($file,$metadata,$plugin_filename_encoding) = @_;
376
377    my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
378    my $deduced_filename_encoding = undef;
379   
380    # Start by looking for manually assigned metadata
381    if (defined $gs_filename_encoding) {
382    if (ref ($gs_filename_encoding) eq "ARRAY") {
383        my $outhandle = $self->{'outhandle'};
384       
385        $deduced_filename_encoding = $gs_filename_encoding->[0];
386       
387        my $num_vals = scalar(@$gs_filename_encoding);
388        if ($num_vals>1) {
389        print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
390        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
391        }
392    }
393    else {
394        $deduced_filename_encoding = $gs_filename_encoding;
395    }
396    }
397   
398    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
399    # Look to see if plugin specifies this value
400
401    if (defined $plugin_filename_encoding) {
402        # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
403        if ($plugin_filename_encoding =~ m/^auto-.*$/) {
404        my $outhandle = $self->{'outhandle'};
405        print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
406        print $outhandle "         default to 'auto'\n";
407        $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
408        }
409       
410        if ($plugin_filename_encoding ne "auto") {
411        # We've been given a specific filenamne encoding
412        # => so use it!
413        $deduced_filename_encoding = $plugin_filename_encoding;
414        }
415    }
416    }
417   
418    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
419
420    # Look to file system to provide a character encoding
421
422    # If Windows NTFS, then -- assuming we work with long file names got through
423    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
424
425    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
426        # Can do better than working with the DOS character encoding returned by locale     
427        $deduced_filename_encoding = "unicode";
428    }
429    else {
430        # Unix of some form or other
431
432        # See if we can determine the file system encoding through locale
433        $deduced_filename_encoding = $self->locale_encoding();
434       
435        # if locale shows us filesystem is utf8, check to see filename is consistent
436        # => if not, then we have an "alien" filename on our hands
437
438        if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
439        if (!&unicode::check_is_utf8($file)) {
440            # "alien" filename, so revert
441            $deduced_filename_encoding = undef;
442        }
443        }
444    }
445    }
446   
447#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
448#       # Last chance, apply textcat to deduce filename encoding
449#       $deduced_filename_encoding = $self->textcat_encoding($file);
450#    }
451
452    if ($self->{'verbosity'}>3) {
453    my $outhandle = $self->{'outhandle'};
454
455    if (defined $deduced_filename_encoding) {
456        print $outhandle "  Deduced filename encoding as: $deduced_filename_encoding\n";
457    }
458    else {
459        print $outhandle "  No filename encoding deduced\n";
460    }
461    }
462   
463    return $deduced_filename_encoding;
464}
465
466
467sub guess_filesystem_encoding
468{
469   my $self = shift (@_);
470    # Look to file system to provide a character encoding
471   my $deduced_filename_encoding = "";
472    # If Windows NTFS, then -- assuming we work with long file names got through
473    # Win32::GetLongFilePath() -- then the underlying file system is UTF16
474
475    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
476        # Can do better than working with the DOS character encoding returned by locale     
477        $deduced_filename_encoding = "unicode";
478    }
479    else {
480        # Unix of some form or other
481
482        # See if we can determine the file system encoding through locale
483        $deduced_filename_encoding = $self->locale_encoding(); #utf8??
484       
485    }
486   return $deduced_filename_encoding;
487}
488
489
490# uses locale
491sub get_filesystem_encoding
492{
493
494    my $self = shift(@_);
495
496    my $outhandle = $self->{'outhandle'};
497    my $filesystem_encoding = undef;
498
499    eval {
500    # Works for Windows as well, returning the DOS code page in use
501    use POSIX qw(locale_h);
502   
503    # With only one parameter, setlocale retrieves the
504    # current value
505    my $current_locale = setlocale(LC_CTYPE);
506   
507    my $char_encoding = undef;
508    if ($current_locale =~ m/\./) {
509        ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
510        $char_encoding = lc($char_encoding);
511    }
512    else {
513        if ($current_locale =~ m/^(posix|c)$/i) {
514        $char_encoding = "ascii";
515        }
516    }
517
518    if (defined $char_encoding) {
519        if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
520        $char_encoding = "$1\_$2\_$3";
521        }
522
523        $char_encoding =~ s/-/_/g;
524        $char_encoding =~ s/^utf_8$/utf8/;
525       
526        if ($char_encoding =~ m/^\d+$/) {
527        if (defined $encodings::encodings->{"windows_$char_encoding"}) {
528            $char_encoding = "windows_$char_encoding";
529        }
530        elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
531            $char_encoding = "dos_$char_encoding";
532        }
533        }
534       
535        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
536        || (defined $encodings::encodings->{$char_encoding})) {
537        $filesystem_encoding = $char_encoding;
538        }
539        else {
540        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
541        }
542    }
543   
544
545    };
546    if ($@) {
547    print $outhandle "$@\n";
548    print $outhandle "Warning: Unable to establish locale.  Will assume filesystem is UTF-8\n";
549   
550    }
551
552    return $filesystem_encoding;
553}
554
555
556
557# write_file -- used by ConvertToPlug, for example in post processing
558#
559# where should this go, is here the best place??
560sub utf8_write_file {
561    my $self = shift (@_);
562    my ($textref, $filename) = @_;
563   
564    if (!open (FILE, ">:utf8", $filename)) {
565    gsprintf(STDERR, "CommonUtil::utf8_write_file {CommonUtil.could_not_open_for_writing} ($!)\n", $filename);
566    die "\n";
567    }
568    print FILE $$textref;
569   
570    close FILE;
571}
572
573sub block_raw_filename {
574
575    my $self = shift (@_);
576    my ($block_hash,$filename_full_path) = @_;
577
578    my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
579    return $self->block_filename($block_hash, $unicode_filename);
580}
581
582# block unicode string filename
583sub block_filename
584{
585    my $self = shift (@_);
586    my ($block_hash,$filename_full_path) = @_;
587     
588    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
589       # block hash contains long names, lets make sure that we were passed a long name
590       $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
591       # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
592       my $lower_filename_full_path = lc($filename_full_path);
593       $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
594   
595    }
596    elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
597    # we need to normalize the filenames
598        my $composed_filename_full_path = normalize('C', $filename_full_path);
599       ## print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
600        $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
601   }
602 
603    else {
604    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
605    }
606}
607
608
609# filename is raw filesystem name
610sub raw_file_is_blocked {
611     my $self = shift (@_);
612     my ($block_hash, $filename_full_path) = @_;
613
614     my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
615     return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
616}
617
618# filename must be perl unicode string
619sub file_is_blocked {
620    my $self = shift (@_);
621    my ($block_hash, $filename_full_path) = @_;
622
623    #
624    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
625    # convert to long filenames if needed
626    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
627    # all block paths are lowercased.
628    my $lower_filename = lc ($filename_full_path);
629    if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
630        $self->{'num_blocked'} ++;
631        return 1;
632    }
633    }
634    elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
635
636    # on mac, we want composed form in the block hash   
637        my $composed_form = normalize('C', $filename_full_path);
638        if (defined $block_hash->{'file_blocks'}->{$composed_form}) {
639            $self->{'num_blocked'} ++;
640            return 1;
641        }
642    }
643
644    else {
645    if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
646        $self->{'num_blocked'} ++;
647        return 1;
648    }
649    }
650    # check Directory plugin's own block_exp
651    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
652    $self->{'num_blocked'} ++;
653    return 1; # blocked
654    }
655    return 0;
656}
657
658
6591;
660
Note: See TracBrowser for help on using the browser.