source: main/trunk/greenstone2/perllib/plugins/EncodingUtil.pm@ 31487

Last change on this file since 31487 was 31487, checked in by ak19, 7 years ago

Important import statement for the recent commits related to encoding.

File size: 20.5 KB
RevLine 
[31456]1###########################################################################
2#
3# EncodingUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package EncodingUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34
35use PrintInfo;
[31478]36use Encode;
[31487]37use Unicode::Normalize 'normalize';
[31456]38
39BEGIN {
40 @EncodingUtil::ISA = ( 'PrintInfo' );
41}
42
43our $encoding_list =
44 [ { 'name' => "ascii",
45 'desc' => "{BasePlugin.encoding.ascii}" },
46 { 'name' => "utf8",
47 'desc' => "{BasePlugin.encoding.utf8}" },
48 { 'name' => "unicode",
49 'desc' => "{BasePlugin.encoding.unicode}" } ];
50
51
52my $e = $encodings::encodings;
53foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
54{
55 my $hashEncode =
56 {'name' => $enc,
57 'desc' => $e->{$enc}->{'name'}};
58
59 push(@{$encoding_list},$hashEncode);
60}
61
62our $encoding_plus_auto_list =
63 [ { 'name' => "auto",
64 'desc' => "{BasePlugin.filename_encoding.auto}" },
65 { 'name' => "auto-language-analysis",
66 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
67 { 'name' => "auto-filesystem-encoding",
68 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
69 { 'name' => "auto-fl",
70 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
71 { 'name' => "auto-lf",
72 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
73
74push(@{$encoding_plus_auto_list},@{$encoding_list});
75
76my $arguments =
[31478]77 [ { 'name' => "block_exp",
78 'desc' => "{BasePlugin.block_exp}",
79 'type' => "regexp",
80 'deft' => "",
81 'reqd' => "no" },
82 { 'name' => "no_blocking",
83 'desc' => "{BasePlugin.no_blocking}",
84 'type' => "flag",
85 'reqd' => "no"},
86 { 'name' => "filename_encoding",
87 'desc' => "{BasePlugin.filename_encoding}",
88 'type' => "enum",
89 'deft' => "auto",
90 'list' => $encoding_plus_auto_list,
91 'reqd' => "no" }
92 ];
[31456]93
94my $options = { 'name' => "EncodingUtil",
95 'desc' => "{EncodingUtil.desc}",
96 'abstract' => "yes",
97 'inherits' => "no",
98 'args' => $arguments };
99
100
101sub new {
102
103 my ($class) = shift (@_);
104 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
105 push(@$pluginlist, $class);
106
107 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
108 push(@{$hashArgOptLists->{"OptList"}},$options);
109
110 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
111
112 return bless $self, $class;
113
114}
115
116sub init {
117 my $self = shift (@_);
118 my ($verbosity, $outhandle, $failhandle) = @_;
119
120 print STDERR "guess encoding = ".$self->guess_filesystem_encoding()."\n";
121 print STDERR "get encoding = ".$self->get_filesystem_encoding()."\n";
122
123 # verbosity is passed through from the processor
124 $self->{'verbosity'} = $verbosity;
125
126 # as are the outhandle and failhandle
127 $self->{'outhandle'} = $outhandle if defined $outhandle;
128 $self->{'failhandle'} = $failhandle;
129
130}
131
[31478]132# converts raw filesystem filename to perl unicode format
133sub raw_filename_to_unicode {
134 my $self = shift (@_);
135 my ($file) = @_;
[31456]136
[31478]137 my $unicode_file = "";
138 ### need it in perl unicode, not raw filesystem
139 my $filename_encoding = $self->guess_filesystem_encoding();
140
141 # copied this from set_Source_metadata in BasePlugin
142 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
143 # Use filename_encoding to map raw filename to a Perl unicode-aware string
144 $unicode_file = decode($filename_encoding,$file);
145 }
146 else {
147 # otherwise generate %xx encoded version of filename for char > 127
148 $unicode_file = &unicode::raw_filename_to_url_encoded($file);
149 }
150 return $unicode_file;
151
152}
[31456]153# just converts path as is to utf8.
154sub filepath_to_utf8 {
155 my $self = shift (@_);
156 my ($file, $file_encoding) = @_;
157 my $filemeta = $file;
158
159 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
160
161 # Whenever filename-encoding is set to any of the auto settings, we
162 # check if the filename is already in UTF8. If it is, then we're done.
163 if($filename_encoding =~ m/auto/) {
164 if(&unicode::check_is_utf8($filemeta))
165 {
166 $filename_encoding = "utf8";
167 return $filemeta;
168 }
169 }
170
171 # Auto setting, but filename is not utf8
172 if ($filename_encoding eq "auto")
173 {
174 # try textcat
175 $filename_encoding = $self->textcat_encoding($filemeta);
176
177 # check the locale next
178 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
179
180
181 # now try the encoding of the document, if available
182 if ($filename_encoding eq "undefined" && defined $file_encoding) {
183 $filename_encoding = $file_encoding;
184 }
185
186 }
187
188 elsif ($filename_encoding eq "auto-language-analysis")
189 {
190 $filename_encoding = $self->textcat_encoding($filemeta);
191
192 # now try the encoding of the document, if available
193 if ($filename_encoding eq "undefined" && defined $file_encoding) {
194 $filename_encoding = $file_encoding;
195 }
196 }
197
198 elsif ($filename_encoding eq "auto-filesystem-encoding")
199 {
200 # try locale
201 $filename_encoding = $self->locale_encoding();
202 }
203
204 elsif ($filename_encoding eq "auto-fl")
205 {
206 # filesystem-encoding (locale) then language-analysis (textcat)
207 $filename_encoding = $self->locale_encoding();
208
209 # try textcat
210 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
211
212 # else assume filename encoding is encoding of file content, if that's available
213 if ($filename_encoding eq "undefined" && defined $file_encoding) {
214 $filename_encoding = $file_encoding;
215 }
216 }
217
218 elsif ($filename_encoding eq "auto-lf")
219 {
220 # language-analysis (textcat) then filesystem-encoding (locale)
221 $filename_encoding = $self->textcat_encoding($filemeta);
222
223 # guess filename encoding from encoding of file content, if available
224 if ($filename_encoding eq "undefined" && defined $file_encoding) {
225 $filename_encoding = $file_encoding;
226 }
227
228 # try locale
229 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
230 }
231
232 # if still undefined, use utf8 as fallback
233 if ($filename_encoding eq "undefined") {
234 $filename_encoding = "utf8";
235 }
236
237 #print STDERR "**** UTF8 encoding the filename $filemeta ";
238
239 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
240 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
241 # be always utf8 (in which case the filename's encoding is also set as utf8 even
242 # though the filename need not be if it originates from another system)--in such
243 # cases attempt to make the filename utf8 to match.
244 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
245 &unicode::ensure_utf8(\$filemeta);
246 }
247
248 # convert non-unicode encodings to utf8
249 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
250 $filemeta = &unicode::unicode2utf8(
251 &unicode::convert2unicode($filename_encoding, \$filemeta)
252 );
253 }
254
255 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
256 return $filemeta;
257}
258
259# gets the filename with no path, converts to utf8, and then dm safes it.
260# filename_encoding set by user
261sub filename_to_utf8_metadata
262{
263 my $self = shift (@_);
264 my ($file, $file_encoding) = @_;
265
266 my $outhandle = $self->{'outhandle'};
267
268 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
269 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
270 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
271
272 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
273 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
274
275 return $filemeta;
276}
277
278sub locale_encoding {
279 my $self = shift(@_);
280
281 if (!defined $self->{'filesystem_encoding'}) {
282 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
283 }
284
285 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
286 return $self->{'filesystem_encoding'}; # can be the string "undefined"
287}
288
289
290sub textcat_encoding {
291 my $self = shift(@_);
292 my ($filemeta) = @_;
293
294 # analyse filenames without extensions and digits (and trimmed of
295 # surrounding whitespace), so that irrelevant chars don't confuse
296 # textcat
297 my $strictfilemeta = $filemeta;
298 $strictfilemeta =~ s/\.[^\.]+$//g;
299 $strictfilemeta =~ s/\d//g;
300 $strictfilemeta =~ s/^\s*//g;
301 $strictfilemeta =~ s/\s*$//g;
302
303 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
304 if(!defined $filename_encoding) {
305 $filename_encoding = "undefined";
306 }
307
308 return $filename_encoding; # can be the string "undefined"
309}
310
311# performs textcat
312sub encoding_from_language_analysis {
313 my $self = shift(@_);
314 my ($text) = @_;
315
316 my $outhandle = $self->{'outhandle'};
317 my $best_encoding = undef;
318
319 # get the language/encoding of the textstring using textcat
320 require textcat; # Only load the textcat module if it is required
321 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
322 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
323
324
325 if (scalar @$results < 0) {
326 return undef;
327 }
328
329 # We have some results, we choose the first
330 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
331
332 $best_encoding = $encoding;
333 if (!defined $best_encoding) {
334 return undef;
335 }
336
337 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
338 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
339 $best_encoding = 'utf8';
340 }
341
342
343 # check for equivalents where textcat doesn't have some encodings...
344 # eg MS versions of standard encodings
345 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
346 my $iso = $1; # which variant of the iso standard?
347 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
348 if ($text =~ /[\x80-\x9f]/) {
349 # Western Europe
350 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
351 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
352 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
353 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
354 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
355 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
356 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
357 }
358 }
359
360 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
361 !defined $encodings::encodings->{$best_encoding})
362 {
363 if ($self->{'verbosity'}) {
364 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
365 }
366 $best_encoding = undef;
367 }
368
369 return $best_encoding;
370}
371
372
373
374sub deduce_filename_encoding
375{
376 my $self = shift (@_);
377 my ($file,$metadata,$plugin_filename_encoding) = @_;
378
379 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
380 my $deduced_filename_encoding = undef;
381
382 # Start by looking for manually assigned metadata
383 if (defined $gs_filename_encoding) {
384 if (ref ($gs_filename_encoding) eq "ARRAY") {
385 my $outhandle = $self->{'outhandle'};
386
387 $deduced_filename_encoding = $gs_filename_encoding->[0];
388
389 my $num_vals = scalar(@$gs_filename_encoding);
390 if ($num_vals>1) {
391 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
392 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
393 }
394 }
395 else {
396 $deduced_filename_encoding = $gs_filename_encoding;
397 }
398 }
399
400 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
401 # Look to see if plugin specifies this value
402
403 if (defined $plugin_filename_encoding) {
404 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
405 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
406 my $outhandle = $self->{'outhandle'};
407 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
408 print $outhandle " default to 'auto'\n";
409 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
410 }
411
412 if ($plugin_filename_encoding ne "auto") {
413 # We've been given a specific filenamne encoding
414 # => so use it!
415 $deduced_filename_encoding = $plugin_filename_encoding;
416 }
417 }
418 }
419
420 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
421
422 # Look to file system to provide a character encoding
423
424 # If Windows NTFS, then -- assuming we work with long file names got through
425 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
426
427 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
428 # Can do better than working with the DOS character encoding returned by locale
429 $deduced_filename_encoding = "unicode";
430 }
431 else {
432 # Unix of some form or other
433
434 # See if we can determine the file system encoding through locale
435 $deduced_filename_encoding = $self->locale_encoding();
436
437 # if locale shows us filesystem is utf8, check to see filename is consistent
438 # => if not, then we have an "alien" filename on our hands
439
440 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
441 if (!&unicode::check_is_utf8($file)) {
442 # "alien" filename, so revert
443 $deduced_filename_encoding = undef;
444 }
445 }
446 }
447 }
448
449# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
450# # Last chance, apply textcat to deduce filename encoding
451# $deduced_filename_encoding = $self->textcat_encoding($file);
452# }
453
454 if ($self->{'verbosity'}>3) {
455 my $outhandle = $self->{'outhandle'};
456
457 if (defined $deduced_filename_encoding) {
458 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
459 }
460 else {
461 print $outhandle " No filename encoding deduced\n";
462 }
463 }
464
465 return $deduced_filename_encoding;
466}
467
468
469sub guess_filesystem_encoding
470{
471 my $self = shift (@_);
472 # Look to file system to provide a character encoding
473 my $deduced_filename_encoding = "";
474 # If Windows NTFS, then -- assuming we work with long file names got through
475 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
476
477 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
478 # Can do better than working with the DOS character encoding returned by locale
479 $deduced_filename_encoding = "unicode";
480 }
481 else {
482 # Unix of some form or other
483
484 # See if we can determine the file system encoding through locale
485 $deduced_filename_encoding = $self->locale_encoding(); #utf8??
486
487 }
488 print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
489 return $deduced_filename_encoding;
490}
491
492
493# uses locale
494sub get_filesystem_encoding
495{
496
497 my $self = shift(@_);
498
499 my $outhandle = $self->{'outhandle'};
500 my $filesystem_encoding = undef;
501
502 eval {
503 # Works for Windows as well, returning the DOS code page in use
504 use POSIX qw(locale_h);
505
506 # With only one parameter, setlocale retrieves the
507 # current value
508 my $current_locale = setlocale(LC_CTYPE);
509
510 my $char_encoding = undef;
511 if ($current_locale =~ m/\./) {
512 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
513 $char_encoding = lc($char_encoding);
514 }
515 else {
516 if ($current_locale =~ m/^(posix|c)$/i) {
517 $char_encoding = "ascii";
518 }
519 }
520
521 if (defined $char_encoding) {
522 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
523 $char_encoding = "$1\_$2\_$3";
524 }
525
526 $char_encoding =~ s/-/_/g;
527 $char_encoding =~ s/^utf_8$/utf8/;
528
529 if ($char_encoding =~ m/^\d+$/) {
530 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
531 $char_encoding = "windows_$char_encoding";
532 }
533 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
534 $char_encoding = "dos_$char_encoding";
535 }
536 }
537
538 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
539 || (defined $encodings::encodings->{$char_encoding})) {
540 $filesystem_encoding = $char_encoding;
541 }
542 else {
543 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
544 }
545 }
546
547
548 };
549 if ($@) {
550 print $outhandle "$@\n";
551 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
552
553 }
554
555 return $filesystem_encoding;
556}
557
558
559
560# write_file -- used by ConvertToPlug, for example in post processing
561#
562# where should this go, is here the best place??
563sub utf8_write_file {
564 my $self = shift (@_);
565 my ($textref, $filename) = @_;
566
567 if (!open (FILE, ">:utf8", $filename)) {
568 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
569 die "\n";
570 }
571 print FILE $$textref;
572
573 close FILE;
574}
575
[31478]576sub block_raw_filename {
577
578 my $self = shift (@_);
579 my ($block_hash,$filename_full_path) = @_;
580
581 my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
582 return $self->block_filename($block_hash, $unicode_filename);
583}
584
585# block unicode string filename
586sub block_filename
587{
588 my $self = shift (@_);
589 my ($block_hash,$filename_full_path) = @_;
590 print STDERR "in block filename $filename_full_path\n";
591 print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
592
593 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
594 # block hash contains long names, lets make sure that we were passed a long name
595 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
596 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
597 my $lower_filename_full_path = lc($filename_full_path);
598 $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
599
600 }
601 elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
602 # we need to normalize the filenames
603 my $composed_filename_full_path = normalize('C', $filename_full_path);
604 print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
605 $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
606 }
607
608 else {
609 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
610 }
611}
612
613
614# filename is raw filesystem name
615sub raw_file_is_blocked {
616 my $self = shift (@_);
617 my ($block_hash, $filename_full_path) = @_;
618
619 my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
620 return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
621}
622
623# filename must be perl unicode string
624sub file_is_blocked {
625 my $self = shift (@_);
626 my ($block_hash, $filename_full_path) = @_;
627
628 #
629 print STDERR "in file is blocked $filename_full_path\n";
630 print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
631 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
632 # convert to long filenames if needed
633 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
634 # all block paths are lowercased.
635 my $lower_filename = lc ($filename_full_path);
636 if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
637 $self->{'num_blocked'} ++;
638 return 1;
639 }
640 }
641 else {
642 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
643 $self->{'num_blocked'} ++;
644 print STDERR "BLOCKED\n";
645 return 1;
646 }
647 }
648 # check Directory plugin's own block_exp
649 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
650 $self->{'num_blocked'} ++;
651 return 1; # blocked
652 }
653 print STDERR "NOT BLOCKED\n";
654 return 0;
655}
656
657
[31456]6581;
659
Note: See TracBrowser for help on using the repository browser.