source: main/trunk/greenstone2/perllib/plugins/EncodingUtil.pm@ 31478

Last change on this file since 31478 was 31478, checked in by kjdon, 7 years ago

blocking stuff moved to here

File size: 20.5 KB
Line 
1###########################################################################
2#
3# EncodingUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package EncodingUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34
35use PrintInfo;
36use Encode;
37
38BEGIN {
39 @EncodingUtil::ISA = ( 'PrintInfo' );
40}
41
42our $encoding_list =
43 [ { 'name' => "ascii",
44 'desc' => "{BasePlugin.encoding.ascii}" },
45 { 'name' => "utf8",
46 'desc' => "{BasePlugin.encoding.utf8}" },
47 { 'name' => "unicode",
48 'desc' => "{BasePlugin.encoding.unicode}" } ];
49
50
51my $e = $encodings::encodings;
52foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
53{
54 my $hashEncode =
55 {'name' => $enc,
56 'desc' => $e->{$enc}->{'name'}};
57
58 push(@{$encoding_list},$hashEncode);
59}
60
61our $encoding_plus_auto_list =
62 [ { 'name' => "auto",
63 'desc' => "{BasePlugin.filename_encoding.auto}" },
64 { 'name' => "auto-language-analysis",
65 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
66 { 'name' => "auto-filesystem-encoding",
67 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
68 { 'name' => "auto-fl",
69 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
70 { 'name' => "auto-lf",
71 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
72
73push(@{$encoding_plus_auto_list},@{$encoding_list});
74
75my $arguments =
76 [ { 'name' => "block_exp",
77 'desc' => "{BasePlugin.block_exp}",
78 'type' => "regexp",
79 'deft' => "",
80 'reqd' => "no" },
81 { 'name' => "no_blocking",
82 'desc' => "{BasePlugin.no_blocking}",
83 'type' => "flag",
84 'reqd' => "no"},
85 { 'name' => "filename_encoding",
86 'desc' => "{BasePlugin.filename_encoding}",
87 'type' => "enum",
88 'deft' => "auto",
89 'list' => $encoding_plus_auto_list,
90 'reqd' => "no" }
91 ];
92
93my $options = { 'name' => "EncodingUtil",
94 'desc' => "{EncodingUtil.desc}",
95 'abstract' => "yes",
96 'inherits' => "no",
97 'args' => $arguments };
98
99
100sub new {
101
102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
104 push(@$pluginlist, $class);
105
106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
108
109 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
110
111 return bless $self, $class;
112
113}
114
115sub init {
116 my $self = shift (@_);
117 my ($verbosity, $outhandle, $failhandle) = @_;
118
119 print STDERR "guess encoding = ".$self->guess_filesystem_encoding()."\n";
120 print STDERR "get encoding = ".$self->get_filesystem_encoding()."\n";
121
122 # verbosity is passed through from the processor
123 $self->{'verbosity'} = $verbosity;
124
125 # as are the outhandle and failhandle
126 $self->{'outhandle'} = $outhandle if defined $outhandle;
127 $self->{'failhandle'} = $failhandle;
128
129}
130
131# converts raw filesystem filename to perl unicode format
132sub raw_filename_to_unicode {
133 my $self = shift (@_);
134 my ($file) = @_;
135
136 my $unicode_file = "";
137 ### need it in perl unicode, not raw filesystem
138 my $filename_encoding = $self->guess_filesystem_encoding();
139
140 # copied this from set_Source_metadata in BasePlugin
141 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
142 # Use filename_encoding to map raw filename to a Perl unicode-aware string
143 $unicode_file = decode($filename_encoding,$file);
144 }
145 else {
146 # otherwise generate %xx encoded version of filename for char > 127
147 $unicode_file = &unicode::raw_filename_to_url_encoded($file);
148 }
149 return $unicode_file;
150
151}
152# just converts path as is to utf8.
153sub filepath_to_utf8 {
154 my $self = shift (@_);
155 my ($file, $file_encoding) = @_;
156 my $filemeta = $file;
157
158 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
159
160 # Whenever filename-encoding is set to any of the auto settings, we
161 # check if the filename is already in UTF8. If it is, then we're done.
162 if($filename_encoding =~ m/auto/) {
163 if(&unicode::check_is_utf8($filemeta))
164 {
165 $filename_encoding = "utf8";
166 return $filemeta;
167 }
168 }
169
170 # Auto setting, but filename is not utf8
171 if ($filename_encoding eq "auto")
172 {
173 # try textcat
174 $filename_encoding = $self->textcat_encoding($filemeta);
175
176 # check the locale next
177 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
178
179
180 # now try the encoding of the document, if available
181 if ($filename_encoding eq "undefined" && defined $file_encoding) {
182 $filename_encoding = $file_encoding;
183 }
184
185 }
186
187 elsif ($filename_encoding eq "auto-language-analysis")
188 {
189 $filename_encoding = $self->textcat_encoding($filemeta);
190
191 # now try the encoding of the document, if available
192 if ($filename_encoding eq "undefined" && defined $file_encoding) {
193 $filename_encoding = $file_encoding;
194 }
195 }
196
197 elsif ($filename_encoding eq "auto-filesystem-encoding")
198 {
199 # try locale
200 $filename_encoding = $self->locale_encoding();
201 }
202
203 elsif ($filename_encoding eq "auto-fl")
204 {
205 # filesystem-encoding (locale) then language-analysis (textcat)
206 $filename_encoding = $self->locale_encoding();
207
208 # try textcat
209 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
210
211 # else assume filename encoding is encoding of file content, if that's available
212 if ($filename_encoding eq "undefined" && defined $file_encoding) {
213 $filename_encoding = $file_encoding;
214 }
215 }
216
217 elsif ($filename_encoding eq "auto-lf")
218 {
219 # language-analysis (textcat) then filesystem-encoding (locale)
220 $filename_encoding = $self->textcat_encoding($filemeta);
221
222 # guess filename encoding from encoding of file content, if available
223 if ($filename_encoding eq "undefined" && defined $file_encoding) {
224 $filename_encoding = $file_encoding;
225 }
226
227 # try locale
228 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
229 }
230
231 # if still undefined, use utf8 as fallback
232 if ($filename_encoding eq "undefined") {
233 $filename_encoding = "utf8";
234 }
235
236 #print STDERR "**** UTF8 encoding the filename $filemeta ";
237
238 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
239 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
240 # be always utf8 (in which case the filename's encoding is also set as utf8 even
241 # though the filename need not be if it originates from another system)--in such
242 # cases attempt to make the filename utf8 to match.
243 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
244 &unicode::ensure_utf8(\$filemeta);
245 }
246
247 # convert non-unicode encodings to utf8
248 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
249 $filemeta = &unicode::unicode2utf8(
250 &unicode::convert2unicode($filename_encoding, \$filemeta)
251 );
252 }
253
254 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
255 return $filemeta;
256}
257
258# gets the filename with no path, converts to utf8, and then dm safes it.
259# filename_encoding set by user
260sub filename_to_utf8_metadata
261{
262 my $self = shift (@_);
263 my ($file, $file_encoding) = @_;
264
265 my $outhandle = $self->{'outhandle'};
266
267 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
268 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
269 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
270
271 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
272 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
273
274 return $filemeta;
275}
276
277sub locale_encoding {
278 my $self = shift(@_);
279
280 if (!defined $self->{'filesystem_encoding'}) {
281 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
282 }
283
284 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
285 return $self->{'filesystem_encoding'}; # can be the string "undefined"
286}
287
288
289sub textcat_encoding {
290 my $self = shift(@_);
291 my ($filemeta) = @_;
292
293 # analyse filenames without extensions and digits (and trimmed of
294 # surrounding whitespace), so that irrelevant chars don't confuse
295 # textcat
296 my $strictfilemeta = $filemeta;
297 $strictfilemeta =~ s/\.[^\.]+$//g;
298 $strictfilemeta =~ s/\d//g;
299 $strictfilemeta =~ s/^\s*//g;
300 $strictfilemeta =~ s/\s*$//g;
301
302 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
303 if(!defined $filename_encoding) {
304 $filename_encoding = "undefined";
305 }
306
307 return $filename_encoding; # can be the string "undefined"
308}
309
310# performs textcat
311sub encoding_from_language_analysis {
312 my $self = shift(@_);
313 my ($text) = @_;
314
315 my $outhandle = $self->{'outhandle'};
316 my $best_encoding = undef;
317
318 # get the language/encoding of the textstring using textcat
319 require textcat; # Only load the textcat module if it is required
320 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
321 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
322
323
324 if (scalar @$results < 0) {
325 return undef;
326 }
327
328 # We have some results, we choose the first
329 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
330
331 $best_encoding = $encoding;
332 if (!defined $best_encoding) {
333 return undef;
334 }
335
336 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
337 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
338 $best_encoding = 'utf8';
339 }
340
341
342 # check for equivalents where textcat doesn't have some encodings...
343 # eg MS versions of standard encodings
344 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
345 my $iso = $1; # which variant of the iso standard?
346 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
347 if ($text =~ /[\x80-\x9f]/) {
348 # Western Europe
349 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
350 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
351 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
352 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
353 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
354 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
355 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
356 }
357 }
358
359 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
360 !defined $encodings::encodings->{$best_encoding})
361 {
362 if ($self->{'verbosity'}) {
363 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
364 }
365 $best_encoding = undef;
366 }
367
368 return $best_encoding;
369}
370
371
372
373sub deduce_filename_encoding
374{
375 my $self = shift (@_);
376 my ($file,$metadata,$plugin_filename_encoding) = @_;
377
378 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
379 my $deduced_filename_encoding = undef;
380
381 # Start by looking for manually assigned metadata
382 if (defined $gs_filename_encoding) {
383 if (ref ($gs_filename_encoding) eq "ARRAY") {
384 my $outhandle = $self->{'outhandle'};
385
386 $deduced_filename_encoding = $gs_filename_encoding->[0];
387
388 my $num_vals = scalar(@$gs_filename_encoding);
389 if ($num_vals>1) {
390 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
391 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
392 }
393 }
394 else {
395 $deduced_filename_encoding = $gs_filename_encoding;
396 }
397 }
398
399 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
400 # Look to see if plugin specifies this value
401
402 if (defined $plugin_filename_encoding) {
403 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
404 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
405 my $outhandle = $self->{'outhandle'};
406 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
407 print $outhandle " default to 'auto'\n";
408 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
409 }
410
411 if ($plugin_filename_encoding ne "auto") {
412 # We've been given a specific filenamne encoding
413 # => so use it!
414 $deduced_filename_encoding = $plugin_filename_encoding;
415 }
416 }
417 }
418
419 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
420
421 # Look to file system to provide a character encoding
422
423 # If Windows NTFS, then -- assuming we work with long file names got through
424 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
425
426 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
427 # Can do better than working with the DOS character encoding returned by locale
428 $deduced_filename_encoding = "unicode";
429 }
430 else {
431 # Unix of some form or other
432
433 # See if we can determine the file system encoding through locale
434 $deduced_filename_encoding = $self->locale_encoding();
435
436 # if locale shows us filesystem is utf8, check to see filename is consistent
437 # => if not, then we have an "alien" filename on our hands
438
439 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
440 if (!&unicode::check_is_utf8($file)) {
441 # "alien" filename, so revert
442 $deduced_filename_encoding = undef;
443 }
444 }
445 }
446 }
447
448# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
449# # Last chance, apply textcat to deduce filename encoding
450# $deduced_filename_encoding = $self->textcat_encoding($file);
451# }
452
453 if ($self->{'verbosity'}>3) {
454 my $outhandle = $self->{'outhandle'};
455
456 if (defined $deduced_filename_encoding) {
457 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
458 }
459 else {
460 print $outhandle " No filename encoding deduced\n";
461 }
462 }
463
464 return $deduced_filename_encoding;
465}
466
467
468sub guess_filesystem_encoding
469{
470 my $self = shift (@_);
471 # Look to file system to provide a character encoding
472 my $deduced_filename_encoding = "";
473 # If Windows NTFS, then -- assuming we work with long file names got through
474 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
475
476 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
477 # Can do better than working with the DOS character encoding returned by locale
478 $deduced_filename_encoding = "unicode";
479 }
480 else {
481 # Unix of some form or other
482
483 # See if we can determine the file system encoding through locale
484 $deduced_filename_encoding = $self->locale_encoding(); #utf8??
485
486 }
487 print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
488 return $deduced_filename_encoding;
489}
490
491
492# uses locale
493sub get_filesystem_encoding
494{
495
496 my $self = shift(@_);
497
498 my $outhandle = $self->{'outhandle'};
499 my $filesystem_encoding = undef;
500
501 eval {
502 # Works for Windows as well, returning the DOS code page in use
503 use POSIX qw(locale_h);
504
505 # With only one parameter, setlocale retrieves the
506 # current value
507 my $current_locale = setlocale(LC_CTYPE);
508
509 my $char_encoding = undef;
510 if ($current_locale =~ m/\./) {
511 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
512 $char_encoding = lc($char_encoding);
513 }
514 else {
515 if ($current_locale =~ m/^(posix|c)$/i) {
516 $char_encoding = "ascii";
517 }
518 }
519
520 if (defined $char_encoding) {
521 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
522 $char_encoding = "$1\_$2\_$3";
523 }
524
525 $char_encoding =~ s/-/_/g;
526 $char_encoding =~ s/^utf_8$/utf8/;
527
528 if ($char_encoding =~ m/^\d+$/) {
529 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
530 $char_encoding = "windows_$char_encoding";
531 }
532 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
533 $char_encoding = "dos_$char_encoding";
534 }
535 }
536
537 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
538 || (defined $encodings::encodings->{$char_encoding})) {
539 $filesystem_encoding = $char_encoding;
540 }
541 else {
542 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
543 }
544 }
545
546
547 };
548 if ($@) {
549 print $outhandle "$@\n";
550 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
551
552 }
553
554 return $filesystem_encoding;
555}
556
557
558
559# write_file -- used by ConvertToPlug, for example in post processing
560#
561# where should this go, is here the best place??
562sub utf8_write_file {
563 my $self = shift (@_);
564 my ($textref, $filename) = @_;
565
566 if (!open (FILE, ">:utf8", $filename)) {
567 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
568 die "\n";
569 }
570 print FILE $$textref;
571
572 close FILE;
573}
574
575sub block_raw_filename {
576
577 my $self = shift (@_);
578 my ($block_hash,$filename_full_path) = @_;
579
580 my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
581 return $self->block_filename($block_hash, $unicode_filename);
582}
583
584# block unicode string filename
585sub block_filename
586{
587 my $self = shift (@_);
588 my ($block_hash,$filename_full_path) = @_;
589 print STDERR "in block filename $filename_full_path\n";
590 print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
591
592 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
593 # block hash contains long names, lets make sure that we were passed a long name
594 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
595 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
596 my $lower_filename_full_path = lc($filename_full_path);
597 $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
598
599 }
600 elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
601 # we need to normalize the filenames
602 my $composed_filename_full_path = normalize('C', $filename_full_path);
603 print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
604 $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
605 }
606
607 else {
608 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
609 }
610}
611
612
613# filename is raw filesystem name
614sub raw_file_is_blocked {
615 my $self = shift (@_);
616 my ($block_hash, $filename_full_path) = @_;
617
618 my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
619 return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
620}
621
622# filename must be perl unicode string
623sub file_is_blocked {
624 my $self = shift (@_);
625 my ($block_hash, $filename_full_path) = @_;
626
627 #
628 print STDERR "in file is blocked $filename_full_path\n";
629 print STDERR &unicode::debug_unicode_string($filename_full_path)."\n";
630 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
631 # convert to long filenames if needed
632 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
633 # all block paths are lowercased.
634 my $lower_filename = lc ($filename_full_path);
635 if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
636 $self->{'num_blocked'} ++;
637 return 1;
638 }
639 }
640 else {
641 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
642 $self->{'num_blocked'} ++;
643 print STDERR "BLOCKED\n";
644 return 1;
645 }
646 }
647 # check Directory plugin's own block_exp
648 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
649 $self->{'num_blocked'} ++;
650 return 1; # blocked
651 }
652 print STDERR "NOT BLOCKED\n";
653 return 0;
654}
655
656
6571;
658
Note: See TracBrowser for help on using the repository browser.