source: main/trunk/greenstone2/perllib/plugins/EncodingUtil.pm@ 31456

Last change on this file since 31456 was 31456, checked in by kjdon, 7 years ago

new base plugin for directories and files. DirectoryPLugin needs filename_encoding option, and encoding help so it can get directories from the filesystem into perl unicode. So moved all the encoding code out from BasePlugin into this EncodingUtil plugin, which sits after PrintInfo, but before BasePlugin in the plugin inheritance hierarchy

File size: 16.8 KB
Line 
1###########################################################################
2#
3# EncodingUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package EncodingUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34
35use PrintInfo;
36
37
38BEGIN {
39 @EncodingUtil::ISA = ( 'PrintInfo' );
40}
41
42our $encoding_list =
43 [ { 'name' => "ascii",
44 'desc' => "{BasePlugin.encoding.ascii}" },
45 { 'name' => "utf8",
46 'desc' => "{BasePlugin.encoding.utf8}" },
47 { 'name' => "unicode",
48 'desc' => "{BasePlugin.encoding.unicode}" } ];
49
50
51my $e = $encodings::encodings;
52foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
53{
54 my $hashEncode =
55 {'name' => $enc,
56 'desc' => $e->{$enc}->{'name'}};
57
58 push(@{$encoding_list},$hashEncode);
59}
60
61our $encoding_plus_auto_list =
62 [ { 'name' => "auto",
63 'desc' => "{BasePlugin.filename_encoding.auto}" },
64 { 'name' => "auto-language-analysis",
65 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat
66 { 'name' => "auto-filesystem-encoding",
67 'desc' => "{BasePlugin.filename_encoding.auto_filesystem_encoding}" }, # locale
68 { 'name' => "auto-fl",
69 'desc' => "{BasePlugin.filename_encoding.auto_fl}" }, # locale followed by textcat
70 { 'name' => "auto-lf",
71 'desc' => "{BasePlugin.filename_encoding.auto_lf}" } ]; # texcat followed by locale
72
73push(@{$encoding_plus_auto_list},@{$encoding_list});
74
75my $arguments =
76 [ { 'name' => "filename_encoding",
77 'desc' => "{BasePlugin.filename_encoding}",
78 'type' => "enum",
79 'deft' => "auto",
80 'list' => $encoding_plus_auto_list,
81 'reqd' => "no" }
82 ];
83
84my $options = { 'name' => "EncodingUtil",
85 'desc' => "{EncodingUtil.desc}",
86 'abstract' => "yes",
87 'inherits' => "no",
88 'args' => $arguments };
89
90
91sub new {
92
93 my ($class) = shift (@_);
94 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
95 push(@$pluginlist, $class);
96
97 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
98 push(@{$hashArgOptLists->{"OptList"}},$options);
99
100 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
101
102 return bless $self, $class;
103
104}
105
106sub init {
107 my $self = shift (@_);
108 my ($verbosity, $outhandle, $failhandle) = @_;
109
110 print STDERR "guess encoding = ".$self->guess_filesystem_encoding()."\n";
111 print STDERR "get encoding = ".$self->get_filesystem_encoding()."\n";
112
113 # verbosity is passed through from the processor
114 $self->{'verbosity'} = $verbosity;
115
116 # as are the outhandle and failhandle
117 $self->{'outhandle'} = $outhandle if defined $outhandle;
118 $self->{'failhandle'} = $failhandle;
119
120}
121
122
123# just converts path as is to utf8.
124sub filepath_to_utf8 {
125 my $self = shift (@_);
126 my ($file, $file_encoding) = @_;
127 my $filemeta = $file;
128
129 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
130
131 # Whenever filename-encoding is set to any of the auto settings, we
132 # check if the filename is already in UTF8. If it is, then we're done.
133 if($filename_encoding =~ m/auto/) {
134 if(&unicode::check_is_utf8($filemeta))
135 {
136 $filename_encoding = "utf8";
137 return $filemeta;
138 }
139 }
140
141 # Auto setting, but filename is not utf8
142 if ($filename_encoding eq "auto")
143 {
144 # try textcat
145 $filename_encoding = $self->textcat_encoding($filemeta);
146
147 # check the locale next
148 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
149
150
151 # now try the encoding of the document, if available
152 if ($filename_encoding eq "undefined" && defined $file_encoding) {
153 $filename_encoding = $file_encoding;
154 }
155
156 }
157
158 elsif ($filename_encoding eq "auto-language-analysis")
159 {
160 $filename_encoding = $self->textcat_encoding($filemeta);
161
162 # now try the encoding of the document, if available
163 if ($filename_encoding eq "undefined" && defined $file_encoding) {
164 $filename_encoding = $file_encoding;
165 }
166 }
167
168 elsif ($filename_encoding eq "auto-filesystem-encoding")
169 {
170 # try locale
171 $filename_encoding = $self->locale_encoding();
172 }
173
174 elsif ($filename_encoding eq "auto-fl")
175 {
176 # filesystem-encoding (locale) then language-analysis (textcat)
177 $filename_encoding = $self->locale_encoding();
178
179 # try textcat
180 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
181
182 # else assume filename encoding is encoding of file content, if that's available
183 if ($filename_encoding eq "undefined" && defined $file_encoding) {
184 $filename_encoding = $file_encoding;
185 }
186 }
187
188 elsif ($filename_encoding eq "auto-lf")
189 {
190 # language-analysis (textcat) then filesystem-encoding (locale)
191 $filename_encoding = $self->textcat_encoding($filemeta);
192
193 # guess filename encoding from encoding of file content, if available
194 if ($filename_encoding eq "undefined" && defined $file_encoding) {
195 $filename_encoding = $file_encoding;
196 }
197
198 # try locale
199 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
200 }
201
202 # if still undefined, use utf8 as fallback
203 if ($filename_encoding eq "undefined") {
204 $filename_encoding = "utf8";
205 }
206
207 #print STDERR "**** UTF8 encoding the filename $filemeta ";
208
209 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
210 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
211 # be always utf8 (in which case the filename's encoding is also set as utf8 even
212 # though the filename need not be if it originates from another system)--in such
213 # cases attempt to make the filename utf8 to match.
214 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
215 &unicode::ensure_utf8(\$filemeta);
216 }
217
218 # convert non-unicode encodings to utf8
219 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
220 $filemeta = &unicode::unicode2utf8(
221 &unicode::convert2unicode($filename_encoding, \$filemeta)
222 );
223 }
224
225 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
226 return $filemeta;
227}
228
229# gets the filename with no path, converts to utf8, and then dm safes it.
230# filename_encoding set by user
231sub filename_to_utf8_metadata
232{
233 my $self = shift (@_);
234 my ($file, $file_encoding) = @_;
235
236 my $outhandle = $self->{'outhandle'};
237
238 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n";
239 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
240 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
241
242 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
243 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
244
245 return $filemeta;
246}
247
248sub locale_encoding {
249 my $self = shift(@_);
250
251 if (!defined $self->{'filesystem_encoding'}) {
252 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
253 }
254
255 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
256 return $self->{'filesystem_encoding'}; # can be the string "undefined"
257}
258
259
260sub textcat_encoding {
261 my $self = shift(@_);
262 my ($filemeta) = @_;
263
264 # analyse filenames without extensions and digits (and trimmed of
265 # surrounding whitespace), so that irrelevant chars don't confuse
266 # textcat
267 my $strictfilemeta = $filemeta;
268 $strictfilemeta =~ s/\.[^\.]+$//g;
269 $strictfilemeta =~ s/\d//g;
270 $strictfilemeta =~ s/^\s*//g;
271 $strictfilemeta =~ s/\s*$//g;
272
273 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
274 if(!defined $filename_encoding) {
275 $filename_encoding = "undefined";
276 }
277
278 return $filename_encoding; # can be the string "undefined"
279}
280
281# performs textcat
282sub encoding_from_language_analysis {
283 my $self = shift(@_);
284 my ($text) = @_;
285
286 my $outhandle = $self->{'outhandle'};
287 my $best_encoding = undef;
288
289 # get the language/encoding of the textstring using textcat
290 require textcat; # Only load the textcat module if it is required
291 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
292 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
293
294
295 if (scalar @$results < 0) {
296 return undef;
297 }
298
299 # We have some results, we choose the first
300 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
301
302 $best_encoding = $encoding;
303 if (!defined $best_encoding) {
304 return undef;
305 }
306
307 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
308 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
309 $best_encoding = 'utf8';
310 }
311
312
313 # check for equivalents where textcat doesn't have some encodings...
314 # eg MS versions of standard encodings
315 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
316 my $iso = $1; # which variant of the iso standard?
317 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
318 if ($text =~ /[\x80-\x9f]/) {
319 # Western Europe
320 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
321 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
322 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
323 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
324 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
325 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
326 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
327 }
328 }
329
330 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
331 !defined $encodings::encodings->{$best_encoding})
332 {
333 if ($self->{'verbosity'}) {
334 gsprintf($outhandle, "BasePlugin: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
335 }
336 $best_encoding = undef;
337 }
338
339 return $best_encoding;
340}
341
342
343
344sub deduce_filename_encoding
345{
346 my $self = shift (@_);
347 my ($file,$metadata,$plugin_filename_encoding) = @_;
348
349 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
350 my $deduced_filename_encoding = undef;
351
352 # Start by looking for manually assigned metadata
353 if (defined $gs_filename_encoding) {
354 if (ref ($gs_filename_encoding) eq "ARRAY") {
355 my $outhandle = $self->{'outhandle'};
356
357 $deduced_filename_encoding = $gs_filename_encoding->[0];
358
359 my $num_vals = scalar(@$gs_filename_encoding);
360 if ($num_vals>1) {
361 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
362 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
363 }
364 }
365 else {
366 $deduced_filename_encoding = $gs_filename_encoding;
367 }
368 }
369
370 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
371 # Look to see if plugin specifies this value
372
373 if (defined $plugin_filename_encoding) {
374 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
375 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
376 my $outhandle = $self->{'outhandle'};
377 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
378 print $outhandle " default to 'auto'\n";
379 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
380 }
381
382 if ($plugin_filename_encoding ne "auto") {
383 # We've been given a specific filenamne encoding
384 # => so use it!
385 $deduced_filename_encoding = $plugin_filename_encoding;
386 }
387 }
388 }
389
390 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
391
392 # Look to file system to provide a character encoding
393
394 # If Windows NTFS, then -- assuming we work with long file names got through
395 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
396
397 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
398 # Can do better than working with the DOS character encoding returned by locale
399 $deduced_filename_encoding = "unicode";
400 }
401 else {
402 # Unix of some form or other
403
404 # See if we can determine the file system encoding through locale
405 $deduced_filename_encoding = $self->locale_encoding();
406
407 # if locale shows us filesystem is utf8, check to see filename is consistent
408 # => if not, then we have an "alien" filename on our hands
409
410 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
411 if (!&unicode::check_is_utf8($file)) {
412 # "alien" filename, so revert
413 $deduced_filename_encoding = undef;
414 }
415 }
416 }
417 }
418
419# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
420# # Last chance, apply textcat to deduce filename encoding
421# $deduced_filename_encoding = $self->textcat_encoding($file);
422# }
423
424 if ($self->{'verbosity'}>3) {
425 my $outhandle = $self->{'outhandle'};
426
427 if (defined $deduced_filename_encoding) {
428 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
429 }
430 else {
431 print $outhandle " No filename encoding deduced\n";
432 }
433 }
434
435 return $deduced_filename_encoding;
436}
437
438
439sub guess_filesystem_encoding
440{
441 my $self = shift (@_);
442 # Look to file system to provide a character encoding
443 my $deduced_filename_encoding = "";
444 # If Windows NTFS, then -- assuming we work with long file names got through
445 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
446
447 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
448 # Can do better than working with the DOS character encoding returned by locale
449 $deduced_filename_encoding = "unicode";
450 }
451 else {
452 # Unix of some form or other
453
454 # See if we can determine the file system encoding through locale
455 $deduced_filename_encoding = $self->locale_encoding(); #utf8??
456
457 }
458 print STDERR "guessing filesystem encoding is $deduced_filename_encoding\n";
459 return $deduced_filename_encoding;
460}
461
462
463# uses locale
464sub get_filesystem_encoding
465{
466
467 my $self = shift(@_);
468
469 my $outhandle = $self->{'outhandle'};
470 my $filesystem_encoding = undef;
471
472 eval {
473 # Works for Windows as well, returning the DOS code page in use
474 use POSIX qw(locale_h);
475
476 # With only one parameter, setlocale retrieves the
477 # current value
478 my $current_locale = setlocale(LC_CTYPE);
479
480 my $char_encoding = undef;
481 if ($current_locale =~ m/\./) {
482 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
483 $char_encoding = lc($char_encoding);
484 }
485 else {
486 if ($current_locale =~ m/^(posix|c)$/i) {
487 $char_encoding = "ascii";
488 }
489 }
490
491 if (defined $char_encoding) {
492 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
493 $char_encoding = "$1\_$2\_$3";
494 }
495
496 $char_encoding =~ s/-/_/g;
497 $char_encoding =~ s/^utf_8$/utf8/;
498
499 if ($char_encoding =~ m/^\d+$/) {
500 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
501 $char_encoding = "windows_$char_encoding";
502 }
503 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
504 $char_encoding = "dos_$char_encoding";
505 }
506 }
507
508 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
509 || (defined $encodings::encodings->{$char_encoding})) {
510 $filesystem_encoding = $char_encoding;
511 }
512 else {
513 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
514 }
515 }
516
517
518 };
519 if ($@) {
520 print $outhandle "$@\n";
521 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
522
523 }
524
525 return $filesystem_encoding;
526}
527
528
529
530# write_file -- used by ConvertToPlug, for example in post processing
531#
532# where should this go, is here the best place??
533sub utf8_write_file {
534 my $self = shift (@_);
535 my ($textref, $filename) = @_;
536
537 if (!open (FILE, ">:utf8", $filename)) {
538 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
539 die "\n";
540 }
541 print FILE $$textref;
542
543 close FILE;
544}
545
5461;
547
Note: See TracBrowser for help on using the repository browser.