source: main/trunk/greenstone2/perllib/plugins/CommonUtil.pm@ 31688

Last change on this file since 31688 was 31688, checked in by kjdon, 7 years ago

removing debug statements

File size: 20.3 KB
Line 
1###########################################################################
2#
3# CommonUtil.pm -- base class for file and directory plugins - aims to
4# handle all encoding stuff, blocking stuff, to keep it in one place
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2017 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package CommonUtil;
28
29use strict;
30no strict 'subs';
31no strict 'refs'; # allow filehandles to be variables and viceversa
32
33use encodings;
34use Unicode::Normalize 'normalize';
35
36use PrintInfo;
37use Encode;
38use Unicode::Normalize 'normalize';
39
40BEGIN {
41 @CommonUtil::ISA = ( 'PrintInfo' );
42}
43
44our $encoding_list =
45 [ { 'name' => "ascii",
46 'desc' => "{CommonUtil.encoding.ascii}" },
47 { 'name' => "utf8",
48 'desc' => "{CommonUtil.encoding.utf8}" },
49 { 'name' => "unicode",
50 'desc' => "{CommonUtil.encoding.unicode}" } ];
51
52
53my $e = $encodings::encodings;
54foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
55{
56 my $hashEncode =
57 {'name' => $enc,
58 'desc' => $e->{$enc}->{'name'}};
59
60 push(@{$encoding_list},$hashEncode);
61}
62
63our $encoding_plus_auto_list =
64 [ { 'name' => "auto",
65 'desc' => "{CommonUtil.filename_encoding.auto}" },
66 { 'name' => "auto-language-analysis",
67 'desc' => "{CommonUtil.filename_encoding.auto_language_analysis}" }, # textcat
68 { 'name' => "auto-filesystem-encoding",
69 'desc' => "{CommonUtil.filename_encoding.auto_filesystem_encoding}" }, # locale
70 { 'name' => "auto-fl",
71 'desc' => "{CommonUtil.filename_encoding.auto_fl}" }, # locale followed by textcat
72 { 'name' => "auto-lf",
73 'desc' => "{CommonUtil.filename_encoding.auto_lf}" } ]; # texcat followed by locale
74
75push(@{$encoding_plus_auto_list},@{$encoding_list});
76
77my $arguments =
78 [ { 'name' => "block_exp",
79 'desc' => "{CommonUtil.block_exp}",
80 'type' => "regexp",
81 'deft' => "",
82 'reqd' => "no" },
83 { 'name' => "no_blocking",
84 'desc' => "{CommonUtil.no_blocking}",
85 'type' => "flag",
86 'reqd' => "no"},
87 { 'name' => "filename_encoding",
88 'desc' => "{CommonUtil.filename_encoding}",
89 'type' => "enum",
90 'deft' => "auto",
91 'list' => $encoding_plus_auto_list,
92 'reqd' => "no" }
93 ];
94
95my $options = { 'name' => "CommonUtil",
96 'desc' => "{CommonUtil.desc}",
97 'abstract' => "yes",
98 'inherits' => "no",
99 'args' => $arguments };
100
101
102sub new {
103
104 my ($class) = shift (@_);
105 my ($pluginlist,$inputargs,$hashArgOptLists,$auxiliary) = @_;
106 push(@$pluginlist, $class);
107
108 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
109 push(@{$hashArgOptLists->{"OptList"}},$options);
110
111 my $self = new PrintInfo($pluginlist, $inputargs, $hashArgOptLists,$auxiliary);
112
113 return bless $self, $class;
114
115}
116
117sub init {
118 my $self = shift (@_);
119 my ($verbosity, $outhandle, $failhandle) = @_;
120
121 # verbosity is passed through from the processor
122 $self->{'verbosity'} = $verbosity;
123
124 # as are the outhandle and failhandle
125 $self->{'outhandle'} = $outhandle if defined $outhandle;
126 $self->{'failhandle'} = $failhandle;
127
128}
129
130# converts raw filesystem filename to perl unicode format
131sub raw_filename_to_unicode {
132 my $self = shift (@_);
133 my ($file) = @_;
134
135 my $unicode_file = "";
136 ### need it in perl unicode, not raw filesystem
137 my $filename_encoding = $self->guess_filesystem_encoding();
138
139 # copied this from set_Source_metadata in BaseImporter
140 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
141 # Use filename_encoding to map raw filename to a Perl unicode-aware string
142 $unicode_file = decode($filename_encoding,$file);
143 }
144 else {
145 # otherwise generate %xx encoded version of filename for char > 127
146 $unicode_file = &unicode::raw_filename_to_url_encoded($file);
147 }
148 return $unicode_file;
149
150}
151# just converts path as is to utf8.
152sub filepath_to_utf8 {
153 my $self = shift (@_);
154 my ($file, $file_encoding) = @_;
155 my $filemeta = $file;
156
157 my $filename_encoding = $self->{'filename_encoding'}; # filename encoding setting
158
159 # Whenever filename-encoding is set to any of the auto settings, we
160 # check if the filename is already in UTF8. If it is, then we're done.
161 if($filename_encoding =~ m/auto/) {
162 if(&unicode::check_is_utf8($filemeta))
163 {
164 $filename_encoding = "utf8";
165 return $filemeta;
166 }
167 }
168
169 # Auto setting, but filename is not utf8
170 if ($filename_encoding eq "auto")
171 {
172 # try textcat
173 $filename_encoding = $self->textcat_encoding($filemeta);
174
175 # check the locale next
176 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
177
178
179 # now try the encoding of the document, if available
180 if ($filename_encoding eq "undefined" && defined $file_encoding) {
181 $filename_encoding = $file_encoding;
182 }
183
184 }
185
186 elsif ($filename_encoding eq "auto-language-analysis")
187 {
188 $filename_encoding = $self->textcat_encoding($filemeta);
189
190 # now try the encoding of the document, if available
191 if ($filename_encoding eq "undefined" && defined $file_encoding) {
192 $filename_encoding = $file_encoding;
193 }
194 }
195
196 elsif ($filename_encoding eq "auto-filesystem-encoding")
197 {
198 # try locale
199 $filename_encoding = $self->locale_encoding();
200 }
201
202 elsif ($filename_encoding eq "auto-fl")
203 {
204 # filesystem-encoding (locale) then language-analysis (textcat)
205 $filename_encoding = $self->locale_encoding();
206
207 # try textcat
208 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined";
209
210 # else assume filename encoding is encoding of file content, if that's available
211 if ($filename_encoding eq "undefined" && defined $file_encoding) {
212 $filename_encoding = $file_encoding;
213 }
214 }
215
216 elsif ($filename_encoding eq "auto-lf")
217 {
218 # language-analysis (textcat) then filesystem-encoding (locale)
219 $filename_encoding = $self->textcat_encoding($filemeta);
220
221 # guess filename encoding from encoding of file content, if available
222 if ($filename_encoding eq "undefined" && defined $file_encoding) {
223 $filename_encoding = $file_encoding;
224 }
225
226 # try locale
227 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined";
228 }
229
230 # if still undefined, use utf8 as fallback
231 if ($filename_encoding eq "undefined") {
232 $filename_encoding = "utf8";
233 }
234
235 #print STDERR "**** UTF8 encoding the filename $filemeta ";
236
237 # if the filename encoding is set to utf8 but it isn't utf8 already--such as when
238 # 1. the utf8 fallback is used, or 2. if the system locale is used and happens to
239 # be always utf8 (in which case the filename's encoding is also set as utf8 even
240 # though the filename need not be if it originates from another system)--in such
241 # cases attempt to make the filename utf8 to match.
242 if($filename_encoding eq "utf8" && !&unicode::check_is_utf8($filemeta)) {
243 &unicode::ensure_utf8(\$filemeta);
244 }
245
246 # convert non-unicode encodings to utf8
247 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) {
248 $filemeta = &unicode::unicode2utf8(
249 &unicode::convert2unicode($filename_encoding, \$filemeta)
250 );
251 }
252
253 #print STDERR " from encoding $filename_encoding -> $filemeta\n";
254 return $filemeta;
255}
256
257# gets the filename with no path, converts to utf8, and then dm safes it.
258# filename_encoding set by user
259sub filename_to_utf8_metadata
260{
261 my $self = shift (@_);
262 my ($file, $file_encoding) = @_;
263
264 my $outhandle = $self->{'outhandle'};
265
266 print $outhandle "****!!!!**** CommonUtil::filename_to_utf8_metadata now deprecated\n";
267 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
268 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n";
269
270 my ($filemeta) = $file =~ /([^\\\/]+)$/; # getting the tail of the filepath (skips all string parts containing slashes upto the end)
271 $filemeta = $self->filepath_to_utf8($filemeta, $file_encoding);
272
273 return $filemeta;
274}
275
276sub locale_encoding {
277 my $self = shift(@_);
278
279 if (!defined $self->{'filesystem_encoding'}) {
280 $self->{'filesystem_encoding'} = $self->get_filesystem_encoding();
281 }
282
283 #print STDERR "*** filename encoding determined based on locale: " . $self->{'filesystem_encoding'} . "\n";
284 return $self->{'filesystem_encoding'}; # can be the string "undefined"
285}
286
287
288sub textcat_encoding {
289 my $self = shift(@_);
290 my ($filemeta) = @_;
291
292 # analyse filenames without extensions and digits (and trimmed of
293 # surrounding whitespace), so that irrelevant chars don't confuse
294 # textcat
295 my $strictfilemeta = $filemeta;
296 $strictfilemeta =~ s/\.[^\.]+$//g;
297 $strictfilemeta =~ s/\d//g;
298 $strictfilemeta =~ s/^\s*//g;
299 $strictfilemeta =~ s/\s*$//g;
300
301 my $filename_encoding = $self->encoding_from_language_analysis($strictfilemeta);
302 if(!defined $filename_encoding) {
303 $filename_encoding = "undefined";
304 }
305
306 return $filename_encoding; # can be the string "undefined"
307}
308
309# performs textcat
310sub encoding_from_language_analysis {
311 my $self = shift(@_);
312 my ($text) = @_;
313
314 my $outhandle = $self->{'outhandle'};
315 my $best_encoding = undef;
316
317 # get the language/encoding of the textstring using textcat
318 require textcat; # Only load the textcat module if it is required
319 $self->{'textcat'} = new textcat() unless defined($self->{'textcat'});
320 my $results = $self->{'textcat'}->classify_cached_filename(\$text);
321
322
323 if (scalar @$results < 0) {
324 return undef;
325 }
326
327 # We have some results, we choose the first
328 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
329
330 $best_encoding = $encoding;
331 if (!defined $best_encoding) {
332 return undef;
333 }
334
335 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) {
336 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities)
337 $best_encoding = 'utf8';
338 }
339
340
341 # check for equivalents where textcat doesn't have some encodings...
342 # eg MS versions of standard encodings
343 if (defined $best_encoding && $best_encoding =~ /^iso_8859_(\d+)/) {
344 my $iso = $1; # which variant of the iso standard?
345 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
346 if ($text =~ /[\x80-\x9f]/) {
347 # Western Europe
348 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
349 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
350 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
351 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
352 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
353 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
354 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
355 }
356 }
357
358 if (defined $best_encoding && $best_encoding !~ /^(ascii|utf8|unicode)$/ &&
359 !defined $encodings::encodings->{$best_encoding})
360 {
361 if ($self->{'verbosity'}) {
362 gsprintf($outhandle, "CommonUtil: {ReadTextFile.unsupported_encoding}\n", $text, $best_encoding, "undef");
363 }
364 $best_encoding = undef;
365 }
366
367 return $best_encoding;
368}
369
370
371
372sub deduce_filename_encoding
373{
374 my $self = shift (@_);
375 my ($file,$metadata,$plugin_filename_encoding) = @_;
376
377 my $gs_filename_encoding = $metadata->{"gs.filenameEncoding"};
378 my $deduced_filename_encoding = undef;
379
380 # Start by looking for manually assigned metadata
381 if (defined $gs_filename_encoding) {
382 if (ref ($gs_filename_encoding) eq "ARRAY") {
383 my $outhandle = $self->{'outhandle'};
384
385 $deduced_filename_encoding = $gs_filename_encoding->[0];
386
387 my $num_vals = scalar(@$gs_filename_encoding);
388 if ($num_vals>1) {
389 print $outhandle "Warning: gs.filenameEncoding multiply defined for $file\n";
390 print $outhandle " Selecting first value: $deduced_filename_encoding\n";
391 }
392 }
393 else {
394 $deduced_filename_encoding = $gs_filename_encoding;
395 }
396 }
397
398 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
399 # Look to see if plugin specifies this value
400
401 if (defined $plugin_filename_encoding) {
402 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options)
403 if ($plugin_filename_encoding =~ m/^auto-.*$/) {
404 my $outhandle = $self->{'outhandle'};
405 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n";
406 print $outhandle " default to 'auto'\n";
407 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto";
408 }
409
410 if ($plugin_filename_encoding ne "auto") {
411 # We've been given a specific filenamne encoding
412 # => so use it!
413 $deduced_filename_encoding = $plugin_filename_encoding;
414 }
415 }
416 }
417
418 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
419
420 # Look to file system to provide a character encoding
421
422 # If Windows NTFS, then -- assuming we work with long file names got through
423 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
424
425 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
426 # Can do better than working with the DOS character encoding returned by locale
427 $deduced_filename_encoding = "unicode";
428 }
429 else {
430 # Unix of some form or other
431
432 # See if we can determine the file system encoding through locale
433 $deduced_filename_encoding = $self->locale_encoding();
434
435 # if locale shows us filesystem is utf8, check to see filename is consistent
436 # => if not, then we have an "alien" filename on our hands
437
438 if (defined $deduced_filename_encoding && $deduced_filename_encoding =~ m/^utf-?8$/i) {
439 if (!&unicode::check_is_utf8($file)) {
440 # "alien" filename, so revert
441 $deduced_filename_encoding = undef;
442 }
443 }
444 }
445 }
446
447# if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
448# # Last chance, apply textcat to deduce filename encoding
449# $deduced_filename_encoding = $self->textcat_encoding($file);
450# }
451
452 if ($self->{'verbosity'}>3) {
453 my $outhandle = $self->{'outhandle'};
454
455 if (defined $deduced_filename_encoding) {
456 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n";
457 }
458 else {
459 print $outhandle " No filename encoding deduced\n";
460 }
461 }
462
463 return $deduced_filename_encoding;
464}
465
466
467sub guess_filesystem_encoding
468{
469 my $self = shift (@_);
470 # Look to file system to provide a character encoding
471 my $deduced_filename_encoding = "";
472 # If Windows NTFS, then -- assuming we work with long file names got through
473 # Win32::GetLongFilePath() -- then the underlying file system is UTF16
474
475 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
476 # Can do better than working with the DOS character encoding returned by locale
477 $deduced_filename_encoding = "unicode";
478 }
479 else {
480 # Unix of some form or other
481
482 # See if we can determine the file system encoding through locale
483 $deduced_filename_encoding = $self->locale_encoding(); #utf8??
484
485 }
486 return $deduced_filename_encoding;
487}
488
489
490# uses locale
491sub get_filesystem_encoding
492{
493
494 my $self = shift(@_);
495
496 my $outhandle = $self->{'outhandle'};
497 my $filesystem_encoding = undef;
498
499 eval {
500 # Works for Windows as well, returning the DOS code page in use
501 use POSIX qw(locale_h);
502
503 # With only one parameter, setlocale retrieves the
504 # current value
505 my $current_locale = setlocale(LC_CTYPE);
506
507 my $char_encoding = undef;
508 if ($current_locale =~ m/\./) {
509 ($char_encoding) = ($current_locale =~ m/^.*\.(.*?)$/);
510 $char_encoding = lc($char_encoding);
511 }
512 else {
513 if ($current_locale =~ m/^(posix|c)$/i) {
514 $char_encoding = "ascii";
515 }
516 }
517
518 if (defined $char_encoding) {
519 if ($char_encoding =~ m/^(iso)(8859)-?(\d{1,2})$/) {
520 $char_encoding = "$1\_$2\_$3";
521 }
522
523 $char_encoding =~ s/-/_/g;
524 $char_encoding =~ s/^utf_8$/utf8/;
525
526 if ($char_encoding =~ m/^\d+$/) {
527 if (defined $encodings::encodings->{"windows_$char_encoding"}) {
528 $char_encoding = "windows_$char_encoding";
529 }
530 elsif (defined $encodings::encodings->{"dos_$char_encoding"}) {
531 $char_encoding = "dos_$char_encoding";
532 }
533 }
534
535 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
536 || (defined $encodings::encodings->{$char_encoding})) {
537 $filesystem_encoding = $char_encoding;
538 }
539 else {
540 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
541 }
542 }
543
544
545 };
546 if ($@) {
547 print $outhandle "$@\n";
548 print $outhandle "Warning: Unable to establish locale. Will assume filesystem is UTF-8\n";
549
550 }
551
552 return $filesystem_encoding;
553}
554
555
556
557# write_file -- used by ConvertToPlug, for example in post processing
558#
559# where should this go, is here the best place??
560sub utf8_write_file {
561 my $self = shift (@_);
562 my ($textref, $filename) = @_;
563
564 if (!open (FILE, ">:utf8", $filename)) {
565 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
566 die "\n";
567 }
568 print FILE $$textref;
569
570 close FILE;
571}
572
573sub block_raw_filename {
574
575 my $self = shift (@_);
576 my ($block_hash,$filename_full_path) = @_;
577
578 my $unicode_filename = $self->raw_filename_to_unicode($filename_full_path);
579 return $self->block_filename($block_hash, $unicode_filename);
580}
581
582# block unicode string filename
583sub block_filename
584{
585 my $self = shift (@_);
586 my ($block_hash,$filename_full_path) = @_;
587
588 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
589 # block hash contains long names, lets make sure that we were passed a long name
590 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
591 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
592 my $lower_filename_full_path = lc($filename_full_path);
593 $block_hash->{'file_blocks'}->{$lower_filename_full_path} = 1;
594
595 }
596 elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
597 # we need to normalize the filenames
598 my $composed_filename_full_path = normalize('C', $filename_full_path);
599 ## print STDERR "darwin, composed filename =". &unicode::debug_unicode_string($composed_filename_full_path)."\n";
600 $block_hash->{'file_blocks'}->{$composed_filename_full_path} = 1;
601 }
602
603 else {
604 $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
605 }
606}
607
608
609# filename is raw filesystem name
610sub raw_file_is_blocked {
611 my $self = shift (@_);
612 my ($block_hash, $filename_full_path) = @_;
613
614 my $unicode_filename_full_path = $self->raw_filename_to_unicode($filename_full_path);
615 return $self->file_is_blocked($block_hash, $unicode_filename_full_path);
616}
617
618# filename must be perl unicode string
619sub file_is_blocked {
620 my $self = shift (@_);
621 my ($block_hash, $filename_full_path) = @_;
622
623 #
624 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
625 # convert to long filenames if needed
626 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
627 # all block paths are lowercased.
628 my $lower_filename = lc ($filename_full_path);
629 if (defined $block_hash->{'file_blocks'}->{$lower_filename}) {
630 $self->{'num_blocked'} ++;
631 return 1;
632 }
633 }
634 elsif ($ENV{'GSDLOS'} =~ m/^darwin$/) {
635
636 # on mac, we want composed form in the block hash
637 my $composed_form = normalize('C', $filename_full_path);
638 if (defined $block_hash->{'file_blocks'}->{$composed_form}) {
639 $self->{'num_blocked'} ++;
640 return 1;
641 }
642 }
643
644 else {
645 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
646 $self->{'num_blocked'} ++;
647 return 1;
648 }
649 }
650 # check Directory plugin's own block_exp
651 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
652 $self->{'num_blocked'} ++;
653 return 1; # blocked
654 }
655 return 0;
656}
657
658
6591;
660
Note: See TracBrowser for help on using the repository browser.