source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Charset.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

  • Property svn:executable set to *
File size: 15.9 KB
Line 
1#------------------------------------------------------------------------------
2# File: Charset.pm
3#
4# Description: ExifTool character encoding routines
5#
6# Revisions: 2009/08/28 - P. Harvey created
7# 2010/01/20 - P. Harvey complete re-write
8# 2010/07/16 - P. Harvey added UTF-16 support
9#------------------------------------------------------------------------------
10
11package Image::ExifTool::Charset;
12
13use strict;
14use vars qw($VERSION %csType);
15use Image::ExifTool qw(:DataAccess :Utils);
16
17$VERSION = '1.11';
18
19my %charsetTable; # character set tables we've loaded
20
21# lookup for converting Unicode to 1-byte character sets
22my %unicode2byte = (
23 Latin => { # pre-load Latin (cp1252) for speed
24 0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
25 0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
26 0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
27 0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
28 0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
29 0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
30 0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
31 0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
32 0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
33 },
34);
35
36# bit flags for all supported character sets
37# (this number must be correct because it dictates the decoding algorithm!)
38# 0x001 = character set requires a translation module
39# 0x002 = inverse conversion not yet supported by Recompose()
40# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
41# 0x100 = 1-byte fixed-width characters
42# 0x200 = 2-byte fixed-width characters
43# 0x400 = 4-byte fixed-width characters
44# 0x800 = 1- and 2-byte variable-width characters, or 1-byte
45# fixed-width characters that map into multiple codepoints
46# Note: In its public interface, ExifTool can currently only support type 0x101
47# and lower character sets because strings are only converted if they
48# contain characters above 0x7f and there is no provision for specifying
49# the byte order for input/output values
50%csType = (
51 UTF8 => 0x100,
52 ASCII => 0x100, # (treated like UTF8)
53 Arabic => 0x101,
54 Baltic => 0x101,
55 Cyrillic => 0x101,
56 Greek => 0x101,
57 Hebrew => 0x101,
58 Latin => 0x101,
59 Latin2 => 0x101,
60 DOSLatinUS => 0x101,
61 DOSLatin1 => 0x101,
62 DOSCyrillic => 0x101,
63 MacCroatian => 0x101,
64 MacCyrillic => 0x101,
65 MacGreek => 0x101,
66 MacIceland => 0x101,
67 MacLatin2 => 0x101,
68 MacRoman => 0x101,
69 MacRomanian => 0x101,
70 MacTurkish => 0x101,
71 Thai => 0x101,
72 Turkish => 0x101,
73 Vietnam => 0x101,
74 MacArabic => 0x103, # (directional characters not supported)
75 PDFDoc => 0x181,
76 Unicode => 0x200, # (UCS2)
77 UCS2 => 0x200,
78 UTF16 => 0x200,
79 Symbol => 0x201,
80 JIS => 0x201,
81 UCS4 => 0x400,
82 MacChineseCN => 0x803,
83 MacChineseTW => 0x803,
84 MacHebrew => 0x803, # (directional characters not supported)
85 MacKorean => 0x803,
86 MacRSymbol => 0x803,
87 MacThai => 0x803,
88 MacJapanese => 0x883,
89 ShiftJIS => 0x883,
90);
91
92#------------------------------------------------------------------------------
93# Load character set module
94# Inputs: 0) Module name
95# Returns: Reference to lookup hash, or undef on error
96sub LoadCharset($)
97{
98 my $charset = shift;
99 my $conv = $charsetTable{$charset};
100 unless ($conv) {
101 # load translation module
102 my $module = "Image::ExifTool::Charset::$charset";
103 no strict 'refs';
104 if (%$module or eval "require $module") {
105 $conv = $charsetTable{$charset} = \%$module;
106 }
107 }
108 return $conv;
109}
110
111#------------------------------------------------------------------------------
112# Does an array contain valid UTF-16 characters?
113# Inputs: 0) array reference to list of UCS-2 values
114# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
115sub IsUTF16($)
116{
117 local $_;
118 my $uni = shift;
119 my $surrogate;
120 foreach (@$uni) {
121 my $hiBits = ($_ & 0xfc00);
122 if ($hiBits == 0xfc00) {
123 # check for invalid values in UTF-16
124 return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
125 } elsif ($surrogate) {
126 return 0 if $hiBits != 0xdc00;
127 $surrogate = 0;
128 } else {
129 return 0 if $hiBits == 0xdc00;
130 $surrogate = 1 if $hiBits == 0xd800;
131 }
132 }
133 return 1 if not defined $surrogate;
134 return 2 unless $surrogate;
135 return 0;
136}
137
138#------------------------------------------------------------------------------
139# Decompose string with specified encoding into an array of integer code points
140# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
141# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
142# Returns: Reference to array of Unicode values
143# Notes: Accepts any type of character set
144# - byte order only used for fixed-width 2-byte and 4-byte character sets
145# - byte order mark observed and then removed with UCS2 and UCS4
146# - no warnings are issued if ExifTool object is not provided
147# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
148sub Decompose($$$;$)
149{
150 local $_;
151 my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
152 my $type = $csType{$charset};
153 my (@uni, $conv);
154
155 if ($type & 0x001) {
156 $conv = LoadCharset($charset);
157 unless ($conv) {
158 # (shouldn't happen)
159 $et->Warn("Invalid character set $charset") if $et;
160 return \@uni; # error!
161 }
162 } elsif ($type == 0x100) {
163 # convert ASCII and UTF8 (treat ASCII as UTF8)
164 if ($] < 5.006001) {
165 # do it ourself
166 @uni = Image::ExifTool::UnpackUTF8($val);
167 } else {
168 # handle warnings from malformed UTF-8
169 undef $Image::ExifTool::evalWarning;
170 local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
171 # (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
172 @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
173 # issue warning if we had errors
174 if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
175 $et->Warn('Malformed UTF-8 character(s)');
176 $$et{WarnBadUTF8} = 1;
177 }
178 }
179 return \@uni; # all done!
180 }
181 if ($type & 0x100) { # 1-byte fixed-width characters
182 @uni = unpack('C*', $val);
183 foreach (@uni) {
184 $_ = $$conv{$_} if defined $$conv{$_};
185 }
186 } elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
187 my $unknown;
188 my $byteOrder = $_[3];
189 if (not $byteOrder) {
190 $byteOrder = GetByteOrder();
191 } elsif ($byteOrder eq 'Unknown') {
192 $byteOrder = GetByteOrder();
193 $unknown = 1;
194 }
195 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
196 if ($type & 0x400) { # 4-byte
197 $fmt = uc $fmt; # unpack as 'N*' or 'V*'
198 # honour BOM if it exists
199 $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
200 undef $unknown; # (byte order logic applies to 2-byte only)
201 } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
202 $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
203 undef $unknown;
204 }
205 # convert from UCS2 or UCS4
206 @uni = unpack($fmt, $val);
207
208 if (not $conv) {
209 # no translation necessary
210 if ($unknown) {
211 # check the byte order
212 my (%bh, %bl);
213 my ($zh, $zl) = (0, 0);
214 foreach (@uni) {
215 $bh{$_ >> 8} = 1;
216 $bl{$_ & 0xff} = 1;
217 ++$zh unless $_ & 0xff00;
218 ++$zl unless $_ & 0x00ff;
219 }
220 # count the number of unique values in the hi and lo bytes
221 my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
222 # the byte with the greater number of unique values should be
223 # the low-order byte, otherwise the byte which is zero more
224 # often is likely the high-order byte
225 if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
226 # we guessed wrong, so decode using the other byte order
227 $fmt =~ tr/nvNV/vnVN/;
228 @uni = unpack($fmt, $val);
229 $$et{WrongByteOrder} = 1;
230 }
231 }
232 # handle surrogate pairs of UTF-16
233 if ($charset eq 'UTF16') {
234 my $i;
235 for ($i=0; $i<$#uni; ++$i) {
236 next unless ($uni[$i] & 0xfc00) == 0xd800 and
237 ($uni[$i+1] & 0xfc00) == 0xdc00;
238 my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
239 splice(@uni, $i, 2, $cp);
240 }
241 }
242 } elsif ($unknown) {
243 # count encoding errors as we do the translation
244 my $e1 = 0;
245 foreach (@uni) {
246 defined $$conv{$_} and $_ = $$conv{$_}, next;
247 ++$e1;
248 }
249 # try the other byte order if we had any errors
250 if ($e1) {
251 $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
252 my @try = unpack($fmt, $val);
253 my $e2 = 0;
254 foreach (@try) {
255 defined $$conv{$_} and $_ = $$conv{$_}, next;
256 ++$e2;
257 }
258 # use this byte order if there are fewer errors
259 if ($e2 < $e1) {
260 $$et{WrongByteOrder} = 1;
261 return \@try;
262 }
263 }
264 } else {
265 # translate any characters found in the lookup
266 foreach (@uni) {
267 $_ = $$conv{$_} if defined $$conv{$_};
268 }
269 }
270 } else { # variable-width characters
271 # unpack into bytes
272 my @bytes = unpack('C*', $val);
273 while (@bytes) {
274 my $ch = shift @bytes;
275 my $cv = $$conv{$ch};
276 # pass straight through if no translation
277 $cv or push(@uni, $ch), next;
278 # byte translates into single Unicode character
279 ref $cv or push(@uni, $cv), next;
280 # byte maps into multiple Unicode characters
281 ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
282 # handle 2-byte character codes
283 $ch = shift @bytes;
284 if (defined $ch) {
285 if ($$cv{$ch}) {
286 $cv = $$cv{$ch};
287 ref $cv or push(@uni, $cv), next;
288 push @uni, @$cv; # multiple Unicode characters
289 } else {
290 push @uni, ord('?'); # encoding error
291 unshift @bytes, $ch;
292 }
293 } else {
294 push @uni, ord('?'); # encoding error
295 }
296 }
297 }
298 return \@uni;
299}
300
301#------------------------------------------------------------------------------
302# Convert array of code point integers into a string with specified encoding
303# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
304# 2) character set (note: not all types are supported)
305# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
306# Returns: converted string (truncated at null character if it exists), empty on error
307# Notes: converts elements of input character array to new code points
308# - ExifTool ref may be undef provided $charset is defined
309sub Recompose($$;$$)
310{
311 local $_;
312 my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
313 my ($outVal, $conv, $inv);
314 $charset or $charset = $$et{OPTIONS}{Charset};
315 my $csType = $csType{$charset};
316 if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
317 if ($] >= 5.006001) {
318 # let Perl do it
319 $outVal = pack('C0U*', @$uni);
320 } else {
321 # do it ourself
322 $outVal = Image::ExifTool::PackUTF8(@$uni);
323 }
324 $outVal =~ s/\0.*//s; # truncate at null terminator
325 return $outVal;
326 }
327 # get references to forward and inverse lookup tables
328 if ($csType & 0x801) {
329 $conv = LoadCharset($charset);
330 unless ($conv) {
331 $et->Warn("Missing charset $charset") if $et;
332 return '';
333 }
334 $inv = $unicode2byte{$charset};
335 # generate inverse lookup if necessary
336 unless ($inv) {
337 if (not $csType or $csType & 0x802) {
338 $et->Warn("Invalid destination charset $charset") if $et;
339 return '';
340 }
341 # prepare table to convert from Unicode to 1-byte characters
342 my ($char, %inv);
343 foreach $char (keys %$conv) {
344 $inv{$$conv{$char}} = $char;
345 }
346 $inv = $unicode2byte{$charset} = \%inv;
347 }
348 }
349 if ($csType & 0x100) { # 1-byte fixed-width
350 # convert to specified character set
351 foreach (@$uni) {
352 next if $_ < 0x80;
353 $$inv{$_} and $_ = $$inv{$_}, next;
354 # our tables omit 1-byte characters with the same values as Unicode,
355 # so pass them straight through after making sure there isn't a
356 # different character with this byte value
357 next if $_ < 0x100 and not $$conv{$_};
358 $_ = ord('?'); # set invalid characters to '?'
359 if ($et and not $$et{EncodingError}) {
360 $et->Warn("Some character(s) could not be encoded in $charset");
361 $$et{EncodingError} = 1;
362 }
363 }
364 # repack as an 8-bit string and truncate at null
365 $outVal = pack('C*', @$uni);
366 $outVal =~ s/\0.*//s;
367 } else { # 2-byte and 4-byte fixed-width
368 # convert if required
369 if ($inv) {
370 $$inv{$_} and $_ = $$inv{$_} foreach @$uni;
371 }
372 # generate surrogate pairs of UTF-16
373 if ($charset eq 'UTF16') {
374 my $i;
375 for ($i=0; $i<@$uni; ++$i) {
376 next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
377 my $t = $$uni[$i] - 0x10000;
378 my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
379 my $w2 = 0xdc00 + ($t & 0x3ff);
380 splice(@$uni, $i, 1, $w1, $w2);
381 ++$i; # skip surrogate pair
382 }
383 }
384 # pack as 2- or 4-byte integer in specified byte order
385 my $byteOrder = $_[3] || GetByteOrder();
386 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
387 $fmt = uc($fmt) if $csType & 0x400;
388 $outVal = pack($fmt, @$uni);
389 }
390 return $outVal;
391}
392
3931; # end
394
395__END__
396
397=head1 NAME
398
399Image::ExifTool::Charset - ExifTool character encoding routines
400
401=head1 SYNOPSIS
402
403This module is required by Image::ExifTool.
404
405=head1 DESCRIPTION
406
407This module contains routines used by ExifTool to translate special
408character sets. Currently, the following character sets are supported:
409
410 UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
411 Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic,
412 MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew,
413 MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman,
414 MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai,
415 Turkish, Vietnam
416
417However, only some of these character sets are available to the user via
418ExifTool options -- the multi-byte character sets are used only internally
419when decoding certain types of information.
420
421=head1 AUTHOR
422
423Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
424
425This library is free software; you can redistribute it and/or modify it
426under the same terms as Perl itself.
427
428=head1 SEE ALSO
429
430L<Image::ExifTool(3pm)|Image::ExifTool>
431
432=cut
Note: See TracBrowser for help on using the repository browser.