source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/Charset.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

  • Property svn:executable set to *
File size: 14.8 KB
Line 
1#------------------------------------------------------------------------------
2# File: Charset.pm
3#
4# Description: ExifTool character encoding routines
5#
6# Revisions: 2009/08/28 - P. Harvey created
7# 2010/01/20 - P. Harvey complete re-write
8# 2010/07/16 - P. Harvey added UTF-16 support
9#------------------------------------------------------------------------------
10
11package Image::ExifTool::Charset;
12
13use strict;
14use vars qw($VERSION %csType);
15use Image::ExifTool qw(:DataAccess :Utils);
16
17$VERSION = '1.07';
18
19my %charsetTable; # character set tables we've loaded
20
21# lookup for converting Unicode to 1-byte character sets
22my %unicode2byte = (
23 Latin => { # pre-load Latin (cp1252) for speed
24 0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
25 0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
26 0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
27 0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
28 0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
29 0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
30 0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
31 0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
32 0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
33 },
34);
35
36# bit flags for all supported character sets
37# (this number must be correct because it dictates the decoding algorithm!)
38# 0x001 = character set requires a translation module
39# 0x002 = inverse conversion not yet supported by Recompose()
40# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
41# 0x100 = 1-byte fixed-width characters
42# 0x200 = 2-byte fixed-width characters
43# 0x400 = 4-byte fixed-width characters
44# 0x800 = 1- and 2-byte variable-width characters, or 1-byte
45# fixed-width characters that map into multiple codepoints
46# Note: In its public interface, ExifTool can currently only support type 0x101
47# and lower character sets because strings are only converted if they
48# contain characters above 0x7f and there is no provision for specifying
49# the byte order for input/output values
50%csType = (
51 UTF8 => 0x100,
52 ASCII => 0x100, # (treated like UTF8)
53 Arabic => 0x101,
54 Baltic => 0x101,
55 Cyrillic => 0x101,
56 Greek => 0x101,
57 Hebrew => 0x101,
58 Latin => 0x101,
59 Latin2 => 0x101,
60 MacCroatian => 0x101,
61 MacCyrillic => 0x101,
62 MacGreek => 0x101,
63 MacIceland => 0x101,
64 MacLatin2 => 0x101,
65 MacRoman => 0x101,
66 MacRomanian => 0x101,
67 MacTurkish => 0x101,
68 Thai => 0x101,
69 Turkish => 0x101,
70 Vietnam => 0x101,
71 MacArabic => 0x103, # (directional characters not supported)
72 PDFDoc => 0x181,
73 Unicode => 0x200, # (UCS2)
74 UCS2 => 0x200,
75 UTF16 => 0x200,
76 Symbol => 0x201,
77 JIS => 0x201,
78 UCS4 => 0x400,
79 MacChineseCN => 0x803,
80 MacChineseTW => 0x803,
81 MacHebrew => 0x803, # (directional characters not supported)
82 MacKorean => 0x803,
83 MacRSymbol => 0x803,
84 MacThai => 0x803,
85 MacJapanese => 0x883,
86 ShiftJIS => 0x883,
87);
88
89#------------------------------------------------------------------------------
90# Load character set module
91# Inputs: 0) Module name
92# Returns: Reference to lookup hash, or undef on error
93sub LoadCharset($)
94{
95 my $charset = shift;
96 my $conv = $charsetTable{$charset};
97 unless ($conv) {
98 # load translation module
99 my $module = "Image::ExifTool::Charset::$charset";
100 no strict 'refs';
101 if (%$module or eval "require $module") {
102 $conv = $charsetTable{$charset} = \%$module;
103 }
104 }
105 return $conv;
106}
107
108#------------------------------------------------------------------------------
109# Decompose string with specified encoding into an array of integer code points
110# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
111# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
112# Returns: Reference to array of Unicode values
113# Notes: Accepts any type of character set
114# - byte order only used for fixed-width 2-byte and 4-byte character sets
115# - byte order mark observed and then removed with UCS2 and UCS4
116# - no warnings are issued if ExifTool object is not provided
117sub Decompose($$$;$)
118{
119 local $_;
120 my ($exifTool, $val, $charset) = @_; # ($byteOrder assigned later if required)
121 my $type = $csType{$charset};
122 my (@uni, $conv);
123
124 if ($type & 0x001) {
125 $conv = LoadCharset($charset);
126 unless ($conv) {
127 # (shouldn't happen)
128 $exifTool->Warn("Invalid character set $charset") if $exifTool;
129 return \@uni; # error!
130 }
131 } elsif ($type == 0x100) {
132 # convert ASCII and UTF8 (treat ASCII as UTF8)
133 if ($] < 5.006001) {
134 # do it ourself
135 @uni = Image::ExifTool::UnpackUTF8($val);
136 } else {
137 # handle warnings from malformed UTF-8
138 undef $Image::ExifTool::evalWarning;
139 local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
140 # (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
141 @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
142 # issue warning if we had errors
143 if ($Image::ExifTool::evalWarning and $exifTool and not $$exifTool{WarnBadUTF8}) {
144 $exifTool->Warn('Malformed UTF-8 character(s)');
145 $$exifTool{WarnBadUTF8} = 1;
146 }
147 }
148 return \@uni; # all done!
149 }
150 if ($type & 0x100) { # 1-byte fixed-width characters
151 @uni = unpack('C*', $val);
152 foreach (@uni) {
153 $_ = $$conv{$_} if defined $$conv{$_};
154 }
155 } elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
156 my $unknown;
157 my $byteOrder = $_[3];
158 if (not $byteOrder) {
159 $byteOrder = GetByteOrder();
160 } elsif ($byteOrder eq 'Unknown') {
161 $byteOrder = GetByteOrder();
162 $unknown = 1;
163 }
164 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
165 if ($type & 0x400) { # 4-byte
166 $fmt = uc $fmt; # unpack as 'N*' or 'V*'
167 # honour BOM if it exists
168 $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
169 undef $unknown; # (byte order logic applies to 2-byte only)
170 } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
171 $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
172 undef $unknown;
173 }
174 # convert from UCS2 or UCS4
175 @uni = unpack($fmt, $val);
176
177 if (not $conv) {
178 # no translation necessary
179 if ($unknown) {
180 # check the byte order
181 my (%bh, %bl);
182 my ($zh, $zl) = (0, 0);
183 foreach (@uni) {
184 $bh{$_ >> 8} = 1;
185 $bl{$_ & 0xff} = 1;
186 ++$zh unless $_ & 0xff00;
187 ++$zl unless $_ & 0x00ff;
188 }
189 # count the number of unique values in the hi and lo bytes
190 my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
191 # the byte with the greater number of unique values should be
192 # the low-order byte, otherwise the byte which is zero more
193 # often is likely the high-order byte
194 if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
195 # we guessed wrong, so decode using the other byte order
196 $fmt =~ tr/nvNV/vnVN/;
197 @uni = unpack($fmt, $val);
198 }
199 }
200 # handle surrogate pairs of UTF-16
201 if ($charset eq 'UTF16') {
202 my $i;
203 for ($i=0; $i<$#uni; ++$i) {
204 next unless ($uni[$i] & 0xfc00) == 0xd800 and
205 ($uni[$i+1] & 0xfc00) == 0xdc00;
206 my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
207 splice(@uni, $i, 2, $cp);
208 }
209 }
210 } elsif ($unknown) {
211 # count encoding errors as we do the translation
212 my $e1 = 0;
213 foreach (@uni) {
214 defined $$conv{$_} and $_ = $$conv{$_}, next;
215 ++$e1;
216 }
217 # try the other byte order if we had any errors
218 if ($e1) {
219 $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
220 my @try = unpack($fmt, $val);
221 my $e2 = 0;
222 foreach (@try) {
223 defined $$conv{$_} and $_ = $$conv{$_}, next;
224 ++$e2;
225 }
226 # use this byte order if there are fewer errors
227 return \@try if $e2 < $e1;
228 }
229 } else {
230 # translate any characters found in the lookup
231 foreach (@uni) {
232 $_ = $$conv{$_} if defined $$conv{$_};
233 }
234 }
235 } else { # variable-width characters
236 # unpack into bytes
237 my @bytes = unpack('C*', $val);
238 while (@bytes) {
239 my $ch = shift @bytes;
240 my $cv = $$conv{$ch};
241 # pass straight through if no translation
242 $cv or push(@uni, $ch), next;
243 # byte translates into single Unicode character
244 ref $cv or push(@uni, $cv), next;
245 # byte maps into multiple Unicode characters
246 ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
247 # handle 2-byte character codes
248 $ch = shift @bytes;
249 if (defined $ch) {
250 if ($$cv{$ch}) {
251 $cv = $$cv{$ch};
252 ref $cv or push(@uni, $cv), next;
253 push @uni, @$cv; # multiple Unicode characters
254 } else {
255 push @uni, ord('?'); # encoding error
256 unshift @bytes, $ch;
257 }
258 } else {
259 push @uni, ord('?'); # encoding error
260 }
261 }
262 }
263 return \@uni;
264}
265
266#------------------------------------------------------------------------------
267# Convert array of code point integers into a string with specified encoding
268# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
269# 2) character set (note: not all types are supported)
270# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
271# Returns: converted string (truncated at null character if it exists), empty on error
272# Notes: converts elements of input character array to new code points
273# - ExifTool ref may be undef provided $charset is defined
274sub Recompose($$;$$)
275{
276 local $_;
277 my ($exifTool, $uni, $charset) = @_; # ($byteOrder assigned later if required)
278 my ($outVal, $conv, $inv);
279 $charset or $charset = $$exifTool{OPTIONS}{Charset};
280 my $csType = $csType{$charset};
281 if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
282 if ($] >= 5.006001) {
283 # let Perl do it
284 $outVal = pack('C0U*', @$uni);
285 } else {
286 # do it ourself
287 $outVal = Image::ExifTool::PackUTF8(@$uni);
288 }
289 $outVal =~ s/\0.*//s; # truncate at null terminator
290 return $outVal;
291 }
292 # get references to forward and inverse lookup tables
293 if ($csType & 0x801) {
294 $conv = LoadCharset($charset);
295 unless ($conv) {
296 $exifTool->Warn("Missing charset $charset") if $exifTool;
297 return '';
298 }
299 $inv = $unicode2byte{$charset};
300 # generate inverse lookup if necessary
301 unless ($inv) {
302 if (not $csType or $csType & 0x802) {
303 $exifTool->Warn("Invalid destination charset $charset") if $exifTool;
304 return '';
305 }
306 # prepare table to convert from Unicode to 1-byte characters
307 my ($char, %inv);
308 foreach $char (keys %$conv) {
309 $inv{$$conv{$char}} = $char;
310 }
311 $inv = $unicode2byte{$charset} = \%inv;
312 }
313 }
314 if ($csType & 0x100) { # 1-byte fixed-width
315 # convert to specified character set
316 foreach (@$uni) {
317 next if $_ < 0x80;
318 $$inv{$_} and $_ = $$inv{$_}, next;
319 # our tables omit 1-byte characters with the same values as Unicode,
320 # so pass them straight through after making sure there isn't a
321 # different character with this byte value
322 next if $_ < 0x100 and not $$conv{$_};
323 $_ = ord('?'); # set invalid characters to '?'
324 if ($exifTool and not $$exifTool{EncodingError}) {
325 $exifTool->Warn("Some character(s) could not be encoded in $charset");
326 $$exifTool{EncodingError} = 1;
327 }
328 }
329 # repack as an 8-bit string and truncate at null
330 $outVal = pack('C*', @$uni);
331 $outVal =~ s/\0.*//s;
332 } else { # 2-byte and 4-byte fixed-width
333 # convert if required
334 if ($inv) {
335 $$inv{$_} and $_ = $$inv{$_} foreach @$uni;
336 }
337 # generate surrogate pairs of UTF-16
338 if ($charset eq 'UTF16') {
339 my $i;
340 for ($i=0; $i<@$uni; ++$i) {
341 next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
342 my $t = $$uni[$i] - 0x10000;
343 my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
344 my $w2 = 0xdc00 + ($t & 0x3ff);
345 splice(@$uni, $i, 1, $w1, $w2);
346 ++$i; # skip surrogate pair
347 }
348 }
349 # pack as 2- or 4-byte integer in specified byte order
350 my $byteOrder = $_[3] || GetByteOrder();
351 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
352 $fmt = uc($fmt) if $csType & 0x400;
353 $outVal = pack($fmt, @$uni);
354 }
355 return $outVal;
356}
357
3581; # end
359
360__END__
361
362=head1 NAME
363
364Image::ExifTool::Charset - ExifTool character encoding routines
365
366=head1 SYNOPSIS
367
368This module is required by Image::ExifTool.
369
370=head1 DESCRIPTION
371
372This module contains routines used by ExifTool to translate special
373character sets. Currently, the following character sets are supported:
374
375 UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
376 Latin, Latin2, MacArabic, MacChineseCN, MacChineseTW, MacCroatian,
377 MacCyrillic, MacGreek, MacHebrew, MacIceland, MacJapanese, MacKorean,
378 MacLatin2, MacRSymbol, MacRoman, MacRomanian, MacThai, MacTurkish,
379 PDFDoc, RSymbol, ShiftJIS, Symbol, Thai, Turkish, Vietnam
380
381However, only some of these character sets are available to the user via
382ExifTool options; the multi-byte character sets are used only internally
383when decoding certain types of information.
384
385=head1 AUTHOR
386
387Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
388
389This library is free software; you can redistribute it and/or modify it
390under the same terms as Perl itself.
391
392=head1 SEE ALSO
393
394L<Image::ExifTool(3pm)|Image::ExifTool>
395
396=cut
Note: See TracBrowser for help on using the repository browser.