Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/Charset.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago
An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)
Property svn:executable set to ``*
File size: 14.8 KB

Line
1	#------------------------------------------------------------------------------
2	# File: Charset.pm
3	#
4	# Description: ExifTool character encoding routines
5	#
6	# Revisions: 2009/08/28 - P. Harvey created
7	# 2010/01/20 - P. Harvey complete re-write
8	# 2010/07/16 - P. Harvey added UTF-16 support
9	#------------------------------------------------------------------------------
10
11	package Image::ExifTool::Charset;
12
13	use strict;
14	use vars qw($VERSION %csType);
15	use Image::ExifTool qw(:DataAccess :Utils);
16
17	$VERSION = '1.07';
18
19	my %charsetTable; # character set tables we've loaded
20
21	# lookup for converting Unicode to 1-byte character sets
22	my %unicode2byte = (
23	Latin => { # pre-load Latin (cp1252) for speed
24	0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
25	0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
26	0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
27	0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
28	0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
29	0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
30	0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
31	0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
32	0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
33	},
34	);
35
36	# bit flags for all supported character sets
37	# (this number must be correct because it dictates the decoding algorithm!)
38	# 0x001 = character set requires a translation module
39	# 0x002 = inverse conversion not yet supported by Recompose()
40	# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
41	# 0x100 = 1-byte fixed-width characters
42	# 0x200 = 2-byte fixed-width characters
43	# 0x400 = 4-byte fixed-width characters
44	# 0x800 = 1- and 2-byte variable-width characters, or 1-byte
45	# fixed-width characters that map into multiple codepoints
46	# Note: In its public interface, ExifTool can currently only support type 0x101
47	# and lower character sets because strings are only converted if they
48	# contain characters above 0x7f and there is no provision for specifying
49	# the byte order for input/output values
50	%csType = (
51	UTF8 => 0x100,
52	ASCII => 0x100, # (treated like UTF8)
53	Arabic => 0x101,
54	Baltic => 0x101,
55	Cyrillic => 0x101,
56	Greek => 0x101,
57	Hebrew => 0x101,
58	Latin => 0x101,
59	Latin2 => 0x101,
60	MacCroatian => 0x101,
61	MacCyrillic => 0x101,
62	MacGreek => 0x101,
63	MacIceland => 0x101,
64	MacLatin2 => 0x101,
65	MacRoman => 0x101,
66	MacRomanian => 0x101,
67	MacTurkish => 0x101,
68	Thai => 0x101,
69	Turkish => 0x101,
70	Vietnam => 0x101,
71	MacArabic => 0x103, # (directional characters not supported)
72	PDFDoc => 0x181,
73	Unicode => 0x200, # (UCS2)
74	UCS2 => 0x200,
75	UTF16 => 0x200,
76	Symbol => 0x201,
77	JIS => 0x201,
78	UCS4 => 0x400,
79	MacChineseCN => 0x803,
80	MacChineseTW => 0x803,
81	MacHebrew => 0x803, # (directional characters not supported)
82	MacKorean => 0x803,
83	MacRSymbol => 0x803,
84	MacThai => 0x803,
85	MacJapanese => 0x883,
86	ShiftJIS => 0x883,
87	);
88
89	#------------------------------------------------------------------------------
90	# Load character set module
91	# Inputs: 0) Module name
92	# Returns: Reference to lookup hash, or undef on error
93	sub LoadCharset($)
94	{
95	my $charset = shift;
96	my $conv = $charsetTable{$charset};
97	unless ($conv) {
98	# load translation module
99	my $module = "Image::ExifTool::Charset::$charset";
100	no strict 'refs';
101	if (%$module or eval "require $module") {
102	$conv = $charsetTable{$charset} = \%$module;
103	}
104	}
105	return $conv;
106	}
107
108	#------------------------------------------------------------------------------
109	# Decompose string with specified encoding into an array of integer code points
110	# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
111	# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
112	# Returns: Reference to array of Unicode values
113	# Notes: Accepts any type of character set
114	# - byte order only used for fixed-width 2-byte and 4-byte character sets
115	# - byte order mark observed and then removed with UCS2 and UCS4
116	# - no warnings are issued if ExifTool object is not provided
117	sub Decompose($$$;$)
118	{
119	local $_;
120	my ($exifTool, $val, $charset) = @_; # ($byteOrder assigned later if required)
121	my $type = $csType{$charset};
122	my (@uni, $conv);
123
124	if ($type & 0x001) {
125	$conv = LoadCharset($charset);
126	unless ($conv) {
127	# (shouldn't happen)
128	$exifTool->Warn("Invalid character set $charset") if $exifTool;
129	return \@uni; # error!
130	}
131	} elsif ($type == 0x100) {
132	# convert ASCII and UTF8 (treat ASCII as UTF8)
133	if ($] < 5.006001) {
134	# do it ourself
135	@uni = Image::ExifTool::UnpackUTF8($val);
136	} else {
137	# handle warnings from malformed UTF-8
138	undef $Image::ExifTool::evalWarning;
139	local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
140	# (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
141	@uni = unpack($] < 5.010000 ? 'U0U' : 'C0U', $val);
142	# issue warning if we had errors
143	if ($Image::ExifTool::evalWarning and $exifTool and not $$exifTool{WarnBadUTF8}) {
144	$exifTool->Warn('Malformed UTF-8 character(s)');
145	$$exifTool{WarnBadUTF8} = 1;
146	}
147	}
148	return \@uni; # all done!
149	}
150	if ($type & 0x100) { # 1-byte fixed-width characters
151	@uni = unpack('C*', $val);
152	foreach (@uni) {
153	$_ = $$conv{$_} if defined $$conv{$_};
154	}
155	} elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
156	my $unknown;
157	my $byteOrder = $_[3];
158	if (not $byteOrder) {
159	$byteOrder = GetByteOrder();
160	} elsif ($byteOrder eq 'Unknown') {
161	$byteOrder = GetByteOrder();
162	$unknown = 1;
163	}
164	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
165	if ($type & 0x400) { # 4-byte
166	$fmt = uc $fmt; # unpack as 'N' or 'V'
167	# honour BOM if it exists
168	$val =~ s/^(\0\0\xfe\xff\|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N' : 'V';
169	undef $unknown; # (byte order logic applies to 2-byte only)
170	} elsif ($val =~ s/^(\xfe\xff\|\xff\xfe)//) {
171	$fmt = $1 eq "\xfe\xff" ? 'n' : 'v';
172	undef $unknown;
173	}
174	# convert from UCS2 or UCS4
175	@uni = unpack($fmt, $val);
176
177	if (not $conv) {
178	# no translation necessary
179	if ($unknown) {
180	# check the byte order
181	my (%bh, %bl);
182	my ($zh, $zl) = (0, 0);
183	foreach (@uni) {
184	$bh{$_ >> 8} = 1;
185	$bl{$_ & 0xff} = 1;
186	++$zh unless $_ & 0xff00;
187	++$zl unless $_ & 0x00ff;
188	}
189	# count the number of unique values in the hi and lo bytes
190	my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
191	# the byte with the greater number of unique values should be
192	# the low-order byte, otherwise the byte which is zero more
193	# often is likely the high-order byte
194	if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
195	# we guessed wrong, so decode using the other byte order
196	$fmt =~ tr/nvNV/vnVN/;
197	@uni = unpack($fmt, $val);
198	}
199	}
200	# handle surrogate pairs of UTF-16
201	if ($charset eq 'UTF16') {
202	my $i;
203	for ($i=0; $i<$#uni; ++$i) {
204	next unless ($uni[$i] & 0xfc00) == 0xd800 and
205	($uni[$i+1] & 0xfc00) == 0xdc00;
206	my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
207	splice(@uni, $i, 2, $cp);
208	}
209	}
210	} elsif ($unknown) {
211	# count encoding errors as we do the translation
212	my $e1 = 0;
213	foreach (@uni) {
214	defined $$conv{$_} and $_ = $$conv{$_}, next;
215	++$e1;
216	}
217	# try the other byte order if we had any errors
218	if ($e1) {
219	$fmt = $byteOrder eq 'MM' ? 'v' : 'n'; #(reversed)
220	my @try = unpack($fmt, $val);
221	my $e2 = 0;
222	foreach (@try) {
223	defined $$conv{$_} and $_ = $$conv{$_}, next;
224	++$e2;
225	}
226	# use this byte order if there are fewer errors
227	return \@try if $e2 < $e1;
228	}
229	} else {
230	# translate any characters found in the lookup
231	foreach (@uni) {
232	$_ = $$conv{$_} if defined $$conv{$_};
233	}
234	}
235	} else { # variable-width characters
236	# unpack into bytes
237	my @bytes = unpack('C*', $val);
238	while (@bytes) {
239	my $ch = shift @bytes;
240	my $cv = $$conv{$ch};
241	# pass straight through if no translation
242	$cv or push(@uni, $ch), next;
243	# byte translates into single Unicode character
244	ref $cv or push(@uni, $cv), next;
245	# byte maps into multiple Unicode characters
246	ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
247	# handle 2-byte character codes
248	$ch = shift @bytes;
249	if (defined $ch) {
250	if ($$cv{$ch}) {
251	$cv = $$cv{$ch};
252	ref $cv or push(@uni, $cv), next;
253	push @uni, @$cv; # multiple Unicode characters
254	} else {
255	push @uni, ord('?'); # encoding error
256	unshift @bytes, $ch;
257	}
258	} else {
259	push @uni, ord('?'); # encoding error
260	}
261	}
262	}
263	return \@uni;
264	}
265
266	#------------------------------------------------------------------------------
267	# Convert array of code point integers into a string with specified encoding
268	# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
269	# 2) character set (note: not all types are supported)
270	# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
271	# Returns: converted string (truncated at null character if it exists), empty on error
272	# Notes: converts elements of input character array to new code points
273	# - ExifTool ref may be undef provided $charset is defined
274	sub Recompose($$;$$)
275	{
276	local $_;
277	my ($exifTool, $uni, $charset) = @_; # ($byteOrder assigned later if required)
278	my ($outVal, $conv, $inv);
279	$charset or $charset = $$exifTool{OPTIONS}{Charset};
280	my $csType = $csType{$charset};
281	if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
282	if ($] >= 5.006001) {
283	# let Perl do it
284	$outVal = pack('C0U*', @$uni);
285	} else {
286	# do it ourself
287	$outVal = Image::ExifTool::PackUTF8(@$uni);
288	}
289	$outVal =~ s/\0.*//s; # truncate at null terminator
290	return $outVal;
291	}
292	# get references to forward and inverse lookup tables
293	if ($csType & 0x801) {
294	$conv = LoadCharset($charset);
295	unless ($conv) {
296	$exifTool->Warn("Missing charset $charset") if $exifTool;
297	return '';
298	}
299	$inv = $unicode2byte{$charset};
300	# generate inverse lookup if necessary
301	unless ($inv) {
302	if (not $csType or $csType & 0x802) {
303	$exifTool->Warn("Invalid destination charset $charset") if $exifTool;
304	return '';
305	}
306	# prepare table to convert from Unicode to 1-byte characters
307	my ($char, %inv);
308	foreach $char (keys %$conv) {
309	$inv{$$conv{$char}} = $char;
310	}
311	$inv = $unicode2byte{$charset} = \%inv;
312	}
313	}
314	if ($csType & 0x100) { # 1-byte fixed-width
315	# convert to specified character set
316	foreach (@$uni) {
317	next if $_ < 0x80;
318	$$inv{$_} and $_ = $$inv{$_}, next;
319	# our tables omit 1-byte characters with the same values as Unicode,
320	# so pass them straight through after making sure there isn't a
321	# different character with this byte value
322	next if $_ < 0x100 and not $$conv{$_};
323	$_ = ord('?'); # set invalid characters to '?'
324	if ($exifTool and not $$exifTool{EncodingError}) {
325	$exifTool->Warn("Some character(s) could not be encoded in $charset");
326	$$exifTool{EncodingError} = 1;
327	}
328	}
329	# repack as an 8-bit string and truncate at null
330	$outVal = pack('C*', @$uni);
331	$outVal =~ s/\0.*//s;
332	} else { # 2-byte and 4-byte fixed-width
333	# convert if required
334	if ($inv) {
335	$$inv{$_} and $_ = $$inv{$_} foreach @$uni;
336	}
337	# generate surrogate pairs of UTF-16
338	if ($charset eq 'UTF16') {
339	my $i;
340	for ($i=0; $i<@$uni; ++$i) {
341	next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
342	my $t = $$uni[$i] - 0x10000;
343	my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
344	my $w2 = 0xdc00 + ($t & 0x3ff);
345	splice(@$uni, $i, 1, $w1, $w2);
346	++$i; # skip surrogate pair
347	}
348	}
349	# pack as 2- or 4-byte integer in specified byte order
350	my $byteOrder = $_[3] \|\| GetByteOrder();
351	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
352	$fmt = uc($fmt) if $csType & 0x400;
353	$outVal = pack($fmt, @$uni);
354	}
355	return $outVal;
356	}
357
358	1; # end
359
360	__END__
361
362	=head1 NAME
363
364	Image::ExifTool::Charset - ExifTool character encoding routines
365
366	=head1 SYNOPSIS
367
368	This module is required by Image::ExifTool.
369
370	=head1 DESCRIPTION
371
372	This module contains routines used by ExifTool to translate special
373	character sets. Currently, the following character sets are supported:
374
375	UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
376	Latin, Latin2, MacArabic, MacChineseCN, MacChineseTW, MacCroatian,
377	MacCyrillic, MacGreek, MacHebrew, MacIceland, MacJapanese, MacKorean,
378	MacLatin2, MacRSymbol, MacRoman, MacRomanian, MacThai, MacTurkish,
379	PDFDoc, RSymbol, ShiftJIS, Symbol, Thai, Turkish, Vietnam
380
381	However, only some of these character sets are available to the user via
382	ExifTool options; the multi-byte character sets are used only internally
383	when decoding certain types of information.
384
385	=head1 AUTHOR
386
387	Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
388
389	This library is free software; you can redistribute it and/or modify it
390	under the same terms as Perl itself.
391
392	=head1 SEE ALSO
393
394	L<Image::ExifTool(3pm)\|Image::ExifTool>
395
396	=cut

Note: See TracBrowser for help on using the repository browser.

Download in other formats: