Context Navigation

Charset.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago
Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.
Property svn:executable set to ``*
File size: 15.9 KB

Line
1	#------------------------------------------------------------------------------
2	# File: Charset.pm
3	#
4	# Description: ExifTool character encoding routines
5	#
6	# Revisions: 2009/08/28 - P. Harvey created
7	# 2010/01/20 - P. Harvey complete re-write
8	# 2010/07/16 - P. Harvey added UTF-16 support
9	#------------------------------------------------------------------------------
10
11	package Image::ExifTool::Charset;
12
13	use strict;
14	use vars qw($VERSION %csType);
15	use Image::ExifTool qw(:DataAccess :Utils);
16
17	$VERSION = '1.11';
18
19	my %charsetTable; # character set tables we've loaded
20
21	# lookup for converting Unicode to 1-byte character sets
22	my %unicode2byte = (
23	Latin => { # pre-load Latin (cp1252) for speed
24	0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96,
25	0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97,
26	0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98,
27	0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99,
28	0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a,
29	0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b,
30	0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c,
31	0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e,
32	0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f,
33	},
34	);
35
36	# bit flags for all supported character sets
37	# (this number must be correct because it dictates the decoding algorithm!)
38	# 0x001 = character set requires a translation module
39	# 0x002 = inverse conversion not yet supported by Recompose()
40	# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
41	# 0x100 = 1-byte fixed-width characters
42	# 0x200 = 2-byte fixed-width characters
43	# 0x400 = 4-byte fixed-width characters
44	# 0x800 = 1- and 2-byte variable-width characters, or 1-byte
45	# fixed-width characters that map into multiple codepoints
46	# Note: In its public interface, ExifTool can currently only support type 0x101
47	# and lower character sets because strings are only converted if they
48	# contain characters above 0x7f and there is no provision for specifying
49	# the byte order for input/output values
50	%csType = (
51	UTF8 => 0x100,
52	ASCII => 0x100, # (treated like UTF8)
53	Arabic => 0x101,
54	Baltic => 0x101,
55	Cyrillic => 0x101,
56	Greek => 0x101,
57	Hebrew => 0x101,
58	Latin => 0x101,
59	Latin2 => 0x101,
60	DOSLatinUS => 0x101,
61	DOSLatin1 => 0x101,
62	DOSCyrillic => 0x101,
63	MacCroatian => 0x101,
64	MacCyrillic => 0x101,
65	MacGreek => 0x101,
66	MacIceland => 0x101,
67	MacLatin2 => 0x101,
68	MacRoman => 0x101,
69	MacRomanian => 0x101,
70	MacTurkish => 0x101,
71	Thai => 0x101,
72	Turkish => 0x101,
73	Vietnam => 0x101,
74	MacArabic => 0x103, # (directional characters not supported)
75	PDFDoc => 0x181,
76	Unicode => 0x200, # (UCS2)
77	UCS2 => 0x200,
78	UTF16 => 0x200,
79	Symbol => 0x201,
80	JIS => 0x201,
81	UCS4 => 0x400,
82	MacChineseCN => 0x803,
83	MacChineseTW => 0x803,
84	MacHebrew => 0x803, # (directional characters not supported)
85	MacKorean => 0x803,
86	MacRSymbol => 0x803,
87	MacThai => 0x803,
88	MacJapanese => 0x883,
89	ShiftJIS => 0x883,
90	);
91
92	#------------------------------------------------------------------------------
93	# Load character set module
94	# Inputs: 0) Module name
95	# Returns: Reference to lookup hash, or undef on error
96	sub LoadCharset($)
97	{
98	my $charset = shift;
99	my $conv = $charsetTable{$charset};
100	unless ($conv) {
101	# load translation module
102	my $module = "Image::ExifTool::Charset::$charset";
103	no strict 'refs';
104	if (%$module or eval "require $module") {
105	$conv = $charsetTable{$charset} = \%$module;
106	}
107	}
108	return $conv;
109	}
110
111	#------------------------------------------------------------------------------
112	# Does an array contain valid UTF-16 characters?
113	# Inputs: 0) array reference to list of UCS-2 values
114	# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
115	sub IsUTF16($)
116	{
117	local $_;
118	my $uni = shift;
119	my $surrogate;
120	foreach (@$uni) {
121	my $hiBits = ($_ & 0xfc00);
122	if ($hiBits == 0xfc00) {
123	# check for invalid values in UTF-16
124	return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
125	} elsif ($surrogate) {
126	return 0 if $hiBits != 0xdc00;
127	$surrogate = 0;
128	} else {
129	return 0 if $hiBits == 0xdc00;
130	$surrogate = 1 if $hiBits == 0xd800;
131	}
132	}
133	return 1 if not defined $surrogate;
134	return 2 unless $surrogate;
135	return 0;
136	}
137
138	#------------------------------------------------------------------------------
139	# Decompose string with specified encoding into an array of integer code points
140	# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
141	# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
142	# Returns: Reference to array of Unicode values
143	# Notes: Accepts any type of character set
144	# - byte order only used for fixed-width 2-byte and 4-byte character sets
145	# - byte order mark observed and then removed with UCS2 and UCS4
146	# - no warnings are issued if ExifTool object is not provided
147	# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
148	sub Decompose($$$;$)
149	{
150	local $_;
151	my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
152	my $type = $csType{$charset};
153	my (@uni, $conv);
154
155	if ($type & 0x001) {
156	$conv = LoadCharset($charset);
157	unless ($conv) {
158	# (shouldn't happen)
159	$et->Warn("Invalid character set $charset") if $et;
160	return \@uni; # error!
161	}
162	} elsif ($type == 0x100) {
163	# convert ASCII and UTF8 (treat ASCII as UTF8)
164	if ($] < 5.006001) {
165	# do it ourself
166	@uni = Image::ExifTool::UnpackUTF8($val);
167	} else {
168	# handle warnings from malformed UTF-8
169	undef $Image::ExifTool::evalWarning;
170	local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
171	# (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
172	@uni = unpack($] < 5.010000 ? 'U0U' : 'C0U', $val);
173	# issue warning if we had errors
174	if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
175	$et->Warn('Malformed UTF-8 character(s)');
176	$$et{WarnBadUTF8} = 1;
177	}
178	}
179	return \@uni; # all done!
180	}
181	if ($type & 0x100) { # 1-byte fixed-width characters
182	@uni = unpack('C*', $val);
183	foreach (@uni) {
184	$_ = $$conv{$_} if defined $$conv{$_};
185	}
186	} elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters
187	my $unknown;
188	my $byteOrder = $_[3];
189	if (not $byteOrder) {
190	$byteOrder = GetByteOrder();
191	} elsif ($byteOrder eq 'Unknown') {
192	$byteOrder = GetByteOrder();
193	$unknown = 1;
194	}
195	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
196	if ($type & 0x400) { # 4-byte
197	$fmt = uc $fmt; # unpack as 'N' or 'V'
198	# honour BOM if it exists
199	$val =~ s/^(\0\0\xfe\xff\|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N' : 'V';
200	undef $unknown; # (byte order logic applies to 2-byte only)
201	} elsif ($val =~ s/^(\xfe\xff\|\xff\xfe)//) {
202	$fmt = $1 eq "\xfe\xff" ? 'n' : 'v';
203	undef $unknown;
204	}
205	# convert from UCS2 or UCS4
206	@uni = unpack($fmt, $val);
207
208	if (not $conv) {
209	# no translation necessary
210	if ($unknown) {
211	# check the byte order
212	my (%bh, %bl);
213	my ($zh, $zl) = (0, 0);
214	foreach (@uni) {
215	$bh{$_ >> 8} = 1;
216	$bl{$_ & 0xff} = 1;
217	++$zh unless $_ & 0xff00;
218	++$zl unless $_ & 0x00ff;
219	}
220	# count the number of unique values in the hi and lo bytes
221	my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
222	# the byte with the greater number of unique values should be
223	# the low-order byte, otherwise the byte which is zero more
224	# often is likely the high-order byte
225	if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
226	# we guessed wrong, so decode using the other byte order
227	$fmt =~ tr/nvNV/vnVN/;
228	@uni = unpack($fmt, $val);
229	$$et{WrongByteOrder} = 1;
230	}
231	}
232	# handle surrogate pairs of UTF-16
233	if ($charset eq 'UTF16') {
234	my $i;
235	for ($i=0; $i<$#uni; ++$i) {
236	next unless ($uni[$i] & 0xfc00) == 0xd800 and
237	($uni[$i+1] & 0xfc00) == 0xdc00;
238	my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
239	splice(@uni, $i, 2, $cp);
240	}
241	}
242	} elsif ($unknown) {
243	# count encoding errors as we do the translation
244	my $e1 = 0;
245	foreach (@uni) {
246	defined $$conv{$_} and $_ = $$conv{$_}, next;
247	++$e1;
248	}
249	# try the other byte order if we had any errors
250	if ($e1) {
251	$fmt = $byteOrder eq 'MM' ? 'v' : 'n'; #(reversed)
252	my @try = unpack($fmt, $val);
253	my $e2 = 0;
254	foreach (@try) {
255	defined $$conv{$_} and $_ = $$conv{$_}, next;
256	++$e2;
257	}
258	# use this byte order if there are fewer errors
259	if ($e2 < $e1) {
260	$$et{WrongByteOrder} = 1;
261	return \@try;
262	}
263	}
264	} else {
265	# translate any characters found in the lookup
266	foreach (@uni) {
267	$_ = $$conv{$_} if defined $$conv{$_};
268	}
269	}
270	} else { # variable-width characters
271	# unpack into bytes
272	my @bytes = unpack('C*', $val);
273	while (@bytes) {
274	my $ch = shift @bytes;
275	my $cv = $$conv{$ch};
276	# pass straight through if no translation
277	$cv or push(@uni, $ch), next;
278	# byte translates into single Unicode character
279	ref $cv or push(@uni, $cv), next;
280	# byte maps into multiple Unicode characters
281	ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
282	# handle 2-byte character codes
283	$ch = shift @bytes;
284	if (defined $ch) {
285	if ($$cv{$ch}) {
286	$cv = $$cv{$ch};
287	ref $cv or push(@uni, $cv), next;
288	push @uni, @$cv; # multiple Unicode characters
289	} else {
290	push @uni, ord('?'); # encoding error
291	unshift @bytes, $ch;
292	}
293	} else {
294	push @uni, ord('?'); # encoding error
295	}
296	}
297	}
298	return \@uni;
299	}
300
301	#------------------------------------------------------------------------------
302	# Convert array of code point integers into a string with specified encoding
303	# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
304	# 2) character set (note: not all types are supported)
305	# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
306	# Returns: converted string (truncated at null character if it exists), empty on error
307	# Notes: converts elements of input character array to new code points
308	# - ExifTool ref may be undef provided $charset is defined
309	sub Recompose($$;$$)
310	{
311	local $_;
312	my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
313	my ($outVal, $conv, $inv);
314	$charset or $charset = $$et{OPTIONS}{Charset};
315	my $csType = $csType{$charset};
316	if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8)
317	if ($] >= 5.006001) {
318	# let Perl do it
319	$outVal = pack('C0U*', @$uni);
320	} else {
321	# do it ourself
322	$outVal = Image::ExifTool::PackUTF8(@$uni);
323	}
324	$outVal =~ s/\0.*//s; # truncate at null terminator
325	return $outVal;
326	}
327	# get references to forward and inverse lookup tables
328	if ($csType & 0x801) {
329	$conv = LoadCharset($charset);
330	unless ($conv) {
331	$et->Warn("Missing charset $charset") if $et;
332	return '';
333	}
334	$inv = $unicode2byte{$charset};
335	# generate inverse lookup if necessary
336	unless ($inv) {
337	if (not $csType or $csType & 0x802) {
338	$et->Warn("Invalid destination charset $charset") if $et;
339	return '';
340	}
341	# prepare table to convert from Unicode to 1-byte characters
342	my ($char, %inv);
343	foreach $char (keys %$conv) {
344	$inv{$$conv{$char}} = $char;
345	}
346	$inv = $unicode2byte{$charset} = \%inv;
347	}
348	}
349	if ($csType & 0x100) { # 1-byte fixed-width
350	# convert to specified character set
351	foreach (@$uni) {
352	next if $_ < 0x80;
353	$$inv{$_} and $_ = $$inv{$_}, next;
354	# our tables omit 1-byte characters with the same values as Unicode,
355	# so pass them straight through after making sure there isn't a
356	# different character with this byte value
357	next if $_ < 0x100 and not $$conv{$_};
358	$_ = ord('?'); # set invalid characters to '?'
359	if ($et and not $$et{EncodingError}) {
360	$et->Warn("Some character(s) could not be encoded in $charset");
361	$$et{EncodingError} = 1;
362	}
363	}
364	# repack as an 8-bit string and truncate at null
365	$outVal = pack('C*', @$uni);
366	$outVal =~ s/\0.*//s;
367	} else { # 2-byte and 4-byte fixed-width
368	# convert if required
369	if ($inv) {
370	$$inv{$_} and $_ = $$inv{$_} foreach @$uni;
371	}
372	# generate surrogate pairs of UTF-16
373	if ($charset eq 'UTF16') {
374	my $i;
375	for ($i=0; $i<@$uni; ++$i) {
376	next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
377	my $t = $$uni[$i] - 0x10000;
378	my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
379	my $w2 = 0xdc00 + ($t & 0x3ff);
380	splice(@$uni, $i, 1, $w1, $w2);
381	++$i; # skip surrogate pair
382	}
383	}
384	# pack as 2- or 4-byte integer in specified byte order
385	my $byteOrder = $_[3] \|\| GetByteOrder();
386	my $fmt = $byteOrder eq 'MM' ? 'n' : 'v';
387	$fmt = uc($fmt) if $csType & 0x400;
388	$outVal = pack($fmt, @$uni);
389	}
390	return $outVal;
391	}
392
393	1; # end
394
395	__END__
396
397	=head1 NAME
398
399	Image::ExifTool::Charset - ExifTool character encoding routines
400
401	=head1 SYNOPSIS
402
403	This module is required by Image::ExifTool.
404
405	=head1 DESCRIPTION
406
407	This module contains routines used by ExifTool to translate special
408	character sets. Currently, the following character sets are supported:
409
410	UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
411	Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic,
412	MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew,
413	MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman,
414	MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai,
415	Turkish, Vietnam
416
417	However, only some of these character sets are available to the user via
418	ExifTool options -- the multi-byte character sets are used only internally
419	when decoding certain types of information.
420
421	=head1 AUTHOR
422
423	Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
424
425	This library is free software; you can redistribute it and/or modify it
426	under the same terms as Perl itself.
427
428	=head1 SEE ALSO
429
430	L<Image::ExifTool(3pm)\|Image::ExifTool>
431
432	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Charset.pm@ 34921

Download in other formats: