source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/PLIST.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 17.1 KB
Line 
1#------------------------------------------------------------------------------
2# File: PLIST.pm
3#
4# Description: Read Apple PLIST information
5#
6# Revisions: 2013-02-01 - P. Harvey Created
7#
8# References: 1) http://www.apple.com/DTDs/PropertyList-1.0.dtd
9# 2) http://opensource.apple.com/source/CF/CF-550/CFBinaryPList.c
10#
11# Notes: - Sony MODD files also use XML PLIST format, but with a few quirks
12#
13# - Decodes both the binary and XML-based PLIST formats
14#------------------------------------------------------------------------------
15
16package Image::ExifTool::PLIST;
17
18use strict;
19use vars qw($VERSION);
20use Image::ExifTool qw(:DataAccess :Utils);
21use Image::ExifTool::XMP;
22use Image::ExifTool::GPS;
23
24$VERSION = '1.09';
25
26sub ExtractObject($$;$);
27sub Get24u($$);
28
29# access routines to read various-sized integer/real values (add 0x100 to size for reals)
30my %readProc = (
31 1 => \&Get8u,
32 2 => \&Get16u,
33 3 => \&Get24u,
34 4 => \&Get32u,
35 8 => \&Get64u,
36 0x104 => \&GetFloat,
37 0x108 => \&GetDouble,
38);
39
40# recognize different types of PLIST files based on certain tags
41my %plistType = (
42 adjustmentBaseVersion => 'AAE',
43);
44
45# PLIST tags (generated on-the-fly for most tags)
46%Image::ExifTool::PLIST::Main = (
47 PROCESS_PROC => \&ProcessPLIST,
48 GROUPS => { 0 => 'PLIST', 1 => 'XML', 2 => 'Document' },
49 VARS => { LONG_TAGS => 4 },
50 NOTES => q{
51 Apple Property List tags. ExifTool reads both XML and binary-format PLIST
52 files, and will extract any existing tags even if they aren't listed below.
53 These tags belong to the family 0 "PLIST" group, but family 1 group may be
54 either "XML" or "PLIST" depending on whether the format is XML or binary.
55 },
56#
57# tags found in PLIST information of QuickTime iTunesInfo iTunMOVI atom (ref PH)
58#
59 'cast//name' => { Name => 'Cast', List => 1 },
60 'directors//name' => { Name => 'Directors', List => 1 },
61 'producers//name' => { Name => 'Producers', List => 1 },
62 'screenwriters//name' => { Name => 'Screenwriters', List => 1 },
63 'codirectors//name' => { Name => 'Codirectors', List => 1 }, # (NC)
64 'studio//name' => { Name => 'Studio', List => 1 }, # (NC)
65#
66# tags found in MODD files (ref PH)
67#
68 'MetaDataList//DateTimeOriginal' => {
69 Name => 'DateTimeOriginal',
70 Description => 'Date/Time Original',
71 Groups => { 2 => 'Time' },
72 # Sony uses a "real" here -- number of days since Dec 31, 1899
73 ValueConv => 'IsFloat($val) ? ConvertUnixTime(($val - 25569) * 24 * 3600) : $val',
74 PrintConv => '$self->ConvertDateTime($val)',
75 },
76 'MetaDataList//Duration' => {
77 Name => 'Duration',
78 Groups => { 2 => 'Video' },
79 PrintConv => 'ConvertDuration($val)',
80 },
81 'MetaDataList//Geolocation/Latitude' => {
82 Name => 'GPSLatitude',
83 Groups => { 2 => 'Location' },
84 PrintConv => 'Image::ExifTool::GPS::ToDMS($self, $val, 1, "N")',
85 },
86 'MetaDataList//Geolocation/Longitude' => {
87 Name => 'GPSLongitude',
88 Groups => { 2 => 'Location' },
89 PrintConv => 'Image::ExifTool::GPS::ToDMS($self, $val, 1, "E")',
90 },
91 'MetaDataList//Geolocation/MapDatum' => {
92 Name => 'GPSMapDatum',
93 Groups => { 2 => 'Location' },
94 },
95 XMLFileType => {
96 # recognize MODD files by their content
97 RawConv => q{
98 if ($val eq 'ModdXML' and $$self{FILE_TYPE} eq 'XMP') {
99 $self->OverrideFileType('MODD');
100 }
101 return $val;
102 },
103 },
104);
105
106#------------------------------------------------------------------------------
107# We found a PLIST XML property name/value
108# Inputs: 0) ExifTool object ref, 1) tag table ref
109# 2) reference to array of XML property names (last is current property)
110# 3) property value, 4) attribute hash ref (not used here)
111# Returns: 1 if valid tag was found
112sub FoundTag($$$$;$)
113{
114 my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
115 return 0 unless @$props;
116 my $verbose = $et->Options('Verbose');
117 my $keys = $$et{PListKeys} || ( $$et{PListKeys} = [] );
118
119 my $prop = $$props[-1];
120 if ($verbose > 1) {
121 $et->VPrint(0, $$et{INDENT}, '[', join('/',@$props), ' = ',
122 $et->Printable($val), "]\n");
123 }
124 # un-escape XML character entities
125 $val = Image::ExifTool::XMP::UnescapeXML($val);
126
127 # handle the various PLIST properties
128 if ($prop eq 'data') {
129 if ($val =~ /^[0-9a-f]+$/ and not length($val) & 0x01) {
130 # MODD files use ASCII-hex encoded "data"...
131 my $buff = pack('H*', $val);
132 $val = \$buff;
133 } else {
134 # ...but the PLIST DTD specifies Base64 encoding
135 $val = Image::ExifTool::XMP::DecodeBase64($val);
136 }
137 } elsif ($prop eq 'date') {
138 $val = Image::ExifTool::XMP::ConvertXMPDate($val);
139 } elsif ($prop eq 'true' or $prop eq 'false') {
140 $val = ucfirst $prop;
141 } else {
142 # convert from UTF8 to ExifTool Charset
143 $val = $et->Decode($val, 'UTF8');
144 if ($prop eq 'key') {
145 if (@$props <= 3) { # top-level key should be plist/dict/key
146 @$keys = ( $val );
147 } else {
148 # save key names to be used in tag name
149 push @$keys, '' while @$keys < @$props - 3;
150 pop @$keys while @$keys > @$props - 2;
151 $$keys[@$props - 3] = $val;
152 }
153 return 0;
154 }
155 }
156
157 return 0 unless @$keys; # can't store value if no associated key
158
159 my $tag = join '/', @$keys; # generate tag ID from 'key' values
160 my $tagInfo = $$tagTablePtr{$tag};
161 unless ($tagInfo) {
162 $et->VPrint(0, $$et{INDENT}, "[adding $tag]\n") if $verbose;
163 # generate tag name from ID
164 my $name = $tag;
165 $name =~ s{^MetaDataList//}{}; # shorten long MODD metadata tag names
166 $name =~ s{//name$}{}; # remove unnecessary MODD "name" property
167 $name =~ s/([^A-Za-z])([a-z])/$1\u$2/g; # capitalize words
168 $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
169 $tagInfo = { Name => ucfirst($name), List => 1 };
170 if ($prop eq 'date') {
171 $$tagInfo{Groups}{2} = 'Time';
172 $$tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
173 }
174 AddTagToTable($tagTablePtr, $tag, $tagInfo);
175 }
176 # allow list-behaviour only for consecutive tags with the same ID
177 if ($$et{LastPListTag} and $$et{LastPListTag} ne $tagInfo) {
178 delete $$et{LIST_TAGS}{$$et{LastPListTag}};
179 }
180 $$et{LastPListTag} = $tagInfo;
181 # override file type if applicable
182 $et->OverrideFileType($plistType{$tag}) if $plistType{$tag} and $$et{FILE_TYPE} eq 'XMP';
183 # save the tag
184 $et->HandleTag($tagTablePtr, $tag, $val);
185
186 return 1;
187}
188
189#------------------------------------------------------------------------------
190# Get big-endian 24-bit integer
191# Inputs: 0) data ref, 1) offset
192# Returns: integer value
193sub Get24u($$)
194{
195 my ($dataPt, $off) = @_;
196 return unpack 'N', "\0" . substr($$dataPt, $off, 3);
197}
198
199#------------------------------------------------------------------------------
200# Extract object from binary PLIST file at the current file position (ref 2)
201# Inputs: 0) ExifTool ref, 1) PLIST info ref, 2) parent tag ID (undef for top)
202# Returns: the object, or undef on error
203sub ExtractObject($$;$)
204{
205 my ($et, $plistInfo, $parent) = @_;
206 my $raf = $$plistInfo{RAF};
207 my ($buff, $val);
208
209 $raf->Read($buff, 1) == 1 or return undef;
210 my $type = ord($buff) >> 4;
211 my $size = ord($buff) & 0x0f;
212 if ($type == 0) { # null/bool/fill
213 $val = { 0x00=>'<null>', 0x08=>'True', 0x09=>'False', 0x0f=>'<fill>' }->{$size};
214 } elsif ($type == 1 or $type == 2 or $type == 3) { # int, float or date
215 $size = 1 << $size;
216 my $proc = ($type == 1 ? $readProc{$size} : $readProc{$size + 0x100}) or return undef;
217 $val = &$proc(\$buff, 0) if $raf->Read($buff, $size) == $size;
218 if ($type == 3 and defined $val) { # date
219 # dates are referenced to Jan 1, 2001 (11323 days from Unix time zero)
220 $val = Image::ExifTool::ConvertUnixTime($val + 11323 * 24 * 3600, 1);
221 $$plistInfo{DateFormat} = 1;
222 }
223 } elsif ($type == 8) { # UID
224 ++$size;
225 $raf->Read($buff, $size) == $size or return undef;
226 my $proc = $readProc{$size};
227 if ($proc) {
228 $val = &$proc(\$buff, 0);
229 } elsif ($size == 16) {
230 require Image::ExifTool::ASF;
231 $val = Image::ExifTool::ASF::GetGUID($buff);
232 } else {
233 $val = "0x" . unpack 'H*', $buff;
234 }
235 } else {
236 # $size is the size of the remaining types
237 if ($size == 0x0f) {
238 # size is stored in extra integer object
239 $size = ExtractObject($et, $plistInfo);
240 return undef unless defined $size and $size =~ /^\d+$/;
241 }
242 if ($type == 4) { # data
243 if ($size < 1000000 or $et->Options('Binary')) {
244 $raf->Read($buff, $size) == $size or return undef;
245 } else {
246 $buff = "Binary data $size bytes";
247 }
248 $val = \$buff; # (return reference for binary data)
249 } elsif ($type == 5) { # ASCII string
250 $raf->Read($val, $size) == $size or return undef;
251 } elsif ($type == 6) { # UCS-2BE string
252 $size *= 2;
253 $raf->Read($buff, $size) == $size or return undef;
254 $val = $et->Decode($buff, 'UCS2');
255 } elsif ($type == 10 or $type == 12 or $type == 13) { # array, set or dict
256 # the remaining types store a list of references
257 my $refSize = $$plistInfo{RefSize};
258 my $refProc = $$plistInfo{RefProc};
259 my $num = $type == 13 ? $size * 2 : $size;
260 my $len = $num * $refSize;
261 $raf->Read($buff, $len) == $len or return undef;
262 my $table = $$plistInfo{Table};
263 my ($i, $ref, @refs, @array);
264 for ($i=0; $i<$num; ++$i) {
265 my $ref = &$refProc(\$buff, $i * $refSize);
266 return 0 if $ref >= @$table;
267 push @refs, $ref;
268 }
269 if ($type == 13) { # dict
270 # prevent infinite recursion
271 if (defined $parent and length $parent > 1000) {
272 $et->WarnOnce('Possible deep recursion while parsing PLIST');
273 return undef;
274 }
275 my $tagTablePtr = $$plistInfo{TagTablePtr};
276 my $verbose = $et->Options('Verbose');
277 for ($i=0; $i<$size; ++$i) {
278 # get the entry key
279 $raf->Seek($$table[$refs[$i]], 0) or return undef;
280 my $key = ExtractObject($et, $plistInfo);
281 next unless defined $key and length $key; # silently ignore bad dict entries
282 # get the entry value
283 $raf->Seek($$table[$refs[$i+$size]], 0) or return undef;
284 # generate an ID for this tag
285 my $tag = defined $parent ? "$parent/$key" : $key;
286 undef $$plistInfo{DateFormat};
287 my $val = ExtractObject($et, $plistInfo, $tag);
288 next if not defined $val or ref($val) eq 'HASH';
289 my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
290 unless ($tagInfo) {
291 $et->VPrint(0, $$et{INDENT}, "[adding $tag]\n") if $verbose;
292 my $name = $tag;
293 $name =~ s/([^A-Za-z])([a-z])/$1\u$2/g; # capitalize words
294 $name =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
295 $tagInfo = { Name => ucfirst($name), List => 1 };
296 if ($$plistInfo{DateFormat}) {
297 $$tagInfo{Groups}{2} = 'Time';
298 $$tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
299 }
300 AddTagToTable($tagTablePtr, $tag, $tagInfo);
301 }
302 # allow list-behaviour only for consecutive tags with the same ID
303 if ($$et{LastPListTag} and $$et{LastPListTag} ne $tagInfo) {
304 delete $$et{LIST_TAGS}{$$et{LastPListTag}};
305 }
306 $$et{LastPListTag} = $tagInfo;
307 $et->HandleTag($tagTablePtr, $tag, $val);
308 }
309 $val = { }; # flag the value as a dictionary (ie. tags already saved)
310 } else {
311 # extract the referenced objects
312 foreach $ref (@refs) {
313 $raf->Seek($$table[$ref], 0) or return undef; # seek to this object
314 $val = ExtractObject($et, $plistInfo, $parent);
315 next unless defined $val and ref $val ne 'HASH';
316 push @array, $val;
317 }
318 $val = \@array;
319 }
320 }
321 }
322 return $val;
323}
324
325#------------------------------------------------------------------------------
326# Process binary PLIST data (ref 2)
327# Inputs: 0) ExifTool object ref, 1) DirInfo ref, 2) tag table ref
328# Returns: 1 on success (and returns plist value as $$dirInfo{Value})
329sub ProcessBinaryPLIST($$$)
330{
331 my ($et, $dirInfo, $tagTablePtr) = @_;
332 my ($i, $buff, @table);
333 my $dataPt = $$dirInfo{DataPt};
334
335 $et->VerboseDir('Binary PLIST');
336 SetByteOrder('MM');
337
338 if ($dataPt) {
339 my $start = $$dirInfo{DirStart};
340 if ($start or ($$dirInfo{DirLen} and $$dirInfo{DirLen} != length $$dataPt)) {
341 my $buf2 = substr($$dataPt, $start || 0, $$dirInfo{DirLen});
342 $$dirInfo{RAF} = new File::RandomAccess(\$buf2);
343 } else {
344 $$dirInfo{RAF} = new File::RandomAccess($dataPt);
345 }
346 my $strt = $$dirInfo{DirStart} || 0;
347 }
348 # read and parse the trailer
349 my $raf = $$dirInfo{RAF} or return 0;
350 $raf->Seek(-32,2) and $raf->Read($buff,32)==32 or return 0;
351 my $intSize = Get8u(\$buff, 6);
352 my $refSize = Get8u(\$buff, 7);
353 my $numObj = Get64u(\$buff, 8);
354 my $topObj = Get64u(\$buff, 16);
355 my $tableOff = Get64u(\$buff, 24);
356
357 return 0 if $topObj >= $numObj;
358 my $intProc = $readProc{$intSize} or return 0;
359 my $refProc = $readProc{$refSize} or return 0;
360
361 # read and parse the offset table
362 my $tableSize = $intSize * $numObj;
363 $raf->Seek($tableOff, 0) and $raf->Read($buff, $tableSize) == $tableSize or return 0;
364 for ($i=0; $i<$numObj; ++$i) {
365 push @table, &$intProc(\$buff, $i * $intSize);
366 }
367 my %plistInfo = (
368 RAF => $raf,
369 RefSize => $refSize,
370 RefProc => $refProc,
371 Table => \@table,
372 TagTablePtr => $tagTablePtr,
373 );
374 # position file pointer at the top object, and extract it
375 $raf->Seek($table[$topObj], 0) or return 0;
376 $$dirInfo{Value} = ExtractObject($et, \%plistInfo);
377 return defined $$dirInfo{Value} ? 1 : 0;
378}
379
380#------------------------------------------------------------------------------
381# Extract information from a PLIST file
382# Inputs: 0) ExifTool object ref, 1) dirInfo ref, 2) tag table ref
383# Returns: 1 on success, 0 if this wasn't valid PLIST
384sub ProcessPLIST($$;$)
385{
386 my ($et, $dirInfo, $tagTablePtr) = @_;
387
388 # process XML PLIST data using the XMP module
389 $$dirInfo{XMPParseOpts}{FoundProc} = \&FoundTag;
390 my $result = Image::ExifTool::XMP::ProcessXMP($et, $dirInfo, $tagTablePtr);
391 delete $$dirInfo{XMPParseOpts};
392
393 unless ($result) {
394 my $buff;
395 my $raf = $$dirInfo{RAF} or return 0;
396 $raf->Seek(0,0) and $raf->Read($buff, 64) or return 0;
397 if ($buff =~ /^bplist0/) {
398 # binary PLIST file
399 my $tagTablePtr = GetTagTable('Image::ExifTool::PLIST::Main');
400 $et->SetFileType('PLIST', 'application/x-plist');
401 $$et{SET_GROUP1} = 'PLIST';
402 unless (ProcessBinaryPLIST($et, $dirInfo, $tagTablePtr)) {
403 $et->Error('Error reading binary PLIST file');
404 }
405 delete $$et{SET_GROUP1};
406 $result = 1;
407 } elsif ($$et{FILE_EXT} and $$et{FILE_EXT} eq 'PLIST' and
408 $buff =~ /^\xfe\xff\x00/)
409 {
410 # (have seen very old PLIST files encoded as UCS-2BE with leading BOM)
411 $et->Error('Old PLIST format currently not supported');
412 $result = 1;
413 }
414 }
415 return $result;
416}
417
4181; # end
419
420__END__
421
422=head1 NAME
423
424Image::ExifTool::PLIST - Read Apple PLIST information
425
426=head1 SYNOPSIS
427
428This module is used by Image::ExifTool
429
430=head1 DESCRIPTION
431
432This module contains the routines used by Image::ExifTool to extract
433information from Apple Property List files.
434
435=head1 NOTES
436
437This module decodes both the binary and XML-based PLIST format.
438
439=head1 AUTHOR
440
441Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
442
443This library is free software; you can redistribute it and/or modify it
444under the same terms as Perl itself.
445
446=head1 REFERENCES
447
448=over 4
449
450=item L<http://www.apple.com/DTDs/PropertyList-1.0.dtd>
451
452=item L<http://opensource.apple.com/source/CF/CF-550/CFBinaryPList.c>
453
454=back
455
456=head1 SEE ALSO
457
458L<Image::ExifTool::TagNames/PLIST Tags>,
459L<Image::ExifTool(3pm)|Image::ExifTool>
460
461=cut
462
Note: See TracBrowser for help on using the repository browser.