1 | #------------------------------------------------------------------------------
|
---|
2 | # File: PDF.pm
|
---|
3 | #
|
---|
4 | # Description: Read PDF meta information
|
---|
5 | #
|
---|
6 | # Revisions: 07/11/05 - P. Harvey Created
|
---|
7 | # 07/25/05 - P. Harvey Add support for encrypted documents
|
---|
8 | #
|
---|
9 | # References: 1) http://partners.adobe.com/public/developer/pdf/index_reference.html
|
---|
10 | # 2) http://www.cr0.net:8040/code/crypto/rc4/
|
---|
11 | #------------------------------------------------------------------------------
|
---|
12 |
|
---|
13 | package Image::ExifTool::PDF;
|
---|
14 |
|
---|
15 | use strict;
|
---|
16 | use vars qw($VERSION);
|
---|
17 | use Image::ExifTool qw(:DataAccess :Utils);
|
---|
18 | require Exporter;
|
---|
19 |
|
---|
20 | $VERSION = '1.10';
|
---|
21 |
|
---|
22 | sub LocateObject($$);
|
---|
23 | sub FetchObject($$$$);
|
---|
24 | sub ExtractObject($$;$$);
|
---|
25 | sub ReadToNested($;$);
|
---|
26 | sub ProcessDict($$$$;$);
|
---|
27 |
|
---|
28 | my %warnedOnce; # hash of warnings we issued
|
---|
29 | my %streamObjs; # hash of stream objects
|
---|
30 | my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
|
---|
31 | my $lastFetched; # last fetched object reference (used for decryption)
|
---|
32 | my $cryptInfo; # encryption object reference (plus additional information)
|
---|
33 |
|
---|
34 | # tags in main PDF directories
|
---|
35 | %Image::ExifTool::PDF::Main = (
|
---|
36 | Info => {
|
---|
37 | SubDirectory => {
|
---|
38 | TagTable => 'Image::ExifTool::PDF::Info',
|
---|
39 | },
|
---|
40 | },
|
---|
41 | Root => {
|
---|
42 | SubDirectory => {
|
---|
43 | TagTable => 'Image::ExifTool::PDF::Root',
|
---|
44 | },
|
---|
45 | },
|
---|
46 | );
|
---|
47 |
|
---|
48 | # tags in PDF Info directory
|
---|
49 | %Image::ExifTool::PDF::Info = (
|
---|
50 | GROUPS => { 2 => 'Image' },
|
---|
51 | EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
|
---|
52 | NOTES => q{
|
---|
53 | As well as the tags listed below, the PDF specification allows for
|
---|
54 | user-defined tags to exist in the Info dictionary. These tags, which should
|
---|
55 | have corresponding XMP-pdfx entries in the PDF Metadata, are also extracted
|
---|
56 | by ExifTool.
|
---|
57 | },
|
---|
58 | Title => { },
|
---|
59 | Author => { Groups => { 2 => 'Author' } },
|
---|
60 | Subject => { },
|
---|
61 | Keywords => { List => 1 }, # this is a list of tokens
|
---|
62 | Creator => { },
|
---|
63 | Producer => { },
|
---|
64 | CreationDate => {
|
---|
65 | Name => 'CreateDate',
|
---|
66 | Groups => { 2 => 'Time' },
|
---|
67 | ValueConv => 'Image::ExifTool::PDF::ConvertPDFDate($self, $val)',
|
---|
68 | },
|
---|
69 | ModDate => {
|
---|
70 | Name => 'ModifyDate',
|
---|
71 | Groups => { 2 => 'Time' },
|
---|
72 | ValueConv => 'Image::ExifTool::PDF::ConvertPDFDate($self, $val)',
|
---|
73 | },
|
---|
74 | Trapped => {
|
---|
75 | # remove leading '/' from '/True' or '/False'
|
---|
76 | ValueConv => '$val=~s{^/}{}; $val',
|
---|
77 | },
|
---|
78 | );
|
---|
79 |
|
---|
80 | # tags in the PDF Root document catalog
|
---|
81 | %Image::ExifTool::PDF::Root = (
|
---|
82 | NOTES => 'This is the PDF document catalog.',
|
---|
83 | Metadata => {
|
---|
84 | SubDirectory => {
|
---|
85 | TagTable => 'Image::ExifTool::PDF::Metadata',
|
---|
86 | },
|
---|
87 | },
|
---|
88 | Pages => {
|
---|
89 | SubDirectory => {
|
---|
90 | TagTable => 'Image::ExifTool::PDF::Pages',
|
---|
91 | },
|
---|
92 | },
|
---|
93 | );
|
---|
94 |
|
---|
95 | # tags in PDF Pages directory
|
---|
96 | %Image::ExifTool::PDF::Pages = (
|
---|
97 | GROUPS => { 2 => 'Image' },
|
---|
98 | Count => 'PageCount',
|
---|
99 | Kids => {
|
---|
100 | SubDirectory => {
|
---|
101 | TagTable => 'Image::ExifTool::PDF::Kids',
|
---|
102 | },
|
---|
103 | },
|
---|
104 | );
|
---|
105 |
|
---|
106 | # tags in PDF Kids directory
|
---|
107 | %Image::ExifTool::PDF::Kids = (
|
---|
108 | Metadata => {
|
---|
109 | SubDirectory => {
|
---|
110 | TagTable => 'Image::ExifTool::PDF::Metadata',
|
---|
111 | },
|
---|
112 | },
|
---|
113 | PieceInfo => {
|
---|
114 | SubDirectory => {
|
---|
115 | TagTable => 'Image::ExifTool::PDF::PieceInfo',
|
---|
116 | },
|
---|
117 | },
|
---|
118 | Resources => {
|
---|
119 | SubDirectory => {
|
---|
120 | TagTable => 'Image::ExifTool::PDF::Resources',
|
---|
121 | },
|
---|
122 | },
|
---|
123 | );
|
---|
124 |
|
---|
125 | # tags in PDF Resources directory
|
---|
126 | %Image::ExifTool::PDF::Resources = (
|
---|
127 | ColorSpace => {
|
---|
128 | SubDirectory => {
|
---|
129 | TagTable => 'Image::ExifTool::PDF::ColorSpace',
|
---|
130 | },
|
---|
131 | },
|
---|
132 | );
|
---|
133 |
|
---|
134 | # tags in PDF ColorSpace directory
|
---|
135 | %Image::ExifTool::PDF::ColorSpace = (
|
---|
136 | DefaultRGB => {
|
---|
137 | SubDirectory => {
|
---|
138 | TagTable => 'Image::ExifTool::PDF::DefaultRGB',
|
---|
139 | },
|
---|
140 | },
|
---|
141 | );
|
---|
142 |
|
---|
143 | # tags in PDF DefaultRGB directory
|
---|
144 | %Image::ExifTool::PDF::DefaultRGB = (
|
---|
145 | ICCBased => {
|
---|
146 | SubDirectory => {
|
---|
147 | TagTable => 'Image::ExifTool::PDF::ICCBased',
|
---|
148 | },
|
---|
149 | },
|
---|
150 | );
|
---|
151 |
|
---|
152 | # tags in PDF ICCBased directory
|
---|
153 | %Image::ExifTool::PDF::ICCBased = (
|
---|
154 | stream => {
|
---|
155 | SubDirectory => {
|
---|
156 | TagTable => 'Image::ExifTool::ICC_Profile::Main',
|
---|
157 | },
|
---|
158 | },
|
---|
159 | );
|
---|
160 |
|
---|
161 | # tags in PDF PieceInfo directory
|
---|
162 | %Image::ExifTool::PDF::PieceInfo = (
|
---|
163 | AdobePhotoshop => {
|
---|
164 | SubDirectory => {
|
---|
165 | TagTable => 'Image::ExifTool::PDF::AdobePhotoshop',
|
---|
166 | },
|
---|
167 | },
|
---|
168 | );
|
---|
169 |
|
---|
170 | # tags in PDF AdobePhotoshop directory
|
---|
171 | %Image::ExifTool::PDF::AdobePhotoshop = (
|
---|
172 | Private => {
|
---|
173 | SubDirectory => {
|
---|
174 | TagTable => 'Image::ExifTool::PDF::Private',
|
---|
175 | },
|
---|
176 | },
|
---|
177 | );
|
---|
178 |
|
---|
179 | # tags in PDF Private directory
|
---|
180 | %Image::ExifTool::PDF::Private = (
|
---|
181 | ImageResources => {
|
---|
182 | SubDirectory => {
|
---|
183 | TagTable => 'Image::ExifTool::PDF::ImageResources',
|
---|
184 | },
|
---|
185 | },
|
---|
186 | );
|
---|
187 |
|
---|
188 | # tags in PDF ImageResources directory
|
---|
189 | %Image::ExifTool::PDF::ImageResources = (
|
---|
190 | stream => {
|
---|
191 | SubDirectory => {
|
---|
192 | TagTable => 'Image::ExifTool::Photoshop::Main',
|
---|
193 | },
|
---|
194 | },
|
---|
195 | );
|
---|
196 |
|
---|
197 | # tags in PDF Metadata directory
|
---|
198 | %Image::ExifTool::PDF::Metadata = (
|
---|
199 | GROUPS => { 2 => 'Image' },
|
---|
200 | XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
|
---|
201 | Name => 'XMP',
|
---|
202 | SubDirectory => {
|
---|
203 | TagTable => 'Image::ExifTool::XMP::Main',
|
---|
204 | },
|
---|
205 | },
|
---|
206 | );
|
---|
207 |
|
---|
208 | # unknown tags for use in verbose option
|
---|
209 | %Image::ExifTool::PDF::Unknown = (
|
---|
210 | GROUPS => { 2 => 'Unknown' },
|
---|
211 | );
|
---|
212 |
|
---|
213 | #------------------------------------------------------------------------------
|
---|
214 | # Issue one warning of each type
|
---|
215 | # Inputs: 0) ExifTool object reference, 1) warning
|
---|
216 | sub WarnOnce($$)
|
---|
217 | {
|
---|
218 | my ($exifTool, $warn) = @_;
|
---|
219 | unless ($warnedOnce{$warn}) {
|
---|
220 | $warnedOnce{$warn} = 1;
|
---|
221 | $exifTool->Warn($warn);
|
---|
222 | }
|
---|
223 | }
|
---|
224 |
|
---|
225 | #------------------------------------------------------------------------------
|
---|
226 | # Set PDF format error warning
|
---|
227 | # Inputs: 0) ExifTool object reference, 1) error string
|
---|
228 | # Returns: 1
|
---|
229 | sub PDFErr($$)
|
---|
230 | {
|
---|
231 | my ($exifTool, $str) = @_;
|
---|
232 | $exifTool->Warn("PDF format error ($str)");
|
---|
233 | return 1;
|
---|
234 | }
|
---|
235 |
|
---|
236 | #------------------------------------------------------------------------------
|
---|
237 | # Convert from PDF to EXIF-style date/time
|
---|
238 | # Inputs: 0) ExifTool object reference,
|
---|
239 | # 1) PDF date/time string (D:yyyymmddhhmmss+hh'mm')
|
---|
240 | # Returns: EXIF date string (yyyy:mm:dd hh:mm:ss+hh:mm)
|
---|
241 | sub ConvertPDFDate($$)
|
---|
242 | {
|
---|
243 | my ($exifTool, $date) = @_;
|
---|
244 | # remove optional 'D:' prefix
|
---|
245 | $date =~ s/^D://;
|
---|
246 | # fill in default values if necessary
|
---|
247 | # yyyymmddhhmmss
|
---|
248 | my $default = '00000101000000';
|
---|
249 | if (length $date < length $default) {
|
---|
250 | $date .= substr($default, length $date);
|
---|
251 | }
|
---|
252 | $date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
|
---|
253 | $date = "$1:$2:$3 $4:$5:$6";
|
---|
254 | if ($7) {
|
---|
255 | my @t = split /'/, $7;
|
---|
256 | $date .= $t[0];
|
---|
257 | $date .= ':' . ($t[1] || 0) if $t[0] ne 'Z';
|
---|
258 | }
|
---|
259 | return $exifTool->ConvertDateTime($date);
|
---|
260 | }
|
---|
261 |
|
---|
262 | #------------------------------------------------------------------------------
|
---|
263 | # Locate an object in the XRref tables
|
---|
264 | # Inputs: 0) XRef reference, 1) object reference string
|
---|
265 | # Returns: offset to object in file, or undefined on error
|
---|
266 | sub LocateObject($$)
|
---|
267 | {
|
---|
268 | my ($xref, $ref) = @_;
|
---|
269 | return undef unless $xref;
|
---|
270 | return $$xref{$ref} if $$xref{$ref};
|
---|
271 | # get the object number
|
---|
272 | return undef unless $ref =~ /^(\d+)/;
|
---|
273 | my $objNum = $1;
|
---|
274 | #
|
---|
275 | # scan our XRef stream dictionaries for this object
|
---|
276 | #
|
---|
277 | return undef unless $$xref{dicts};
|
---|
278 | my $dict;
|
---|
279 | foreach $dict (@{$$xref{dicts}}) {
|
---|
280 | # quick check to see if the object is in the range for this xref stream
|
---|
281 | next if $objNum >= $$dict{Size};
|
---|
282 | my $index = $$dict{Index};
|
---|
283 | next if $objNum < $$index[0];
|
---|
284 | # scan the tables for the specified object
|
---|
285 | my $size = $$dict{entry_size};
|
---|
286 | my $num = scalar(@$index) / 2;
|
---|
287 | my $tot = 0;
|
---|
288 | my $i;
|
---|
289 | for ($i=0; $i<$num; ++$i) {
|
---|
290 | my $start = $$index[$i*2];
|
---|
291 | my $count = $$index[$i*2+1];
|
---|
292 | # table is in ascending order, so quit if we have passed the object
|
---|
293 | last if $objNum >= $start + $count;
|
---|
294 | if ($objNum >= $start) {
|
---|
295 | my $offset = $size * ($objNum - $start + $tot);
|
---|
296 | last if $offset + $size > length $$dict{stream};
|
---|
297 | my @c = unpack("x$offset C$size", $$dict{stream});
|
---|
298 | # extract values from this table entry
|
---|
299 | # (can be 1, 2, 3, 4, etc.. bytes per value)
|
---|
300 | my (@t, $j, $k, $ref2);
|
---|
301 | my $w = $$dict{W};
|
---|
302 | for ($j=0; $j<3; ++$j) {
|
---|
303 | # use default value if W entry is 0 (as per spec)
|
---|
304 | $$w[$j] or $t[$j] = ($j ? 1 : 0), next;
|
---|
305 | $t[$j] = shift(@c);
|
---|
306 | for ($k=1; $k < $$w[$j]; ++$k) {
|
---|
307 | $t[$j] = 256 * $t[$j] + shift(@c);
|
---|
308 | }
|
---|
309 | }
|
---|
310 | if ($t[0] == 1) {
|
---|
311 | # normal object reference: use "o g R" as hash ref
|
---|
312 | # (o = object number, g = generation number)
|
---|
313 | $ref2 = "$objNum $t[2] R";
|
---|
314 | # xref is offset of object from start
|
---|
315 | $$xref{$ref2} = $t[1];
|
---|
316 | } elsif ($t[0] == 2) {
|
---|
317 | # compressed object reference:
|
---|
318 | $ref2 = "$objNum 0 R";
|
---|
319 | # xref is object index and stream object reference
|
---|
320 | $$xref{$ref2} = "I$t[2] $t[1] 0 R";
|
---|
321 | } else {
|
---|
322 | last;
|
---|
323 | }
|
---|
324 | return $$xref{$ref} if $ref eq $ref2;
|
---|
325 | }
|
---|
326 | $tot += $count;
|
---|
327 | }
|
---|
328 | }
|
---|
329 | return undef;
|
---|
330 | }
|
---|
331 |
|
---|
332 | #------------------------------------------------------------------------------
|
---|
333 | # Fetch indirect object from file (from inside a stream if required)
|
---|
334 | # Inputs: 0) ExifTool object reference, 1) object reference string, 2) xref lookup,
|
---|
335 | # Returns: object data or undefined on error
|
---|
336 | sub FetchObject($$$$)
|
---|
337 | {
|
---|
338 | my ($exifTool, $ref, $xref, $tag) = @_;
|
---|
339 | $lastFetched = $ref; # save this for decoding if necessary
|
---|
340 | my $offset = LocateObject($xref, $ref);
|
---|
341 | unless ($offset) {
|
---|
342 | $exifTool->Warn("Bad $tag reference");
|
---|
343 | return undef;
|
---|
344 | }
|
---|
345 | my ($data, $obj);
|
---|
346 | if ($offset =~ s/^I(\d+) //) {
|
---|
347 | my $index = $1; # object index in stream
|
---|
348 | my ($objNum) = split ' ', $ref; # save original object number
|
---|
349 | $ref = $offset; # now a reference to the containing stream object
|
---|
350 | my $obj = $streamObjs{$ref};
|
---|
351 | unless ($obj) {
|
---|
352 | # don't try to load the same object stream twice
|
---|
353 | return undef if defined $obj;
|
---|
354 | $streamObjs{$ref} = '';
|
---|
355 | # load the parent object stream
|
---|
356 | $obj = FetchObject($exifTool, $ref, $xref, $tag);
|
---|
357 | # make sure it contains everything we need
|
---|
358 | return undef unless defined $obj and ref($obj) eq 'HASH';
|
---|
359 | return undef unless $$obj{First} and $$obj{N};
|
---|
360 | return undef unless DecodeStream($exifTool, $obj);
|
---|
361 | # add a special 'table' entry to this dictionary which contains
|
---|
362 | # the list of object number/offset pairs from the stream header
|
---|
363 | my $num = $$obj{N} * 2;
|
---|
364 | my @table = split ' ', $$obj{stream}, $num;
|
---|
365 | return undef unless @table == $num;
|
---|
366 | # remove everything before first object in stream
|
---|
367 | $$obj{stream} = substr($$obj{stream}, $$obj{First});
|
---|
368 | $table[$num-1] =~ s/^(\d+).*/$1/; # trim excess from last number
|
---|
369 | $$obj{table} = \@table;
|
---|
370 | # save the object stream so we don't have to re-load it later
|
---|
371 | $streamObjs{$ref} = $obj;
|
---|
372 | }
|
---|
373 | # verify that we have the specified object
|
---|
374 | my $i = 2 * $index;
|
---|
375 | my $table = $$obj{table};
|
---|
376 | unless ($index < $$obj{N} and $$table[$i] == $objNum) {
|
---|
377 | $exifTool->Warn("Bad index for stream object $tag");
|
---|
378 | return undef;
|
---|
379 | }
|
---|
380 | # extract the object at the specified index in the stream
|
---|
381 | # (offsets in table are in sequential order, so we can subract from
|
---|
382 | # the next offset to get the object length)
|
---|
383 | $offset = $$table[$i + 1];
|
---|
384 | my $len = ($$table[$i + 3] || length($$obj{stream})) - $offset;
|
---|
385 | $data = substr($$obj{stream}, $offset, $len);
|
---|
386 | return ExtractObject($exifTool, \$data);
|
---|
387 | }
|
---|
388 | my $raf = $exifTool->{RAF};
|
---|
389 | $raf->Seek($offset, 0) or $exifTool->Warn("Bad $tag offset"), return undef;
|
---|
390 | # verify that we are reading the expected object
|
---|
391 | $raf->ReadLine($data) or $exifTool->Warn("Error reading $tag data"), return undef;
|
---|
392 | ($obj = $ref) =~ s/R/obj/;
|
---|
393 | unless ($data =~ s/^$obj//) {
|
---|
394 | $exifTool->Warn("$tag object ($obj) not found at $offset");
|
---|
395 | return undef;
|
---|
396 | }
|
---|
397 | return ExtractObject($exifTool, \$data, $raf, $xref);
|
---|
398 | }
|
---|
399 |
|
---|
400 | #------------------------------------------------------------------------------
|
---|
401 | # Extract PDF object from combination of buffered data and file
|
---|
402 | # Inputs: 0) ExifTool object reference, 1) data reference,
|
---|
403 | # 2) optional raf reference, 3) optional xref table
|
---|
404 | # Returns: converted PDF object or undef on error
|
---|
405 | # a) dictionary object --> hash reference
|
---|
406 | # b) array object --> array reference
|
---|
407 | # c) indirect reference --> scalar reference
|
---|
408 | # d) string, name, integer, boolean, null --> scalar value
|
---|
409 | # - updates $$dataPt on return to contain unused data
|
---|
410 | # - creates two bogus entries ('stream' and 'tags') in dictionaries to represent
|
---|
411 | # the stream data and a list of the tags (not including 'stream' and 'tags')
|
---|
412 | # in their original order
|
---|
413 | sub ExtractObject($$;$$)
|
---|
414 | {
|
---|
415 | my ($exifTool, $dataPt, $raf, $xref) = @_;
|
---|
416 | my (@tags, $data, $objData);
|
---|
417 | my $dict = { };
|
---|
418 | my $delim;
|
---|
419 |
|
---|
420 | for (;;) {
|
---|
421 | if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/) {
|
---|
422 | $delim = $1;
|
---|
423 | $objData = ReadToNested($dataPt, $raf);
|
---|
424 | return undef unless defined $objData;
|
---|
425 | last;
|
---|
426 | } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}) {
|
---|
427 | #
|
---|
428 | # extract boolean, numerical, string, name, null object or indirect reference
|
---|
429 | #
|
---|
430 | $objData = $1;
|
---|
431 | # look for an indirect reference
|
---|
432 | if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//) {
|
---|
433 | $objData .= "$1 R";
|
---|
434 | $objData = \$objData; # return scalar reference
|
---|
435 | }
|
---|
436 | return $objData; # return simple scalar or scalar reference
|
---|
437 | }
|
---|
438 | $raf and $raf->ReadLine($data) or return undef;
|
---|
439 | $$dataPt .= $data;
|
---|
440 | }
|
---|
441 | #
|
---|
442 | # extract literal string
|
---|
443 | #
|
---|
444 | if ($delim eq '(') {
|
---|
445 | $objData = $1 if $objData =~ /.*?\((.*)\)/s; # remove brackets
|
---|
446 | # decode escape sequences in literal strings
|
---|
447 | while ($objData =~ /\\(.)/sg) {
|
---|
448 | my $n = pos($objData) - 2;
|
---|
449 | my $c = $1;
|
---|
450 | my $r;
|
---|
451 | if ($c =~ /[0-7]/) {
|
---|
452 | # get up to 2 more octal digits
|
---|
453 | $c .= $1 if $objData =~ /\G([0-7]{1,2})/g;
|
---|
454 | # convert octal escape code
|
---|
455 | $r = chr(oct($c) & 0xff);
|
---|
456 | } elsif ($c eq "\x0d") {
|
---|
457 | # the string is continued if the line ends with '\'
|
---|
458 | # (also remove "\x0d\x0a")
|
---|
459 | $c .= $1 if $objData =~ /\G(\x0a)/g;
|
---|
460 | $r = '';
|
---|
461 | } elsif ($c eq "\x0a") {
|
---|
462 | # (also remove "\x0a\x0d")
|
---|
463 | $c .= $1 if $objData =~ /\G(\x0d)/g;
|
---|
464 | $r = '';
|
---|
465 | } else {
|
---|
466 | # convert escaped characters
|
---|
467 | ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
|
---|
468 | }
|
---|
469 | substr($objData, $n, length($c)+1) = $r;
|
---|
470 | # contine search after this character
|
---|
471 | pos($objData) = $n + length($r);
|
---|
472 | }
|
---|
473 | Decrypt(\$objData) if $cryptInfo;
|
---|
474 | # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
|
---|
475 | if ($objData =~ s/^\xfe\xff//) {
|
---|
476 | $objData = $exifTool->Unicode2Charset($objData, 'MM');
|
---|
477 | }
|
---|
478 | return $objData;
|
---|
479 | #
|
---|
480 | # extract hex string
|
---|
481 | #
|
---|
482 | } elsif ($delim eq '<') {
|
---|
483 | # decode hex data
|
---|
484 | $objData =~ tr/0-9A-Fa-f//dc;
|
---|
485 | $objData .= '0' if length($objData) & 0x01; # (by the spec)
|
---|
486 | $objData = pack('H*', $objData);
|
---|
487 | Decrypt(\$objData) if $cryptInfo;
|
---|
488 | return $objData;
|
---|
489 | #
|
---|
490 | # extract array
|
---|
491 | #
|
---|
492 | } elsif ($delim eq '[') {
|
---|
493 | $objData =~ /.*?\[(.*)\]/s or return; # remove brackets
|
---|
494 | my $data = $1;
|
---|
495 | my @list;
|
---|
496 | for (;;) {
|
---|
497 | last unless $data =~ m{\s*(\S[^[(/<>\s]*)}sg;
|
---|
498 | my $val = $1;
|
---|
499 | if ($val =~ /^(<{1,2}|\[|\()/) {
|
---|
500 | my $pos = pos($data) - length($val);
|
---|
501 | # nested dict, array, literal string or hex string
|
---|
502 | my $buff = substr($data, $pos);
|
---|
503 | $val = ReadToNested(\$buff);
|
---|
504 | last unless defined $val;
|
---|
505 | pos($data) = $pos + length($val);
|
---|
506 | $val = ExtractObject($exifTool, \$val);
|
---|
507 | } elsif ($val =~ /^\d/) {
|
---|
508 | my $pos = pos($data);
|
---|
509 | if ($data =~ /\G\s+(\d+)\s+R/g) {
|
---|
510 | $val = \ "$val $1 R"; # make a reference
|
---|
511 | } else {
|
---|
512 | pos($data) = $pos;
|
---|
513 | }
|
---|
514 | }
|
---|
515 | push @list, $val;
|
---|
516 | }
|
---|
517 | return \@list;
|
---|
518 | }
|
---|
519 | #
|
---|
520 | # extract dictionary
|
---|
521 | #
|
---|
522 | # Note: entries are not necessarily separated by whitespace (doh!)
|
---|
523 | # ie) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
|
---|
524 | # Also, they may be separated by a comment (ie. "/Tag%comment\nValue"),
|
---|
525 | # but comments have already been removed
|
---|
526 | while ($objData =~ m{(\s*)/([^/[\]()<>{}\s]+)\s*(\S[^[(/<>\s]*)}sg) {
|
---|
527 | my $tag = $2;
|
---|
528 | my $val = $3;
|
---|
529 | if ($val =~ /^(<{1,2}|\[|\()/) {
|
---|
530 | # nested dict, array, literal string or hex string
|
---|
531 | $objData = substr($objData, pos($objData)-length($val));
|
---|
532 | $val = ReadToNested(\$objData, $raf);
|
---|
533 | last unless defined $val;
|
---|
534 | $val = ExtractObject($exifTool, \$val);
|
---|
535 | pos($objData) = 0;
|
---|
536 | } elsif ($val =~ /^\d/) {
|
---|
537 | my $pos = pos($objData);
|
---|
538 | if ($objData =~ /\G\s+(\d+)\s+R/g) {
|
---|
539 | $val = \ "$val $1 R"; # make a reference
|
---|
540 | } else {
|
---|
541 | pos($objData) = $pos;
|
---|
542 | }
|
---|
543 | }
|
---|
544 | if ($$dict{$tag}) {
|
---|
545 | # duplicate dictionary entries are not allowed
|
---|
546 | $exifTool->Warn("Duplicate $tag entry in dictionary (ignored)");
|
---|
547 | } else {
|
---|
548 | # save the entry
|
---|
549 | push @tags, $tag;
|
---|
550 | $$dict{$tag} = $val;
|
---|
551 | }
|
---|
552 | }
|
---|
553 | return undef unless @tags;
|
---|
554 | $$dict{tags} = \@tags;
|
---|
555 | return $dict unless $raf; # direct objects can not have streams
|
---|
556 | #
|
---|
557 | # extract the stream object
|
---|
558 | #
|
---|
559 | # dictionary must specify stream Length
|
---|
560 | my $length = $$dict{Length} or return $dict;
|
---|
561 | if (ref $length) {
|
---|
562 | $length = $$length;
|
---|
563 | my $oldpos = $raf->Tell();
|
---|
564 | # get the location of the object specifying the length
|
---|
565 | my $offset = LocateObject($xref, $length) or return $dict;
|
---|
566 | $raf->Seek($offset, 0) or $exifTool->Warn("Bad Length offset"), return $dict;
|
---|
567 | # verify that we are reading the expected object
|
---|
568 | $raf->ReadLine($data) or $exifTool->Warn("Error reading Length data"), return $dict;
|
---|
569 | $length =~ s/R/obj/;
|
---|
570 | unless ($data =~ /^$length/) {
|
---|
571 | $exifTool->Warn("Length object ($length) not found at $offset");
|
---|
572 | return $dict;
|
---|
573 | }
|
---|
574 | $raf->ReadLine($data) or $exifTool->Warn("Error reading stream Length"), return $dict;
|
---|
575 | $data =~ /(\d+)/ or $exifTool->Warn("Stream length not found"), return $dict;
|
---|
576 | $length = $1;
|
---|
577 | $raf->Seek($oldpos, 0); # restore position to start of stream
|
---|
578 | }
|
---|
579 | # extract the trailing stream data
|
---|
580 | for (;;) {
|
---|
581 | # find the stream token
|
---|
582 | if ($$dataPt =~ /(\S+)/) {
|
---|
583 | last unless $1 eq 'stream';
|
---|
584 | # read an extra line because it may contain our \x0a
|
---|
585 | $$dataPt .= $data if $raf->ReadLine($data);
|
---|
586 | # remove our stream header
|
---|
587 | $$dataPt =~ s/^.*stream(\x0a|\x0d\x0a)//s;
|
---|
588 | my $more = $length - length($$dataPt);
|
---|
589 | if ($more > 0) {
|
---|
590 | unless ($raf->Read($data, $more) == $more) {
|
---|
591 | $exifTool->Warn("Error reading stream data");
|
---|
592 | $$dataPt = '';
|
---|
593 | return $dict;
|
---|
594 | }
|
---|
595 | $$dict{stream} = $$dataPt . $data;
|
---|
596 | $$dataPt = '';
|
---|
597 | } elsif ($more < 0) {
|
---|
598 | $$dict{stream} = substr($$dataPt, 0, $length);
|
---|
599 | $$dataPt = substr($$dataPt, $length);
|
---|
600 | } else {
|
---|
601 | $$dict{stream} = $$dataPt;
|
---|
602 | $$dataPt = '';
|
---|
603 | }
|
---|
604 | last;
|
---|
605 | }
|
---|
606 | $raf->ReadLine($data) or last;
|
---|
607 | $$dataPt .= $data;
|
---|
608 | }
|
---|
609 | return $dict;
|
---|
610 | }
|
---|
611 |
|
---|
612 | #------------------------------------------------------------------------------
|
---|
613 | # Read to nested delimiter
|
---|
614 | # Inputs: 0) data reference, 1) optional raf reference
|
---|
615 | # Returns: data up to and including matching delimiter (or undef on error)
|
---|
616 | # - updates data reference with trailing data
|
---|
617 | # - unescapes characters in literal strings
|
---|
618 | sub ReadToNested($;$)
|
---|
619 | {
|
---|
620 | my ($dataPt, $raf) = @_;
|
---|
621 | # matching closing delimiters
|
---|
622 | my %closingDelim = (
|
---|
623 | '<<' => '>>',
|
---|
624 | '(' => ')',
|
---|
625 | '[' => ']',
|
---|
626 | '<' => '>',
|
---|
627 | );
|
---|
628 | my @delim = (''); # closing delimiter list, most deeply nested first
|
---|
629 | pos($$dataPt) = 0; # begin at start of data
|
---|
630 | for (;;) {
|
---|
631 | unless ($$dataPt =~ /(\\*)(\(|\)|<{1,2}|>{1,2}|\[|\]|%)/g) {
|
---|
632 | # must read some more data
|
---|
633 | my $buff;
|
---|
634 | last unless $raf and $raf->ReadLine($buff);
|
---|
635 | $$dataPt .= $buff;
|
---|
636 | pos($$dataPt) = length($$dataPt) - length($buff);
|
---|
637 | next;
|
---|
638 | }
|
---|
639 | # are we in a literal string?
|
---|
640 | if ($delim[0] eq ')') {
|
---|
641 | # ignore escaped delimiters (preceeded by odd number of \'s)
|
---|
642 | next if length($1) & 0x01;
|
---|
643 | # ignore all delimiters but unescaped braces
|
---|
644 | next unless $2 eq '(' or $2 eq ')';
|
---|
645 | } elsif ($2 eq '%') {
|
---|
646 | # ignore the comment
|
---|
647 | my $pos = pos($$dataPt) - 1;
|
---|
648 | # remove everything from '%' up to but not including newline
|
---|
649 | $$dataPt =~ /.*/g;
|
---|
650 | my $end = pos($$dataPt);
|
---|
651 | $$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
|
---|
652 | pos($$dataPt) = $pos;
|
---|
653 | next;
|
---|
654 | }
|
---|
655 | if ($closingDelim{$2}) {
|
---|
656 | # push the corresponding closing delimiter
|
---|
657 | unshift @delim, $closingDelim{$2};
|
---|
658 | next;
|
---|
659 | }
|
---|
660 | unless ($2 eq $delim[0]) {
|
---|
661 | # handle the case where we find a ">>>" and interpret it
|
---|
662 | # as ">> >" instead of "> >>"
|
---|
663 | next unless $2 eq '>>' and $delim[0] eq '>';
|
---|
664 | pos($$dataPt) = pos($$dataPt) - 1;
|
---|
665 | }
|
---|
666 | my $delim = shift @delim; # remove from nesting list
|
---|
667 | next if $delim[0]; # keep going if we have more nested delimiters
|
---|
668 | my $pos = pos($$dataPt);
|
---|
669 | my $buff = substr($$dataPt, 0, $pos);
|
---|
670 | $$dataPt = substr($$dataPt, $pos);
|
---|
671 | return $buff; # success!
|
---|
672 | }
|
---|
673 | return undef; # didn't find matching delimiter
|
---|
674 | }
|
---|
675 |
|
---|
676 | #------------------------------------------------------------------------------
|
---|
677 | # Decode filtered stream
|
---|
678 | # Inputs: 0) ExifTool object reference, 1) dictionary reference
|
---|
679 | # Returns: true if stream has been decoded OK
|
---|
680 | sub DecodeStream($$)
|
---|
681 | {
|
---|
682 | my ($exifTool, $dict) = @_;
|
---|
683 |
|
---|
684 | return 0 unless $$dict{stream}; # no stream to decode
|
---|
685 | # apply decryption first if required
|
---|
686 | if ($cryptInfo and not $$dict{decrypted}) {
|
---|
687 | $$dict{decrypted} = 1;
|
---|
688 | if ($$cryptInfo{meta} or ($$dict{Type} and $$dict{Type} ne '/Metadata')) {
|
---|
689 | Decrypt(\$$dict{stream});
|
---|
690 | }
|
---|
691 | }
|
---|
692 | return 1 unless $$dict{Filter};
|
---|
693 | if ($$dict{Filter} eq '/FlateDecode') {
|
---|
694 | if (eval 'require Compress::Zlib') {
|
---|
695 | my $inflate = Compress::Zlib::inflateInit();
|
---|
696 | my ($buff, $stat);
|
---|
697 | $inflate and ($buff, $stat) = $inflate->inflate($$dict{stream});
|
---|
698 | if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
|
---|
699 | $$dict{stream} = $buff;
|
---|
700 | # move Filter to prevent double decoding
|
---|
701 | $$dict{oldFilter} = $$dict{Filter};
|
---|
702 | $$dict{Filter} = '';
|
---|
703 | } else {
|
---|
704 | $exifTool->Warn('Error inflating stream');
|
---|
705 | return 0;
|
---|
706 | }
|
---|
707 | } else {
|
---|
708 | WarnOnce($exifTool,'Install Compress::Zlib to decode filtered streams');
|
---|
709 | return 0;
|
---|
710 | }
|
---|
711 | #
|
---|
712 | # apply anti-predictor if necessary
|
---|
713 | #
|
---|
714 | return 1 unless $$dict{DecodeParms};
|
---|
715 | my $pre = $dict->{DecodeParms}->{Predictor};
|
---|
716 | return 1 unless $pre and $pre != 1;
|
---|
717 | if ($pre != 12) {
|
---|
718 | # currently only support 'up' prediction
|
---|
719 | WarnOnce($exifTool,"FlateDecode Predictor $pre not currently supported");
|
---|
720 | return 0;
|
---|
721 | }
|
---|
722 | my $cols = $dict->{DecodeParms}->{Columns};
|
---|
723 | unless ($cols) {
|
---|
724 | # currently only support 'up' prediction
|
---|
725 | WarnOnce($exifTool,'No Columns for decoding stream');
|
---|
726 | return 0;
|
---|
727 | }
|
---|
728 | my @bytes = unpack('C*', $$dict{stream});
|
---|
729 | my @pre = (0) x $cols; # initialize predictor array
|
---|
730 | my $buff = '';
|
---|
731 | while (@bytes > $cols) {
|
---|
732 | unless (($_ = shift @bytes) == 2) {
|
---|
733 | WarnOnce($exifTool, "Unsupported PNG filter $_");
|
---|
734 | return 0;
|
---|
735 | }
|
---|
736 | foreach (@pre) {
|
---|
737 | $_ = ($_ + shift(@bytes)) & 0xff;
|
---|
738 | }
|
---|
739 | $buff .= pack('C*', @pre);
|
---|
740 | }
|
---|
741 | $$dict{stream} = $buff;
|
---|
742 | } else {
|
---|
743 | WarnOnce($exifTool, "Unsupported Filter $$dict{Filter}");
|
---|
744 | return 0;
|
---|
745 | }
|
---|
746 | return 1;
|
---|
747 | }
|
---|
748 |
|
---|
749 | #------------------------------------------------------------------------------
|
---|
750 | # Initialize state for RC4 en/decryption (ref 2)
|
---|
751 | # Inputs: 0) RC4 key string
|
---|
752 | # Returns: RC4 key hash reference
|
---|
753 | sub RC4Init($)
|
---|
754 | {
|
---|
755 | my @key = unpack('C*', shift);
|
---|
756 | my @state = (0 .. 255);
|
---|
757 | my ($i, $j) = (0, 0);
|
---|
758 | while ($i < 256) {
|
---|
759 | my $st = $state[$i];
|
---|
760 | $j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
|
---|
761 | $state[$i++] = $state[$j];
|
---|
762 | $state[$j] = $st;
|
---|
763 | }
|
---|
764 | return { State => \@state, XY => [ 0, 0 ] };
|
---|
765 | }
|
---|
766 |
|
---|
767 | #------------------------------------------------------------------------------
|
---|
768 | # Apply RC4 en/decryption (ref 2)
|
---|
769 | # Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
|
---|
770 | # - can call this method directly with a key string, or with with the key
|
---|
771 | # reference returned by RC4Init
|
---|
772 | # - RC4 is a symmetric algorithm, so encryption is the same as decryption
|
---|
773 | sub RC4Crypt($$)
|
---|
774 | {
|
---|
775 | my ($dataPt, $key) = @_;
|
---|
776 | $key = RC4Init($key) unless ref $key eq 'HASH';
|
---|
777 | my $state = $$key{State};
|
---|
778 | my ($x, $y) = @{$$key{XY}};
|
---|
779 |
|
---|
780 | my @data = unpack('C*', $$dataPt);
|
---|
781 | foreach (@data) {
|
---|
782 | $x = ($x + 1) & 0xff;
|
---|
783 | my $stx = $$state[$x];
|
---|
784 | $y = ($stx + $y) & 0xff;
|
---|
785 | my $sty = $$state[$x] = $$state[$y];
|
---|
786 | $$state[$y] = $stx;
|
---|
787 | $_ ^= $$state[($stx + $sty) & 0xff];
|
---|
788 | }
|
---|
789 | $$key{XY} = [ $x, $y ];
|
---|
790 | $$dataPt = pack('C*', @data);
|
---|
791 | }
|
---|
792 |
|
---|
793 | #------------------------------------------------------------------------------
|
---|
794 | # Initialize decryption
|
---|
795 | # Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
|
---|
796 | # 2) ID from file trailer dictionary
|
---|
797 | # Returns: error string or undef on success
|
---|
798 | sub DecryptInit($$$)
|
---|
799 | {
|
---|
800 | my ($exifTool, $encrypt, $id) = @_;
|
---|
801 | unless ($encrypt and ref $encrypt eq 'HASH') {
|
---|
802 | return 'Error loading Encrypt object';
|
---|
803 | }
|
---|
804 | my $filt = $$encrypt{Filter};
|
---|
805 | unless ($filt and $filt =~ s/^\///) {
|
---|
806 | return 'Encrypt dictionary has no Filter!';
|
---|
807 | }
|
---|
808 | my $ver = $$encrypt{V} || 0;
|
---|
809 | my $rev = $$encrypt{R} || 0;
|
---|
810 | $exifTool->FoundTag('Encryption', "$filt v$ver.$rev");
|
---|
811 | unless ($$encrypt{Filter} eq '/Standard') {
|
---|
812 | $$encrypt{Filter} =~ s/^\///;
|
---|
813 | return "PDF '$$encrypt{Filter}' encryption not currently supported";
|
---|
814 | }
|
---|
815 | unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
|
---|
816 | return 'Incomplete Encrypt specification';
|
---|
817 | }
|
---|
818 | unless ($ver == 1 or $ver == 2) {
|
---|
819 | return "Encryption algorithm $ver currently not supported";
|
---|
820 | }
|
---|
821 | $id or return "Can't decrypt (no document ID)";
|
---|
822 | unless (eval 'require Digest::MD5') {
|
---|
823 | return 'Install Digest::MD5 to extract encrypted information';
|
---|
824 | }
|
---|
825 | # calculate file-level en/decryption key
|
---|
826 | my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
|
---|
827 | "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
|
---|
828 | my $key = $pad . $$encrypt{O} . pack('V', $$encrypt{P}) . $id;
|
---|
829 | my $rep = 1;
|
---|
830 | $$encrypt{meta} = 1; # set flag that Metadata is encrypted
|
---|
831 | if ($rev >= 3) {
|
---|
832 | # in rev 4 (not yet supported), metadata streams may not be encrypted
|
---|
833 | if ($$encrypt{EncryptMetadata} and $$encrypt{EncryptMetadata} =~ /false/i) {
|
---|
834 | delete $$encrypt{meta}; # Meta data isn't encrypted after all
|
---|
835 | $key .= "\xff\xff\xff\xff"; # must add this if metadata not encrypted
|
---|
836 | }
|
---|
837 | $rep += 50; # repeat MD5 50 more times if revision is 3 or greater
|
---|
838 | }
|
---|
839 | my ($len, $i);
|
---|
840 | if ($ver == 1) {
|
---|
841 | $len = 5;
|
---|
842 | } else {
|
---|
843 | $len = $$encrypt{Length} || 40;
|
---|
844 | $len >= 40 or return 'Bad Encrypt Length';
|
---|
845 | $len = int($len / 8);
|
---|
846 | }
|
---|
847 | for ($i=0; $i<$rep; ++$i) {
|
---|
848 | $key = substr(Digest::MD5::md5($key), 0, $len);
|
---|
849 | }
|
---|
850 | # decrypt U to see if a user password is required
|
---|
851 | my $dat;
|
---|
852 | if ($rev >= 3) {
|
---|
853 | $dat = Digest::MD5::md5($pad . $id);
|
---|
854 | RC4Crypt(\$dat, $key);
|
---|
855 | for ($i=1; $i<=19; ++$i) {
|
---|
856 | my @key = unpack('C*', $key);
|
---|
857 | foreach (@key) { $_ ^= $i; }
|
---|
858 | RC4Crypt(\$dat, pack('C*', @key));
|
---|
859 | }
|
---|
860 | $dat .= substr($$encrypt{U}, 16);
|
---|
861 | } else {
|
---|
862 | $dat = $pad;
|
---|
863 | RC4Crypt(\$dat, $key);
|
---|
864 | }
|
---|
865 | $dat eq $$encrypt{U} or return 'Document is password encrypted';
|
---|
866 | $$encrypt{key} = $key; # save the file-level encryption key
|
---|
867 | $cryptInfo = $encrypt; # save a reference to the Encrypt object
|
---|
868 | return undef; # success!
|
---|
869 | }
|
---|
870 |
|
---|
871 | #------------------------------------------------------------------------------
|
---|
872 | # Decrypt data
|
---|
873 | # Inputs: 0) data reference
|
---|
874 | sub Decrypt($)
|
---|
875 | {
|
---|
876 | my $dataPt = shift;
|
---|
877 | my $key = $$cryptInfo{key};
|
---|
878 | my $len = length($key) + 5;
|
---|
879 | return unless $lastFetched =~ /^(I\d+ )?(\d+) (\d+)/;
|
---|
880 | $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
|
---|
881 | $len = 16 if $len > 16;
|
---|
882 | $key = substr(Digest::MD5::md5($key), 0, $len);
|
---|
883 | RC4Crypt($dataPt, $key);
|
---|
884 | }
|
---|
885 |
|
---|
886 | #------------------------------------------------------------------------------
|
---|
887 | # Process PDF dictionary extract tag values
|
---|
888 | # Inputs: 0) ExifTool object reference, 1) tag table reference
|
---|
889 | # 2) dictionary reference, 3) cross-reference table reference,
|
---|
890 | # 4) nesting depth
|
---|
891 | sub ProcessDict($$$$;$)
|
---|
892 | {
|
---|
893 | my ($exifTool, $tagTablePtr, $dict, $xref, $nesting) = @_;
|
---|
894 | my $verbose = $exifTool->Options('Verbose');
|
---|
895 | my @tags = @{$$dict{tags}};
|
---|
896 | my $index = 0;
|
---|
897 | my $next;
|
---|
898 |
|
---|
899 | $nesting = ($nesting || 0) + 1;
|
---|
900 | if ($nesting > 50) {
|
---|
901 | WarnOnce($exifTool, 'Nesting too deep (directory ignored)');
|
---|
902 | return;
|
---|
903 | }
|
---|
904 | #
|
---|
905 | # extract information from all tags in the dictionary
|
---|
906 | #
|
---|
907 | for (;;) {
|
---|
908 | my ($tag, $tagInfo);
|
---|
909 | if (@tags) {
|
---|
910 | $tag = shift @tags;
|
---|
911 | } elsif (defined $next and not $next) {
|
---|
912 | $tag = 'Next';
|
---|
913 | $next = 1;
|
---|
914 | } else {
|
---|
915 | last;
|
---|
916 | }
|
---|
917 | if ($$tagTablePtr{$tag}) {
|
---|
918 | $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
|
---|
919 | }
|
---|
920 | my $val = $$dict{$tag};
|
---|
921 | if ($verbose) {
|
---|
922 | my ($val2, $extra);
|
---|
923 | if (ref $val eq 'SCALAR') {
|
---|
924 | $extra = ", indirect object ($$val)";
|
---|
925 | if ($fetched{$$val}) {
|
---|
926 | $val2 = "ref($$val)";
|
---|
927 | } elsif ($tag eq 'Next' and not $next) {
|
---|
928 | # handle 'Next' links after all others
|
---|
929 | $next = 0;
|
---|
930 | next;
|
---|
931 | } else {
|
---|
932 | $fetched{$$val} = 1;
|
---|
933 | $val = FetchObject($exifTool, $$val, $xref, $tag);
|
---|
934 | $val2 = '<err>' unless defined $val;
|
---|
935 | }
|
---|
936 | } elsif (ref $val eq 'HASH') {
|
---|
937 | $extra = ', direct dictionary';
|
---|
938 | } elsif (ref $val eq 'ARRAY') {
|
---|
939 | $extra = ', direct array of ' . scalar(@$val) . ' objects';
|
---|
940 | } else {
|
---|
941 | $extra = ', direct object';
|
---|
942 | }
|
---|
943 | my $isSubdir;
|
---|
944 | if (ref $val eq 'HASH') {
|
---|
945 | $isSubdir = 1;
|
---|
946 | } elsif (ref $val eq 'ARRAY') {
|
---|
947 | # recurse into objects in arrays only if they are lists of
|
---|
948 | # dictionaries or indirect objects which could be dictionaries
|
---|
949 | $isSubdir = 1 if @$val;
|
---|
950 | foreach (@$val) {
|
---|
951 | next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
|
---|
952 | undef $isSubdir;
|
---|
953 | last;
|
---|
954 | }
|
---|
955 | }
|
---|
956 | if ($isSubdir) {
|
---|
957 | # create bogus subdirectory to recurse into this dict
|
---|
958 | $tagInfo or $tagInfo = {
|
---|
959 | Name => $tag,
|
---|
960 | SubDirectory => {
|
---|
961 | TagTable => 'Image::ExifTool::PDF::Unknown',
|
---|
962 | },
|
---|
963 | };
|
---|
964 | } elsif (ref $val eq 'ARRAY') {
|
---|
965 | my @list = @$val;
|
---|
966 | foreach (@list) {
|
---|
967 | $_ = "ref($$_)" if ref $_ eq 'SCALAR';
|
---|
968 | }
|
---|
969 | $val2 = '[' . join(',',@list) . ']';
|
---|
970 | }
|
---|
971 | $exifTool->VerboseInfo($tag, $tagInfo,
|
---|
972 | Value => $val2 || $val,
|
---|
973 | Extra => $extra,
|
---|
974 | Index => $index++,
|
---|
975 | );
|
---|
976 | }
|
---|
977 | unless ($tagInfo) {
|
---|
978 | # add any tag found in Info directory to table
|
---|
979 | next unless $$tagTablePtr{EXTRACT_UNKNOWN};
|
---|
980 | $tagInfo = { Name => $tag };
|
---|
981 | Image::ExifTool::AddTagToTable($tagTablePtr, $tag, $tagInfo);
|
---|
982 | }
|
---|
983 | unless ($$tagInfo{SubDirectory}) {
|
---|
984 | # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
|
---|
985 | # unless this is binary data (hex-encoded strings would not have been converted)
|
---|
986 | if ($val =~ s/^\xfe\xff// and not $$tagInfo{Binary}) {
|
---|
987 | $val = $exifTool->Unicode2Charset($val, 'MM');
|
---|
988 | }
|
---|
989 | if ($$tagInfo{List}) {
|
---|
990 | # separate tokens in comma or whitespace delimited lists
|
---|
991 | my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
|
---|
992 | foreach $val (@values) {
|
---|
993 | $exifTool->FoundTag($tagInfo, $val);
|
---|
994 | }
|
---|
995 | } else {
|
---|
996 | # a tag value
|
---|
997 | $exifTool->FoundTag($tagInfo, $val);
|
---|
998 | }
|
---|
999 | next;
|
---|
1000 | }
|
---|
1001 | # process the subdirectory
|
---|
1002 | my @subDicts;
|
---|
1003 | if (ref $val eq 'ARRAY') {
|
---|
1004 | @subDicts = @{$val};
|
---|
1005 | } else {
|
---|
1006 | @subDicts = ( $val );
|
---|
1007 | }
|
---|
1008 | # loop through all values of this tag
|
---|
1009 | for (;;) {
|
---|
1010 | my $subDict = shift @subDicts or last;
|
---|
1011 | if (ref $subDict eq 'SCALAR') {
|
---|
1012 | # load dictionary via an indirect reference
|
---|
1013 | $fetched{$$subDict} = 1;
|
---|
1014 | $subDict = FetchObject($exifTool, $$subDict, $xref, $tag);
|
---|
1015 | $subDict or $exifTool->Warn("Error reading $tag object"), next;
|
---|
1016 | }
|
---|
1017 | if (ref $subDict eq 'ARRAY') {
|
---|
1018 | # convert array of key/value pairs to a hash
|
---|
1019 | next if @$subDict < 2;
|
---|
1020 | my %hash = ( tags => [] );
|
---|
1021 | while (@$subDict >= 2) {
|
---|
1022 | my $key = shift @$subDict;
|
---|
1023 | $key =~ s/^\///;
|
---|
1024 | push @{$hash{tags}}, $key;
|
---|
1025 | $hash{$key} = shift @$subDict;
|
---|
1026 | }
|
---|
1027 | $subDict = \%hash;
|
---|
1028 | } else {
|
---|
1029 | next unless ref $subDict eq 'HASH';
|
---|
1030 | }
|
---|
1031 | my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
1032 | if (not $verbose) {
|
---|
1033 | ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting);
|
---|
1034 | } elsif ($next) {
|
---|
1035 | # handle 'Next' links at this level to avoid deep recursion
|
---|
1036 | undef $next;
|
---|
1037 | $index = 0;
|
---|
1038 | $tagTablePtr = $subTablePtr;
|
---|
1039 | $dict = $subDict;
|
---|
1040 | @tags = @{$$subDict{tags}};
|
---|
1041 | $exifTool->VerboseDir($tag, scalar(@tags));
|
---|
1042 | } else {
|
---|
1043 | my $oldIndent = $exifTool->{INDENT};
|
---|
1044 | my $oldDir = $exifTool->{DIR_NAME};
|
---|
1045 | $exifTool->{INDENT} .= '| ';
|
---|
1046 | $exifTool->{DIR_NAME} = $tag;
|
---|
1047 | $exifTool->VerboseDir($tag, scalar(@{$$subDict{tags}}));
|
---|
1048 | ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting);
|
---|
1049 | $exifTool->{INDENT} = $oldIndent;
|
---|
1050 | $exifTool->{DIR_NAME} = $oldDir;
|
---|
1051 | }
|
---|
1052 | }
|
---|
1053 | }
|
---|
1054 | #
|
---|
1055 | # extract information from stream object if it exists
|
---|
1056 | #
|
---|
1057 | return unless $$dict{stream};
|
---|
1058 | my $tag = 'stream';
|
---|
1059 | # add Subtype (if it exists) to stream name and remove leading '/'
|
---|
1060 | ($tag = "$$dict{Subtype}_$tag") =~ s/^\/// if $$dict{Subtype};
|
---|
1061 | return unless $$tagTablePtr{$tag};
|
---|
1062 | my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
|
---|
1063 | # decode stream if necessary
|
---|
1064 | DecodeStream($exifTool, $dict) or return;
|
---|
1065 | # extract information from stream
|
---|
1066 | my %dirInfo = (
|
---|
1067 | DataPt => \$$dict{stream},
|
---|
1068 | DataLen => length $$dict{stream},
|
---|
1069 | DirStart => 0,
|
---|
1070 | DirLen => length $$dict{stream},
|
---|
1071 | Parent => 'PDF',
|
---|
1072 | );
|
---|
1073 | my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
1074 | unless ($exifTool->ProcessDirectory(\%dirInfo, $subTablePtr)) {
|
---|
1075 | $exifTool->Warn("Error processing $$tagInfo{Name} information");
|
---|
1076 | }
|
---|
1077 | }
|
---|
1078 |
|
---|
1079 | #------------------------------------------------------------------------------
|
---|
1080 | # Extract information from PDF file
|
---|
1081 | # Inputs: 0) ExifTool object reference, 1) dirInfo reference
|
---|
1082 | # Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
|
---|
1083 | sub ReadPDF($$)
|
---|
1084 | {
|
---|
1085 | my ($exifTool, $dirInfo) = @_;
|
---|
1086 | my $raf = $$dirInfo{RAF};
|
---|
1087 | my $verbose = $exifTool->Options('Verbose');
|
---|
1088 | my ($data, $encrypt, $id);
|
---|
1089 | #
|
---|
1090 | # validate PDF file
|
---|
1091 | #
|
---|
1092 | $raf->Read($data, 4) == 4 or return 0;
|
---|
1093 | $data eq '%PDF' or return 0;
|
---|
1094 | $exifTool->SetFileType(); # set the FileType tag
|
---|
1095 | my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
|
---|
1096 | #
|
---|
1097 | # read the xref tables referenced from startxref at the end of the file
|
---|
1098 | #
|
---|
1099 | my @xrefOffsets;
|
---|
1100 | $raf->Seek(0, 2) or return -2;
|
---|
1101 | # the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
|
---|
1102 | my $len = $raf->Tell();
|
---|
1103 | $len = 1024 if $len > 1024;
|
---|
1104 | $raf->Seek(-$len, 2) or return -2;
|
---|
1105 | $raf->Read($data, $len) == $len or return -3;
|
---|
1106 | $data =~ /.*startxref(\x0d\x0a|\x0a\x0a|\x0d|\x0a)(\d+)\1%%EOF/s or return -4;
|
---|
1107 | $/ = $1; # set input record separator
|
---|
1108 | push @xrefOffsets, $2;
|
---|
1109 | my (%xref, @mainDicts, %loaded);
|
---|
1110 | while (@xrefOffsets) {
|
---|
1111 | my $offset = shift @xrefOffsets;
|
---|
1112 | next if $loaded{$offset}; # avoid infinite recursion
|
---|
1113 | unless ($raf->Seek($offset, 0)) {
|
---|
1114 | %loaded or return -5;
|
---|
1115 | $exifTool->Warn('Bad offset for secondary xref table');
|
---|
1116 | next;
|
---|
1117 | }
|
---|
1118 | unless ($raf->ReadLine($data)) {
|
---|
1119 | %loaded or return -6;
|
---|
1120 | $exifTool->Warn('Bad offset for secondary xref table');
|
---|
1121 | next;
|
---|
1122 | }
|
---|
1123 | my $loadXRefStream;
|
---|
1124 | if ($data eq "xref$/") {
|
---|
1125 | # load xref table
|
---|
1126 | for (;;) {
|
---|
1127 | $raf->ReadLine($data) or return -6;
|
---|
1128 | last if $data eq "trailer$/";
|
---|
1129 | my ($start, $num) = $data =~ /(\d+)\s+(\d+)/;
|
---|
1130 | $num or return -4;
|
---|
1131 | my $i;
|
---|
1132 | for ($i=0; $i<$num; ++$i) {
|
---|
1133 | $raf->Read($data, 20) == 20 or return -6;
|
---|
1134 | $data =~ /^(\d{10}) (\d{5}) (f|n)/ or return -4;
|
---|
1135 | next if $3 eq 'f'; # ignore free entries
|
---|
1136 | # save the object offset keyed by its reference
|
---|
1137 | my $ref = ($start + $i) . ' ' . int($2) . ' R';
|
---|
1138 | $xref{$ref} = int($1);
|
---|
1139 | }
|
---|
1140 | }
|
---|
1141 | %xref or return -4;
|
---|
1142 | $data = '';
|
---|
1143 | } elsif ($data =~ s/^(\d+)\s+(\d+)\s+obj//) {
|
---|
1144 | # this is a PDF-1.5 cross-reference stream dictionary
|
---|
1145 | $loadXRefStream = 1;
|
---|
1146 | } else {
|
---|
1147 | %loaded or return -4;
|
---|
1148 | $exifTool->Warn('Invalid secondary xref table');
|
---|
1149 | next;
|
---|
1150 | }
|
---|
1151 | my $mainDict = ExtractObject($exifTool, \$data, $raf, \%xref);
|
---|
1152 | unless ($mainDict) {
|
---|
1153 | %loaded or return -8;
|
---|
1154 | $exifTool->Warn('Error loading secondary dictionary');
|
---|
1155 | next;
|
---|
1156 | }
|
---|
1157 | if ($loadXRefStream) {
|
---|
1158 | # decode and save our XRef stream from PDF-1.5 file
|
---|
1159 | # (parse it later as required to avoid wasting time)
|
---|
1160 | if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
|
---|
1161 | @{$$mainDict{W}} > 2 and $$mainDict{Size} and
|
---|
1162 | DecodeStream($exifTool, $mainDict))
|
---|
1163 | {
|
---|
1164 | # create Index entry if it doesn't exist
|
---|
1165 | $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
|
---|
1166 | # create 'entry_size' entry for internal use
|
---|
1167 | my $w = $$mainDict{W};
|
---|
1168 | my $size = 0;
|
---|
1169 | foreach (@$w) { $size += $_; }
|
---|
1170 | $$mainDict{entry_size} = $size;
|
---|
1171 | # save this stream dictionary to use later if required
|
---|
1172 | $xref{dicts} = [] unless $xref{dicts};
|
---|
1173 | push @{$xref{dicts}}, $mainDict;
|
---|
1174 | } else {
|
---|
1175 | %loaded or return -9;
|
---|
1176 | $exifTool->Warn('Invalid xref stream in secondary dictionary');
|
---|
1177 | }
|
---|
1178 | }
|
---|
1179 | $loaded{$offset} = 1;
|
---|
1180 | # load XRef stream in hybrid file if it exists
|
---|
1181 | push @xrefOffsets, $$mainDict{XRefStm} if $$mainDict{XRefStm};
|
---|
1182 | $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
|
---|
1183 | if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
|
---|
1184 | $id = $mainDict->{ID}->[0];
|
---|
1185 | }
|
---|
1186 | push @mainDicts, $mainDict;
|
---|
1187 | # load previous xref table if it exists
|
---|
1188 | push @xrefOffsets, $$mainDict{Prev} if $$mainDict{Prev};
|
---|
1189 | }
|
---|
1190 | #
|
---|
1191 | # extract encryption information if necessary
|
---|
1192 | #
|
---|
1193 | if ($encrypt) {
|
---|
1194 | if (ref $encrypt eq 'SCALAR') {
|
---|
1195 | $encrypt = FetchObject($exifTool, $$encrypt, \%xref, 'Encrypt');
|
---|
1196 | }
|
---|
1197 | # generate Encryption tag information
|
---|
1198 | my $err = DecryptInit($exifTool, $encrypt, $id);
|
---|
1199 | $err and $exifTool->Warn($err), return -1;
|
---|
1200 | }
|
---|
1201 | #
|
---|
1202 | # extract the information beginning with each of the main dictionaries
|
---|
1203 | #
|
---|
1204 | my $dict;
|
---|
1205 | foreach $dict (@mainDicts) {
|
---|
1206 | if ($verbose) {
|
---|
1207 | my $n = scalar(@{$$dict{tags}});
|
---|
1208 | $exifTool->VPrint(0, "PDF dictionary with $n entries:\n");
|
---|
1209 | }
|
---|
1210 | ProcessDict($exifTool, $tagTablePtr, $dict, \%xref);
|
---|
1211 | }
|
---|
1212 | return 1;
|
---|
1213 | }
|
---|
1214 |
|
---|
1215 | #------------------------------------------------------------------------------
|
---|
1216 | # ReadPDF() warning strings for each error return value
|
---|
1217 | my %pdfWarning = (
|
---|
1218 | # -1 is reserved as error return value with no associated warning
|
---|
1219 | -2 => 'Error seeking in file',
|
---|
1220 | -3 => 'Error reading file',
|
---|
1221 | -4 => 'Invalid xref table',
|
---|
1222 | -5 => 'Invalid xref offset',
|
---|
1223 | -6 => 'Error reading xref table',
|
---|
1224 | -7 => 'Error reading trailer',
|
---|
1225 | -8 => 'Error reading main dictionary',
|
---|
1226 | -9 => 'Invalid xref stream in main dictionary',
|
---|
1227 | );
|
---|
1228 |
|
---|
1229 | #------------------------------------------------------------------------------
|
---|
1230 | # Extract information from PDF file
|
---|
1231 | # Inputs: 0) ExifTool object reference, 1) dirInfo reference
|
---|
1232 | # Returns: 1 if this was a valid PDF file
|
---|
1233 | sub ProcessPDF($$)
|
---|
1234 | {
|
---|
1235 | my ($exifTool, $dirInfo) = @_;
|
---|
1236 |
|
---|
1237 | my $oldsep = $/;
|
---|
1238 | my $result = ReadPDF($exifTool, $dirInfo);
|
---|
1239 | $/ = $oldsep; # restore input record separator in case it was changed
|
---|
1240 | if ($result < 0) {
|
---|
1241 | $exifTool->Warn($pdfWarning{$result}) if $pdfWarning{$result};
|
---|
1242 | $result = 1;
|
---|
1243 | }
|
---|
1244 | # clean up and return
|
---|
1245 | undef %warnedOnce;
|
---|
1246 | undef %streamObjs;
|
---|
1247 | undef %fetched;
|
---|
1248 | undef $cryptInfo;
|
---|
1249 | return $result;
|
---|
1250 | }
|
---|
1251 |
|
---|
1252 | 1; # end
|
---|
1253 |
|
---|
1254 |
|
---|
1255 | __END__
|
---|
1256 |
|
---|
1257 | =head1 NAME
|
---|
1258 |
|
---|
1259 | Image::ExifTool::PDF - Read PDF meta information
|
---|
1260 |
|
---|
1261 | =head1 SYNOPSIS
|
---|
1262 |
|
---|
1263 | This module is loaded automatically by Image::ExifTool when required.
|
---|
1264 |
|
---|
1265 | =head1 DESCRIPTION
|
---|
1266 |
|
---|
1267 | This code reads meta information from PDF (Adobe Portable Document Format)
|
---|
1268 | files. It supports object streams introduced in PDF-1.5 but only with a
|
---|
1269 | limited set of Filter and Predictor algorithms, and it decodes encrypted
|
---|
1270 | information but only with a limited number of algorithms.
|
---|
1271 |
|
---|
1272 | =head1 AUTHOR
|
---|
1273 |
|
---|
1274 | Copyright 2003-2007, Phil Harvey (phil at owl.phy.queensu.ca)
|
---|
1275 |
|
---|
1276 | This library is free software; you can redistribute it and/or modify it
|
---|
1277 | under the same terms as Perl itself.
|
---|
1278 |
|
---|
1279 | =head1 REFERENCES
|
---|
1280 |
|
---|
1281 | =over 4
|
---|
1282 |
|
---|
1283 | =item L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
|
---|
1284 |
|
---|
1285 | =item L<http://www.cr0.net:8040/code/crypto/rc4/>
|
---|
1286 |
|
---|
1287 | =back
|
---|
1288 |
|
---|
1289 | =head1 SEE ALSO
|
---|
1290 |
|
---|
1291 | L<Image::ExifTool::TagNames/PDF Tags>,
|
---|
1292 | L<Image::ExifTool(3pm)|Image::ExifTool>
|
---|
1293 |
|
---|
1294 | =cut
|
---|