source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/PDF.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 72.8 KB
Line 
1#------------------------------------------------------------------------------
2# File: PDF.pm
3#
4# Description: Read PDF meta information
5#
6# Revisions: 07/11/2005 - P. Harvey Created
7# 07/25/2005 - P. Harvey Add support for encrypted documents
8#
9# References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10# 2) http://search.cpan.org/dist/Crypt-RC4/
11# 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12# 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13# 5) http://tools.ietf.org/search/rfc3454
14# 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15#------------------------------------------------------------------------------
16
17package Image::ExifTool::PDF;
18
19use strict;
20use vars qw($VERSION $AUTOLOAD $lastFetched);
21use Image::ExifTool qw(:DataAccess :Utils);
22require Exporter;
23
24$VERSION = '1.29';
25
26sub FetchObject($$$$);
27sub ExtractObject($$;$$);
28sub ReadToNested($;$);
29sub ProcessDict($$$$;$$);
30sub ReadPDFValue($);
31sub CheckPDF($$$);
32
33# $lastFetched - last fetched object reference (used for decryption)
34# (undefined if fetched object was already decrypted, ie. object from stream)
35
36my $cryptInfo; # encryption object reference (plus additional information)
37my $cryptString; # flag that strings are encrypted
38my $cryptStream; # flag that streams are encrypted
39my $lastOffset; # last fetched object offset
40my %streamObjs; # hash of stream objects
41my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
42my $pdfVer; # version of PDF file being processed
43
44# tags in main PDF directories
45%Image::ExifTool::PDF::Main = (
46 GROUPS => { 2 => 'Document' },
47 VARS => { CAPTURE => ['Main','Prev'] },
48 Info => {
49 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
50 },
51 Root => {
52 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
53 },
54 Encrypt => {
55 NoProcess => 1, # don't process normally (processed in advance)
56 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
57 },
58 _linearized => {
59 Name => 'Linearized',
60 Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
61 PrintConv => { 'true' => 'Yes', 'false' => 'No' },
62 },
63);
64
65# tags in PDF Info directory
66%Image::ExifTool::PDF::Info = (
67 GROUPS => { 2 => 'Document' },
68 VARS => { CAPTURE => ['Info'] },
69 EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
70 WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
71 CHECK_PROC => \&CheckPDF,
72 WRITABLE => 'string',
73 NOTES => q{
74 As well as the tags listed below, the PDF specification allows for
75 user-defined tags to exist in the Info dictionary. These tags, which should
76 have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
77 object, are also extracted by ExifTool.
78
79 B<Writable> specifies the value format, and may be C<string>, C<date>,
80 C<integer>, C<real>, C<boolean> or C<name> for PDF tags.
81 },
82 Title => { },
83 Author => { Groups => { 2 => 'Author' } },
84 Subject => { },
85 Keywords => { List => 'string' }, # this is a string list
86 Creator => { },
87 Producer => { },
88 CreationDate => {
89 Name => 'CreateDate',
90 Writable => 'date',
91 Groups => { 2 => 'Time' },
92 Shift => 'Time',
93 PrintConv => '$self->ConvertDateTime($val)',
94 PrintConvInv => '$self->InverseDateTime($val)',
95 },
96 ModDate => {
97 Name => 'ModifyDate',
98 Writable => 'date',
99 Groups => { 2 => 'Time' },
100 Shift => 'Time',
101 PrintConv => '$self->ConvertDateTime($val)',
102 PrintConvInv => '$self->InverseDateTime($val)',
103 },
104 Trapped => {
105 Protected => 1,
106 # remove leading '/' from '/True' or '/False'
107 ValueConv => '$val=~s{^/}{}; $val',
108 ValueConvInv => '"/$val"',
109 },
110 'AAPL:Keywords' => { #PH
111 Name => 'AppleKeywords',
112 List => 'array', # this is an array of values
113 Notes => q{
114 keywords written by Apple utilities, although they seem to use PDF:Keywords
115 when reading
116 },
117 },
118);
119
120# tags in the PDF Root document catalog
121%Image::ExifTool::PDF::Root = (
122 GROUPS => { 2 => 'Document' },
123 # note: can't capture previous versions of Root since they are not parsed
124 VARS => { CAPTURE => ['Root'] },
125 NOTES => 'This is the PDF document catalog.',
126 MarkInfo => {
127 SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
128 },
129 Metadata => {
130 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
131 },
132 Pages => {
133 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
134 },
135 Perms => {
136 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
137 },
138 Lang => 'Language',
139 PageLayout => { },
140 PageMode => { },
141 Version => 'PDFVersion',
142);
143
144# tags extracted from the PDF Encrypt dictionary
145%Image::ExifTool::PDF::Encrypt = (
146 GROUPS => { 2 => 'Document' },
147 NOTES => 'Tags extracted from the document Encrypt dictionary.',
148 Filter => {
149 Name => 'Encryption',
150 Notes => q{
151 extracted value is actually a combination of the Filter, SubFilter, V, R and
152 Length information from the Encrypt dictionary
153 },
154 },
155 P => {
156 Name => 'UserAccess',
157 ValueConv => '$val & 0x0f3c', # ignore reserved bits
158 PrintConvColumns => 2,
159 PrintConv => { BITMASK => {
160 2 => 'Print',
161 3 => 'Modify',
162 4 => 'Copy',
163 5 => 'Annotate',
164 8 => 'Fill forms',
165 9 => 'Extract',
166 10 => 'Assemble',
167 11 => 'Print high-res',
168 }},
169 },
170);
171
172# tags in PDF Pages directory
173%Image::ExifTool::PDF::Pages = (
174 GROUPS => { 2 => 'Document' },
175 Count => 'PageCount',
176 Kids => {
177 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
178 },
179);
180
181# tags in PDF Perms directory
182%Image::ExifTool::PDF::Perms = (
183 NOTES => 'Additional document permissions imposed by digital signatures.',
184 DocMDP => {
185 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
186 },
187 FieldMDP => {
188 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
189 },
190 UR3 => {
191 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
192 },
193);
194
195# tags in PDF Kids directory
196%Image::ExifTool::PDF::Kids = (
197 Metadata => {
198 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
199 },
200 PieceInfo => {
201 SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
202 },
203 Resources => {
204 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
205 },
206);
207
208# tags in PDF Resources directory
209%Image::ExifTool::PDF::Resources = (
210 ColorSpace => {
211 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
212 },
213);
214
215# tags in PDF ColorSpace directory
216%Image::ExifTool::PDF::ColorSpace = (
217 DefaultRGB => {
218 SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
219 },
220);
221
222# tags in PDF DefaultRGB directory
223%Image::ExifTool::PDF::DefaultRGB = (
224 ICCBased => {
225 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
226 },
227);
228
229# tags in PDF ICCBased directory
230%Image::ExifTool::PDF::ICCBased = (
231 _stream => {
232 SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
233 },
234);
235
236# tags in PDF PieceInfo directory
237%Image::ExifTool::PDF::PieceInfo = (
238 AdobePhotoshop => {
239 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
240 },
241 Illustrator => {
242 # assume this is an illustrator file if it contains this directory
243 # and doesn't have a ".PDF" extension
244 Condition => q{
245 $self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
246 return 1;
247 },
248 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
249 },
250);
251
252# tags in PDF AdobePhotoshop directory
253%Image::ExifTool::PDF::AdobePhotoshop = (
254 Private => {
255 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
256 },
257);
258
259# tags in PDF Illustrator directory
260%Image::ExifTool::PDF::Illustrator = (
261 Private => {
262 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
263 },
264);
265
266# tags in PDF Private directory
267%Image::ExifTool::PDF::Private = (
268 ImageResources => {
269 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
270 },
271);
272
273# tags in PDF AI Private directory
274%Image::ExifTool::PDF::AIPrivate = (
275 GROUPS => { 2 => 'Document' },
276 EXTRACT_UNKNOWN => 0, # extract known but numbered tags
277 AIMetaData => {
278 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
279 },
280 AIPrivateData => {
281 Notes => q{
282 the ExtractEmbedded option enables information to be extracted from embedded
283 PostScript documents in the AIPrivateData stream
284 },
285 SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
286 },
287 RoundTripVersion => { },
288 ContainerVersion => { },
289 CreatorVersion => { },
290);
291
292# tags in PDF AIMetaData directory
293%Image::ExifTool::PDF::AIMetaData = (
294 _stream => {
295 SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
296 },
297);
298
299# tags in PDF ImageResources directory
300%Image::ExifTool::PDF::ImageResources = (
301 _stream => {
302 SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
303 },
304);
305
306# tags in PDF MarkInfo directory
307%Image::ExifTool::PDF::MarkInfo = (
308 GROUPS => { 2 => 'Document' },
309 Marked => {
310 Name => 'TaggedPDF',
311 Notes => "not a Tagged PDF if this tag is missing",
312 PrintConv => { 'true' => 'Yes', 'false' => 'No' },
313 },
314);
315
316# tags in PDF Metadata directory
317%Image::ExifTool::PDF::Metadata = (
318 GROUPS => { 2 => 'Document' },
319 XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
320 Name => 'XMP',
321 SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
322 },
323);
324
325# tags in PDF signature directories (DocMDP, FieldMDP or UR3)
326%Image::ExifTool::PDF::Signature = (
327 GROUPS => { 2 => 'Document' },
328 ContactInfo => 'SignerContactInfo',
329 Location => 'SigningLocation',
330 M => {
331 Name => 'SigningDate',
332 Format => 'date',
333 Groups => { 2 => 'Time' },
334 PrintConv => '$self->ConvertDateTime($val)',
335 },
336 Name => 'SigningAuthority',
337 Reason => 'SigningReason',
338 Reference => {
339 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
340 },
341 Prop_AuthTime => {
342 Name => 'AuthenticationTime',
343 PrintConv => 'ConvertTimeSpan($val) . " ago"',
344 },
345 Prop_AuthType => 'AuthenticationType',
346);
347
348# tags in PDF Reference directory
349%Image::ExifTool::PDF::Reference = (
350 TransformParams => {
351 SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
352 },
353);
354
355# tags in PDF TransformParams directory
356%Image::ExifTool::PDF::TransformParams = (
357 GROUPS => { 2 => 'Document' },
358 Annots => {
359 Name => 'AnnotationUsageRights',
360 Notes => 'UR3 signatures only',
361 List => 1,
362 },
363 Document => {
364 Name => 'DocumentUsageRights',
365 Notes => 'UR3 signatures only',
366 List => 1,
367 },
368 Form => {
369 Name => 'FormUsageRights',
370 Notes => 'UR3 signatures only',
371 List => 1,
372 },
373 Signature => {
374 Name => 'SignatureUsageRights',
375 Notes => 'UR3 signatures only',
376 List => 1,
377 },
378 EF => {
379 Name => 'EmbeddedFileUsageRights',
380 Notes => 'UR3 signatures only',
381 List => 1,
382 },
383 Msg => {
384 Name => 'UsageRightsMessage',
385 Notes => 'UR3 signatures only',
386 },
387 P => {
388 Name => 'ModificationPermissions',
389 Notes => q{
390 1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
391 false
392 },
393 PrintConv => {
394 1 => 'No changes permitted',
395 2 => 'Fill forms, Create page templates, Sign',
396 3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
397 'true' => 'Restrict al applications to reader permissions',
398 'false' => 'Do not restrict applications to reader permissions',
399 },
400 },
401 Action => {
402 Name => 'FieldPermissions',
403 Notes => 'FieldMDP signatures only',
404 PrintConv => {
405 'All' => 'Disallow changes to all form fields',
406 'Include' => 'Disallow changes to specified form fields',
407 'Exclude' => 'Allow changes to specified form fields',
408 },
409 },
410 Fields => {
411 Notes => 'FieldMDP signatures only',
412 Name => 'FormFields',
413 List => 1,
414 },
415);
416
417# unknown tags for use in verbose option
418%Image::ExifTool::PDF::Unknown = (
419 GROUPS => { 2 => 'Unknown' },
420);
421
422#------------------------------------------------------------------------------
423# AutoLoad our writer routines when necessary
424#
425sub AUTOLOAD
426{
427 return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
428}
429
430#------------------------------------------------------------------------------
431# Convert from PDF to EXIF-style date/time
432# Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
433# Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
434sub ConvertPDFDate($)
435{
436 my $date = shift;
437 # remove optional 'D:' prefix
438 $date =~ s/^D://;
439 # fill in default values if necessary
440 # YYYYmmddHHMMSS
441 my $default = '00000101000000';
442 if (length $date < length $default) {
443 $date .= substr($default, length $date);
444 }
445 $date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
446 $date = "$1:$2:$3 $4:$5:$6";
447 if ($7) {
448 my $tz = $7;
449 if ($tz =~ /^\s*Z/i) {
450 # ignore any "HH'mm'" after the Z (OS X 10.6 does this)
451 $date .= 'Z';
452 # tolerate some improper formatting in timezone specification
453 } elsif ($tz =~ /^\s*([-+])\s*(\d+)[': ]+(\d*)/) {
454 $date .= $1 . $2 . ':' . ($3 || '00');
455 }
456 }
457 return $date;
458}
459
460#------------------------------------------------------------------------------
461# Locate any object in the XRef tables (including compressed objects)
462# Inputs: 0) XRef reference, 1) object reference string (or free object number)
463# Returns: offset to object in file or compressed object reference string,
464# 0 if object is free, or undefined on error
465sub LocateAnyObject($$)
466{
467 my ($xref, $ref) = @_;
468 return undef unless $xref;
469 return $$xref{$ref} if exists $$xref{$ref};
470 # get the object number
471 return undef unless $ref =~ /^(\d+)/;
472 my $objNum = $1;
473 # return 0 if the object number has been reused (old object is free)
474 return 0 if defined $$xref{$objNum};
475#
476# scan our XRef stream dictionaries for this object
477#
478 return undef unless $$xref{dicts};
479 my $dict;
480 foreach $dict (@{$$xref{dicts}}) {
481 # quick check to see if the object is in the range for this xref stream
482 next if $objNum >= $$dict{Size};
483 my $index = $$dict{Index};
484 next if $objNum < $$index[0];
485 # scan the tables for the specified object
486 my $size = $$dict{_entry_size};
487 my $num = scalar(@$index) / 2;
488 my $tot = 0;
489 my $i;
490 for ($i=0; $i<$num; ++$i) {
491 my $start = $$index[$i*2];
492 my $count = $$index[$i*2+1];
493 # table is in ascending order, so quit if we have passed the object
494 last if $objNum < $start;
495 if ($objNum < $start + $count) {
496 my $offset = $size * ($objNum - $start + $tot);
497 last if $offset + $size > length $$dict{_stream};
498 my @c = unpack("x$offset C$size", $$dict{_stream});
499 # extract values from this table entry
500 # (can be 1, 2, 3, 4, etc.. bytes per value)
501 my (@t, $j, $k);
502 my $w = $$dict{W};
503 for ($j=0; $j<3; ++$j) {
504 # use default value if W entry is 0 (as per spec)
505 # - 0th element defaults to 1, others default to 0
506 $$w[$j] or $t[$j] = ($j ? 0 : 1), next;
507 $t[$j] = shift(@c);
508 for ($k=1; $k < $$w[$j]; ++$k) {
509 $t[$j] = 256 * $t[$j] + shift(@c);
510 }
511 }
512 # by default, use "o g R" as the xref key
513 # (o = object number, g = generation number)
514 my $ref2 = "$objNum $t[2] R";
515 if ($t[0] == 1) {
516 # normal object reference:
517 # $t[1]=offset of object from start, $t[2]=generation number
518 $$xref{$ref2} = $t[1];
519 } elsif ($t[0] == 2) {
520 # compressed object reference:
521 # $t[1]=stream object number, $t[2]=index of object in stream
522 $ref2 = "$objNum 0 R";
523 $$xref{$ref2} = "I$t[2] $t[1] 0 R";
524 } elsif ($t[0] == 0) {
525 # free object:
526 # $t[1]=next free object in linked list, $t[2]=generation number
527 $$xref{$ref2} = 0;
528 } else {
529 # treat as a null object
530 $$xref{$ref2} = undef;
531 }
532 $$xref{$objNum} = $t[1]; # remember offsets by object number too
533 return $$xref{$ref} if $ref eq $ref2;
534 return 0; # object is free or was reused
535 }
536 $tot += $count;
537 }
538 }
539 return undef;
540}
541
542#------------------------------------------------------------------------------
543# Locate a regular object in the XRef tables (does not include compressed objects)
544# Inputs: 0) XRef reference, 1) object reference string (or free object number)
545# Returns: offset to object in file, 0 if object is free,
546# or undef on error or if object was compressed
547sub LocateObject($$)
548{
549 my ($xref, $ref) = @_;
550 my $offset = LocateAnyObject($xref, $ref);
551 return undef if $offset and $offset =~ /^I/;
552 return $offset;
553}
554
555#------------------------------------------------------------------------------
556# Fetch indirect object from file (from inside a stream if required)
557# Inputs: 0) ExifTool object reference, 1) object reference string,
558# 2) xref lookup, 3) object name (for warning messages)
559# Returns: object data or undefined on error
560# Notes: sets $lastFetched to the object reference, or undef if the object
561# was extracted from an encrypted stream
562sub FetchObject($$$$)
563{
564 my ($exifTool, $ref, $xref, $tag) = @_;
565 $lastFetched = $ref; # save this for decoding if necessary
566 my $offset = LocateAnyObject($xref, $ref);
567 $lastOffset = $offset;
568 unless ($offset) {
569 $exifTool->Warn("Bad $tag reference") unless defined $offset;
570 return undef;
571 }
572 my ($data, $obj);
573 if ($offset =~ s/^I(\d+) //) {
574 my $index = $1; # object index in stream
575 my ($objNum) = split ' ', $ref; # save original object number
576 $ref = $offset; # now a reference to the containing stream object
577 $obj = $streamObjs{$ref};
578 unless ($obj) {
579 # don't try to load the same object stream twice
580 return undef if defined $obj;
581 $streamObjs{$ref} = '';
582 # load the parent object stream
583 $obj = FetchObject($exifTool, $ref, $xref, $tag);
584 # make sure it contains everything we need
585 return undef unless defined $obj and ref($obj) eq 'HASH';
586 return undef unless $$obj{First} and $$obj{N};
587 return undef unless DecodeStream($exifTool, $obj);
588 # add a special '_table' entry to this dictionary which contains
589 # the list of object number/offset pairs from the stream header
590 my $num = $$obj{N} * 2;
591 my @table = split ' ', $$obj{_stream}, $num;
592 return undef unless @table == $num;
593 # remove everything before first object in stream
594 $$obj{_stream} = substr($$obj{_stream}, $$obj{First});
595 $table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
596 $$obj{_table} = \@table;
597 # save the object stream so we don't have to re-load it later
598 $streamObjs{$ref} = $obj;
599 }
600 # verify that we have the specified object
601 my $i = 2 * $index;
602 my $table = $$obj{_table};
603 unless ($index < $$obj{N} and $$table[$i] == $objNum) {
604 $exifTool->Warn("Bad index for stream object $tag");
605 return undef;
606 }
607 # extract the object at the specified index in the stream
608 # (offsets in table are in sequential order, so we can subract from
609 # the next offset to get the object length)
610 $offset = $$table[$i + 1];
611 my $len = ($$table[$i + 3] || length($$obj{_stream})) - $offset;
612 $data = substr($$obj{_stream}, $offset, $len);
613 # avoid re-decrypting data in already decrypted streams
614 undef $lastFetched if $cryptStream;
615 return ExtractObject($exifTool, \$data);
616 }
617 my $raf = $exifTool->{RAF};
618 $raf->Seek($offset, 0) or $exifTool->Warn("Bad $tag offset"), return undef;
619 # verify that we are reading the expected object
620 $raf->ReadLine($data) or $exifTool->Warn("Error reading $tag data"), return undef;
621 ($obj = $ref) =~ s/R/obj/;
622 unless ($data =~ s/^$obj//) {
623 $exifTool->Warn("$tag object ($obj) not found at $offset");
624 return undef;
625 }
626 return ExtractObject($exifTool, \$data, $raf, $xref);
627}
628
629#------------------------------------------------------------------------------
630# Convert PDF value to something readable
631# Inputs: 0) PDF object data
632# Returns: converted object
633sub ReadPDFValue($)
634{
635 my $str = shift;
636 # decode all strings in an array
637 if (ref $str eq 'ARRAY') {
638 # create new list to not alter the original data when rewriting
639 my ($val, @vals);
640 foreach $val (@$str) {
641 push @vals, ReadPDFValue($val);
642 }
643 return \@vals;
644 }
645 length $str or return $str;
646 my $delim = substr($str, 0, 1);
647 if ($delim eq '(') { # literal string
648 $str = $1 if $str =~ /.*?\((.*)\)/s; # remove brackets
649 # decode escape sequences in literal strings
650 while ($str =~ /\\(.)/sg) {
651 my $n = pos($str) - 2;
652 my $c = $1;
653 my $r;
654 if ($c =~ /[0-7]/) {
655 # get up to 2 more octal digits
656 $c .= $1 if $str =~ /\G([0-7]{1,2})/g;
657 # convert octal escape code
658 $r = chr(oct($c) & 0xff);
659 } elsif ($c eq "\x0d") {
660 # the string is continued if the line ends with '\'
661 # (also remove "\x0d\x0a")
662 $c .= $1 if $str =~ /\G(\x0a)/g;
663 $r = '';
664 } elsif ($c eq "\x0a") {
665 $r = '';
666 } else {
667 # convert escaped characters
668 ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
669 }
670 substr($str, $n, length($c)+1) = $r;
671 # continue search after this character
672 pos($str) = $n + length($r);
673 }
674 Crypt(\$str, $lastFetched) if $cryptString;
675 } elsif ($delim eq '<') { # hex string
676 # decode hex data
677 $str =~ tr/0-9A-Fa-f//dc;
678 $str .= '0' if length($str) & 0x01; # (by the spec)
679 $str = pack('H*', $str);
680 Crypt(\$str, $lastFetched) if $cryptString;
681 } elsif ($delim eq '/') { # name
682 $str = substr($str, 1);
683 # convert escape codes (PDF 1.2 or later)
684 $str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
685 }
686 return $str;
687}
688
689#------------------------------------------------------------------------------
690# Extract PDF object from combination of buffered data and file
691# Inputs: 0) ExifTool object reference, 1) data reference,
692# 2) optional raf reference, 3) optional xref table
693# Returns: converted PDF object or undef on error
694# a) dictionary object --> hash reference
695# b) array object --> array reference
696# c) indirect reference --> scalar reference
697# d) string, name, integer, boolean, null --> scalar value
698# - updates $$dataPt on return to contain unused data
699# - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
700# the stream data and a list of the tags (not including '_stream' and '_tags')
701# in their original order
702sub ExtractObject($$;$$)
703{
704 my ($exifTool, $dataPt, $raf, $xref) = @_;
705 my (@tags, $data, $objData);
706 my $dict = { };
707 my $delim;
708
709 for (;;) {
710 if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/s) {
711 $delim = $1;
712 $$dataPt =~ s/^\s+//; # remove leading white space
713 $objData = ReadToNested($dataPt, $raf);
714 return undef unless defined $objData;
715 last;
716 } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}s) {
717#
718# extract boolean, numerical, string, name, null object or indirect reference
719#
720 $objData = $1;
721 # look for an indirect reference
722 if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
723 $objData .= "$1 R";
724 $objData = \$objData; # return scalar reference
725 }
726 return $objData; # return simple scalar or scalar reference
727 }
728 $raf and $raf->ReadLine($data) or return undef;
729 $$dataPt .= $data;
730 }
731#
732# return literal string or hex string without parsing
733#
734 if ($delim eq '(' or $delim eq '<') {
735 return $objData;
736#
737# extract array
738#
739 } elsif ($delim eq '[') {
740 $objData =~ /.*?\[(.*)\]/s or return undef;
741 my $data = $1; # brackets removed
742 my @list;
743 for (;;) {
744 last unless $data =~ m{\s*(\S[^[(/<>\s]*)}sg;
745 my $val = $1;
746 if ($val =~ /^(<{1,2}|\[|\()/) {
747 my $pos = pos($data) - length($val);
748 # nested dict, array, literal string or hex string
749 my $buff = substr($data, $pos);
750 $val = ReadToNested(\$buff);
751 last unless defined $val;
752 pos($data) = $pos + length($val);
753 $val = ExtractObject($exifTool, \$val);
754 } elsif ($val =~ /^\d/) {
755 my $pos = pos($data);
756 if ($data =~ /\G\s+(\d+)\s+R/g) {
757 $val = \ "$val $1 R"; # make a reference
758 } else {
759 pos($data) = $pos;
760 }
761 }
762 push @list, $val;
763 }
764 return \@list;
765 }
766#
767# extract dictionary
768#
769 # Note: entries are not necessarily separated by whitespace (doh!)
770 # ie) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
771 # Also, they may be separated by a comment (ie. "/Tag%comment\nValue"),
772 # but comments have already been removed
773 while ($objData =~ m{(\s*)/([^/[\]()<>{}\s]+)\s*(\S[^[(/<>\s]*)}sg) {
774 my $tag = $2;
775 my $val = $3;
776 if ($val =~ /^(<{1,2}|\[|\()/) {
777 # nested dict, array, literal string or hex string
778 $objData = substr($objData, pos($objData)-length($val));
779 $val = ReadToNested(\$objData, $raf);
780 last unless defined $val;
781 $val = ExtractObject($exifTool, \$val);
782 pos($objData) = 0;
783 } elsif ($val =~ /^\d/) {
784 my $pos = pos($objData);
785 if ($objData =~ /\G\s+(\d+)\s+R/sg) {
786 $val = \ "$val $1 R"; # make a reference
787 } else {
788 pos($objData) = $pos;
789 }
790 }
791 if ($$dict{$tag}) {
792 # duplicate dictionary entries are not allowed
793 $exifTool->Warn('Duplicate $tag entry in dictionary (ignored)');
794 } else {
795 # save the entry
796 push @tags, $tag;
797 $$dict{$tag} = $val;
798 }
799 }
800 return undef unless @tags;
801 $$dict{_tags} = \@tags;
802 return $dict unless $raf; # direct objects can not have streams
803#
804# extract the stream object
805#
806 # dictionary must specify stream Length
807 my $length = $$dict{Length} or return $dict;
808 if (ref $length) {
809 $length = $$length;
810 my $oldpos = $raf->Tell();
811 # get the location of the object specifying the length
812 # (compressed objects are not allowed)
813 my $offset = LocateObject($xref, $length) or return $dict;
814 $offset or $exifTool->Warn('Bad Length object'), return $dict;
815 $raf->Seek($offset, 0) or $exifTool->Warn('Bad Length offset'), return $dict;
816 # verify that we are reading the expected object
817 $raf->ReadLine($data) or $exifTool->Warn('Error reading Length data'), return $dict;
818 $length =~ s/R/obj/;
819 unless ($data =~ /^$length/) {
820 $exifTool->Warn("Length object ($length) not found at $offset");
821 return $dict;
822 }
823 $raf->ReadLine($data) or $exifTool->Warn('Error reading stream Length'), return $dict;
824 $data =~ /(\d+)/ or $exifTool->Warn('Stream length not found'), return $dict;
825 $length = $1;
826 $raf->Seek($oldpos, 0); # restore position to start of stream
827 }
828 # extract the trailing stream data
829 for (;;) {
830 # find the stream token
831 if ($$dataPt =~ /(\S+)/) {
832 last unless $1 eq 'stream';
833 # read an extra line because it may contain our \x0a
834 $$dataPt .= $data if $raf->ReadLine($data);
835 # remove our stream header
836 $$dataPt =~ s/^\s*stream(\x0a|\x0d\x0a)//s;
837 my $more = $length - length($$dataPt);
838 if ($more > 0) {
839 unless ($raf->Read($data, $more) == $more) {
840 $exifTool->Warn('Error reading stream data');
841 $$dataPt = '';
842 return $dict;
843 }
844 $$dict{_stream} = $$dataPt . $data;
845 $$dataPt = '';
846 } elsif ($more < 0) {
847 $$dict{_stream} = substr($$dataPt, 0, $length);
848 $$dataPt = substr($$dataPt, $length);
849 } else {
850 $$dict{_stream} = $$dataPt;
851 $$dataPt = '';
852 }
853 last;
854 }
855 $raf->ReadLine($data) or last;
856 $$dataPt .= $data;
857 }
858 return $dict;
859}
860
861#------------------------------------------------------------------------------
862# Read to nested delimiter
863# Inputs: 0) data reference, 1) optional raf reference
864# Returns: data up to and including matching delimiter (or undef on error)
865# - updates data reference with trailing data
866# - unescapes characters in literal strings
867my %closingDelim = ( # lookup for matching delimiter
868 '(' => ')',
869 '[' => ']',
870 '<' => '>',
871 '<<' => '>>',
872);
873sub ReadToNested($;$)
874{
875 my ($dataPt, $raf) = @_;
876 my @delim = (''); # closing delimiter list, most deeply nested first
877 pos($$dataPt) = 0; # begin at start of data
878 for (;;) {
879 unless ($$dataPt =~ /(\\*)(\(|\)|<{1,2}|>{1,2}|\[|\]|%)/g) {
880 # must read some more data
881 my $buff;
882 last unless $raf and $raf->ReadLine($buff);
883 $$dataPt .= $buff;
884 pos($$dataPt) = length($$dataPt) - length($buff);
885 next;
886 }
887 # are we in a literal string?
888 if ($delim[0] eq ')') {
889 # ignore escaped delimiters (preceeded by odd number of \'s)
890 next if length($1) & 0x01;
891 # ignore all delimiters but unescaped braces
892 next unless $2 eq '(' or $2 eq ')';
893 } elsif ($2 eq '%') {
894 # ignore the comment
895 my $pos = pos($$dataPt) - 1;
896 # remove everything from '%' up to but not including newline
897 $$dataPt =~ /.*/g;
898 my $end = pos($$dataPt);
899 $$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
900 pos($$dataPt) = $pos;
901 next;
902 }
903 if ($closingDelim{$2}) {
904 # push the corresponding closing delimiter
905 unshift @delim, $closingDelim{$2};
906 next;
907 }
908 unless ($2 eq $delim[0]) {
909 # handle the case where we find a ">>>" and interpret it
910 # as ">> >" instead of "> >>"
911 next unless $2 eq '>>' and $delim[0] eq '>';
912 pos($$dataPt) = pos($$dataPt) - 1;
913 }
914 shift @delim; # remove from nesting list
915 next if $delim[0]; # keep going if we have more nested delimiters
916 my $pos = pos($$dataPt);
917 my $buff = substr($$dataPt, 0, $pos);
918 $$dataPt = substr($$dataPt, $pos);
919 return $buff; # success!
920 }
921 return undef; # didn't find matching delimiter
922}
923
924#------------------------------------------------------------------------------
925# Decode filtered stream
926# Inputs: 0) ExifTool object reference, 1) dictionary reference
927# Returns: true if stream has been decoded OK
928sub DecodeStream($$)
929{
930 my ($exifTool, $dict) = @_;
931
932 return 0 unless $$dict{_stream}; # no stream to decode
933
934 # get list of filters
935 my (@filters, @decodeParms, $filter);
936 if (ref $$dict{Filter} eq 'ARRAY') {
937 @filters = @{$$dict{Filter}};
938 } elsif (defined $$dict{Filter}) {
939 @filters = ($$dict{Filter});
940 }
941 # apply decryption first if required (and if the default encryption
942 # has not been overridden by a Crypt filter. Note: the Crypt filter
943 # must be first in the Filter array: ref 3, page 38)
944 unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
945 CryptStream($dict, $lastFetched);
946 }
947 return 1 unless $$dict{Filter}; # Filter entry is mandatory
948 return 0 if defined $$dict{_filtered}; # avoid double-filtering
949 $$dict{_filtered} = 1; # set flag to prevent double-filtering
950
951 # get array of DecodeParms dictionaries
952 if (ref $$dict{DecodeParms} eq 'ARRAY') {
953 @decodeParms = @{$$dict{DecodeParms}};
954 } else {
955 @decodeParms = ($$dict{DecodeParms});
956 }
957 foreach $filter (@filters) {
958 my $decodeParms = shift @decodeParms;
959
960 if ($filter eq '/FlateDecode') {
961 if (eval 'require Compress::Zlib') {
962 my $inflate = Compress::Zlib::inflateInit();
963 my ($buff, $stat);
964 $inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
965 if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
966 $$dict{_stream} = $buff;
967 } else {
968 $exifTool->Warn('Error inflating stream');
969 return 0;
970 }
971 } else {
972 $exifTool->WarnOnce('Install Compress::Zlib to process filtered streams');
973 return 0;
974 }
975 # apply anti-predictor if necessary
976 next unless ref $decodeParms eq 'HASH';
977 my $pre = $$decodeParms{Predictor};
978 next unless $pre and $pre != 1;
979 if ($pre != 12) {
980 # currently only support 'up' prediction
981 $exifTool->WarnOnce("FlateDecode Predictor $pre not currently supported");
982 return 0;
983 }
984 my $cols = $$decodeParms{Columns};
985 unless ($cols) {
986 # currently only support 'up' prediction
987 $exifTool->WarnOnce('No Columns for decoding stream');
988 return 0;
989 }
990 my @bytes = unpack('C*', $$dict{_stream});
991 my @pre = (0) x $cols; # initialize predictor array
992 my $buff = '';
993 while (@bytes > $cols) {
994 unless (($_ = shift @bytes) == 2) {
995 $exifTool->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG)
996 return 0;
997 }
998 foreach (@pre) {
999 $_ = ($_ + shift(@bytes)) & 0xff;
1000 }
1001 $buff .= pack('C*', @pre);
1002 }
1003 $$dict{_stream} = $buff;
1004
1005 } elsif ($filter eq '/Crypt') {
1006
1007 # (we shouldn't have to check the _decrypted flag since we
1008 # already checked the _filtered flag, but what the heck...)
1009 next if defined $$dict{_decrypted};
1010 # assume Identity filter (the default) if DecodeParms are missing
1011 next unless ref $decodeParms eq 'HASH';
1012 my $name = $$decodeParms{Name};
1013 next unless defined $name or $name eq 'Identity';
1014 if ($name ne 'StdCF') {
1015 $exifTool->WarnOnce("Unsupported Crypt Filter $name");
1016 return 0;
1017 }
1018 unless ($cryptInfo) {
1019 $exifTool->WarnOnce('Missing Encrypt StdCF entry');
1020 return 0;
1021 }
1022 # decrypt the stream manually because we want to:
1023 # 1) ignore $cryptStream (StmF) setting
1024 # 2) ignore EncryptMetadata setting (I can't find mention of how to
1025 # reconcile this in the spec., but this would make sense)
1026 # 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1027 # 4) set _decrypted flag so we will recrypt according to StmF when
1028 # writing (since we don't yet write Filter'd streams)
1029 Crypt(\$$dict{_stream}, 'none');
1030 $$dict{_decrypted} = ($cryptStream ? 1 : 0);
1031
1032 } elsif ($filter ne '/Identity') {
1033
1034 $exifTool->WarnOnce("Unsupported Filter $filter");
1035 return 0;
1036 }
1037 }
1038 return 1;
1039}
1040
1041#------------------------------------------------------------------------------
1042# Initialize state for RC4 en/decryption (ref 2)
1043# Inputs: 0) RC4 key string
1044# Returns: RC4 key hash reference
1045sub RC4Init($)
1046{
1047 my @key = unpack('C*', shift);
1048 my @state = (0 .. 255);
1049 my ($i, $j) = (0, 0);
1050 while ($i < 256) {
1051 my $st = $state[$i];
1052 $j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1053 $state[$i++] = $state[$j];
1054 $state[$j] = $st;
1055 }
1056 return { State => \@state, XY => [ 0, 0 ] };
1057}
1058
1059#------------------------------------------------------------------------------
1060# Apply RC4 en/decryption (ref 2)
1061# Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1062# - can call this method directly with a key string, or with with the key
1063# reference returned by RC4Init
1064# - RC4 is a symmetric algorithm, so encryption is the same as decryption
1065sub RC4Crypt($$)
1066{
1067 my ($dataPt, $key) = @_;
1068 $key = RC4Init($key) unless ref $key eq 'HASH';
1069 my $state = $$key{State};
1070 my ($x, $y) = @{$$key{XY}};
1071
1072 my @data = unpack('C*', $$dataPt);
1073 foreach (@data) {
1074 $x = ($x + 1) & 0xff;
1075 my $stx = $$state[$x];
1076 $y = ($stx + $y) & 0xff;
1077 my $sty = $$state[$x] = $$state[$y];
1078 $$state[$y] = $stx;
1079 $_ ^= $$state[($stx + $sty) & 0xff];
1080 }
1081 $$key{XY} = [ $x, $y ];
1082 $$dataPt = pack('C*', @data);
1083}
1084
1085#------------------------------------------------------------------------------
1086# Initialize decryption
1087# Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1088# 2) ID from file trailer dictionary
1089# Returns: error string or undef on success (and sets $cryptInfo)
1090sub DecryptInit($$$)
1091{
1092 local $_;
1093 my ($exifTool, $encrypt, $id) = @_;
1094
1095 undef $cryptInfo;
1096 unless ($encrypt and ref $encrypt eq 'HASH') {
1097 return 'Error loading Encrypt object';
1098 }
1099 my $filt = $$encrypt{Filter};
1100 unless ($filt and $filt =~ s/^\///) {
1101 return 'Encrypt dictionary has no Filter!';
1102 }
1103 # extract some interesting tags
1104 my $ver = $$encrypt{V} || 0;
1105 my $rev = $$encrypt{R} || 0;
1106 my $enc = "$filt V$ver";
1107 $enc .= ".$rev" if $filt eq 'Standard';
1108 $enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1109 $enc .= ' (' . ($$encrypt{Length} || 40) . '-bit)' if $filt eq 'Standard';
1110 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1111 $exifTool->HandleTag($tagTablePtr, 'Filter', $enc);
1112 if ($filt ne 'Standard') {
1113 return "Encryption filter $filt not currently supported";
1114 } elsif (not defined $$encrypt{R}) {
1115 return 'Standard security handler missing revision';
1116 }
1117 unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
1118 return 'Incomplete Encrypt specification';
1119 }
1120 $exifTool->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1121
1122 my %parm; # optional parameters extracted from Encrypt dictionary
1123
1124 if ($ver == 1 or $ver == 2) {
1125 $cryptString = $cryptStream = 1;
1126 } elsif ($ver == 4 or $ver == 5) {
1127 # initialize our $cryptString and $cryptStream flags
1128 foreach ('StrF', 'StmF') {
1129 my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1130 $$flagPt = $$encrypt{$_};
1131 undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1132 return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1133 }
1134 if ($cryptString or $cryptStream) {
1135 return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1136 ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
1137 my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1138 unless ($cryptMeth =~ /^\/(V2|AESV2|AESV3)$/) {
1139 return "Unsupported encryption method $cryptMeth";
1140 }
1141 # set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1142 $$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2|AESV3)$/;
1143 }
1144 if ($ver == 5) {
1145 # validate OE and UE entries
1146 foreach ('OE', 'UE') {
1147 return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1148 $parm{$_} = ReadPDFValue($$encrypt{$_});
1149 return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1150 }
1151 require Image::ExifTool::AES; # will need this later
1152 }
1153 } else {
1154 return "Encryption version $ver not currently supported";
1155 }
1156 $id or return "Can't decrypt (no document ID)";
1157
1158 # make sure we have the necessary libraries available
1159 if ($ver < 5) {
1160 unless (eval 'require Digest::MD5') {
1161 return "Install Digest::MD5 to process encrypted PDF";
1162 }
1163 } else {
1164 unless (eval 'require Digest::SHA') {
1165 return "Install Digest::SHA to process AES-256 encrypted PDF";
1166 }
1167 }
1168
1169 # calculate file-level en/decryption key
1170 my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1171 "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1172 my $o = ReadPDFValue($$encrypt{O});
1173 my $u = ReadPDFValue($$encrypt{U});
1174
1175 # set flag indicating whether metadata is encrypted
1176 # (in version 4 and higher, metadata streams may not be encrypted)
1177 if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
1178 $$encrypt{_meta} = 1;
1179 }
1180 # try no password first, then try provided password if available
1181 my ($try, $key);
1182 for ($try=0; ; ++$try) {
1183 my $password;
1184 if ($try == 0) {
1185 $password = '';
1186 } elsif ($try == 1) {
1187 $password = $exifTool->Options('Password');
1188 return 'Document is password protected (use Password option)' unless defined $password;
1189 # make sure there is no UTF-8 flag on the password
1190 if ($] >= 5.006 and (eval 'require Encode; Encode::is_utf8($password)' or $@)) {
1191 # repack by hand if Encode isn't available
1192 $password = $@ ? pack('C*',unpack('U0C*',$password)) : Encode::encode('utf8',$password);
1193 }
1194 } else {
1195 return 'Incorrect password';
1196 }
1197 if ($ver < 5) {
1198 if (length $password) {
1199 # password must be encoding in PDFDocEncoding (ref iso32000)
1200 $password = $exifTool->Encode($password, 'PDFDoc');
1201 # truncate or pad the password to exactly 32 bytes
1202 if (length($password) > 32) {
1203 $password = substr($password, 0, 32);
1204 } elsif (length($password) < 32) {
1205 $password .= substr($pad, 0, 32-length($password));
1206 }
1207 } else {
1208 $password = $pad;
1209 }
1210 $key = $password . $o . pack('V', $$encrypt{P}) . $id;
1211 my $rep = 1;
1212 if ($rev == 3 or $rev == 4) {
1213 # must add this if metadata not encrypted
1214 $key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1215 $rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1216 }
1217 my ($len, $i, $dat);
1218 if ($ver == 1) {
1219 $len = 5;
1220 } else {
1221 $len = $$encrypt{Length} || 40;
1222 $len >= 40 or return 'Bad Encrypt Length';
1223 $len = int($len / 8);
1224 }
1225 for ($i=0; $i<$rep; ++$i) {
1226 $key = substr(Digest::MD5::md5($key), 0, $len);
1227 }
1228 # decrypt U to see if a user password is required
1229 if ($rev >= 3) {
1230 $dat = Digest::MD5::md5($pad . $id);
1231 RC4Crypt(\$dat, $key);
1232 for ($i=1; $i<=19; ++$i) {
1233 my @key = unpack('C*', $key);
1234 foreach (@key) { $_ ^= $i; }
1235 RC4Crypt(\$dat, pack('C*', @key));
1236 }
1237 $dat .= substr($u, 16);
1238 } else {
1239 $dat = $pad;
1240 RC4Crypt(\$dat, $key);
1241 }
1242 last if $dat eq $u; # all done if this was the correct key
1243 } else {
1244 return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1245 if (length $password) {
1246 # Note: this should be good for passwords containing reasonable characters,
1247 # but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1248 # of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1249 $password = $exifTool->Encode($password, 'UTF8');
1250 $password = substr($password, 0, 127) if length($password) > 127;
1251 }
1252 # test for the owner password
1253 my $sha = Digest::SHA::sha256($password, substr($o,32,8), substr($u,0,48));
1254 if ($sha eq substr($o, 0, 32)) {
1255 $key = Digest::SHA::sha256($password, substr($o,40,8), substr($u,0,48));
1256 my $dat = ("\0" x 16) . $parm{OE};
1257 # decrypt with no padding
1258 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1259 return $err if $err;
1260 $key = $dat; # use this as the file decryption key
1261 last;
1262 }
1263 # test for the user password
1264 $sha = Digest::SHA::sha256($password, substr($u,32,8));
1265 if ($sha eq substr($u, 0, 32)) {
1266 $key = Digest::SHA::sha256($password, substr($u,40,8));
1267 my $dat = ("\0" x 16) . $parm{UE};
1268 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1269 return $err if $err;
1270 $key = $dat; # use this as the file decryption key
1271 last;
1272 }
1273 }
1274 }
1275 $$encrypt{_key} = $key; # save the file-level encryption key
1276 $cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1277 return undef; # success!
1278}
1279
1280#------------------------------------------------------------------------------
1281# Decrypt/Encrypt data
1282# Inputs: 0) data ref
1283# 1) PDF object reference to use as crypt key extension (may be 'none' to
1284# avoid extending the encryption key, as for streams with Crypt Filter)
1285# 2) encrypt flag (false for decryption)
1286sub Crypt($$;$)
1287{
1288 return unless $cryptInfo;
1289 my ($dataPt, $keyExt, $encrypt) = @_;
1290 # do not decrypt if the key extension object is undefined
1291 # (this doubles as a flag to disable decryption/encryption)
1292 return unless defined $keyExt;
1293 my $key = $$cryptInfo{_key};
1294 # apply the necessary crypt key extension
1295 unless ($$cryptInfo{_aesv3}) {
1296 unless ($keyExt eq 'none') {
1297 # extend crypt key using object and generation number
1298 unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1299 $$cryptInfo{_error} = 'Invalid object reference for encryption';
1300 return;
1301 }
1302 $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1303 }
1304 # add AES-128 salt if necessary (this little gem is conveniently
1305 # omitted from the Adobe PDF 1.6 documentation, causing me to
1306 # waste 12 hours trying to figure out why this wasn't working --
1307 # it appears in ISO32000 though, so I should have been using that)
1308 $key .= 'sAlT' if $$cryptInfo{_aesv2};
1309 my $len = length($key);
1310 $key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1311 $key = substr($key, 0, $len) if $len < 16; # trim if necessary
1312 }
1313 # perform the decryption/encryption
1314 if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1315 require Image::ExifTool::AES;
1316 my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1317 $err and $$cryptInfo{_error} = $err;
1318 } else {
1319 RC4Crypt($dataPt, $key);
1320 }
1321}
1322
1323#------------------------------------------------------------------------------
1324# Decrypt/Encrypt stream data
1325# Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1326sub CryptStream($$)
1327{
1328 return unless $cryptStream;
1329 my ($dict, $keyExt) = @_;
1330 my $type = $$dict{Type} || '';
1331 # XRef streams are not encrypted (ref 3, page 50),
1332 # and Metadata may or may not be encrypted
1333 if ($cryptInfo and $type ne '/XRef' and
1334 ($$cryptInfo{_meta} or $type ne '/Metadata'))
1335 {
1336 Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1337 # toggle _decrypted flag
1338 $$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1339 } else {
1340 $$dict{_decrypted} = 0; # stream should never be encrypted
1341 }
1342}
1343
1344#------------------------------------------------------------------------------
1345# Generate a new PDF tag (based on its ID) and add it to a tag table
1346# Inputs: 0) tag table ref, 1) tag ID
1347# Returns: tag info ref
1348sub NewPDFTag($$)
1349{
1350 my ($tagTablePtr, $tag) = @_;
1351 my $name = $tag;
1352 # translate URL-like escape sequences
1353 $name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
1354 $name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1355 $name =~ s/(^|_)([a-z])/\U$2/g; # start words with upper case
1356 my $tagInfo = { Name => $name };
1357 Image::ExifTool::AddTagToTable($tagTablePtr, $tag, $tagInfo);
1358 return $tagInfo;
1359}
1360
1361#------------------------------------------------------------------------------
1362# Process PDF dictionary extract tag values
1363# Inputs: 0) ExifTool object reference, 1) tag table reference
1364# 2) dictionary reference, 3) cross-reference table reference,
1365# 4) nesting depth, 5) dictionary capture type
1366sub ProcessDict($$$$;$$)
1367{
1368 my ($exifTool, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1369 my $verbose = $exifTool->Options('Verbose');
1370 my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1371 my $embedded = (defined $unknown and not $unknown and $exifTool->Options('ExtractEmbedded'));
1372 my @tags = @{$$dict{_tags}};
1373 my ($next, %join);
1374 my $index = 0;
1375
1376 $nesting = ($nesting || 0) + 1;
1377 if ($nesting > 50) {
1378 $exifTool->WarnOnce('Nesting too deep (directory ignored)');
1379 return;
1380 }
1381 # save entire dictionary for rewriting if specified
1382 if ($$exifTool{PDF_CAPTURE} and $$tagTablePtr{VARS} and
1383 $tagTablePtr->{VARS}->{CAPTURE})
1384 {
1385 my $name;
1386 foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
1387 next if $exifTool->{PDF_CAPTURE}->{$name};
1388 # make sure we load the right type if indicated
1389 next if $type and $type ne $name;
1390 $exifTool->{PDF_CAPTURE}->{$name} = $dict;
1391 last;
1392 }
1393 }
1394#
1395# extract information from all tags in the dictionary
1396#
1397 for (;;) {
1398 my ($tag, $tagInfo);
1399 if (@tags) {
1400 $tag = shift @tags;
1401 } elsif (defined $next and not $next) {
1402 $tag = 'Next';
1403 $next = 1;
1404 } else {
1405 last;
1406 }
1407 my $val = $$dict{$tag};
1408 if ($$tagTablePtr{$tag}) {
1409 $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
1410 undef $tagInfo if $$tagInfo{NoProcess};
1411 } elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1412 $$tagTablePtr{$1} and ref $val eq 'SCALAR' and not $fetched{$$val})
1413 {
1414 my ($name, $num) = ($1, $2);
1415 $join{$name} = [] unless $join{$name};
1416 $fetched{$$val} = 1;
1417 my $obj = FetchObject($exifTool, $$val, $xref, $tag);
1418 next unless ref $obj eq 'HASH' and $$obj{_stream};
1419 # save all the stream data to join later
1420 $join{$name}->[$num] = $$obj{_stream};
1421 }
1422 if ($verbose) {
1423 my ($val2, $extra);
1424 if (ref $val eq 'SCALAR') {
1425 $extra = ", indirect object ($$val)";
1426 if ($fetched{$$val}) {
1427 $val2 = "ref($$val)";
1428 } elsif ($tag eq 'Next' and not $next) {
1429 # handle 'Next' links after all others
1430 $next = 0;
1431 next;
1432 } else {
1433 $fetched{$$val} = 1;
1434 $val = FetchObject($exifTool, $$val, $xref, $tag);
1435 unless (defined $val) {
1436 my $str;
1437 if (defined $lastOffset) {
1438 $val2 = '<free>';
1439 $str = 'Object was freed';
1440 } else {
1441 $val2 = '<err>';
1442 $str = 'Error reading object';
1443 }
1444 $exifTool->VPrint(0, "$$exifTool{INDENT}${str}:\n");
1445 }
1446 }
1447 } elsif (ref $val eq 'HASH') {
1448 $extra = ', direct dictionary';
1449 } elsif (ref $val eq 'ARRAY') {
1450 $extra = ', direct array of ' . scalar(@$val) . ' objects';
1451 } else {
1452 $extra = ', direct object';
1453 }
1454 my $isSubdir;
1455 if (ref $val eq 'HASH') {
1456 $isSubdir = 1;
1457 } elsif (ref $val eq 'ARRAY') {
1458 # recurse into objects in arrays only if they are lists of
1459 # dictionaries or indirect objects which could be dictionaries
1460 $isSubdir = 1 if @$val;
1461 foreach (@$val) {
1462 next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1463 undef $isSubdir;
1464 last;
1465 }
1466 }
1467 if ($isSubdir) {
1468 # create bogus subdirectory to recurse into this dict
1469 $tagInfo or $tagInfo = {
1470 Name => $tag,
1471 SubDirectory => {
1472 TagTable => 'Image::ExifTool::PDF::Unknown',
1473 },
1474 };
1475 } else {
1476 if (ref $val eq 'ARRAY') {
1477 my @list = @$val;
1478 foreach (@list) {
1479 $_ = "ref($$_)" if ref $_ eq 'SCALAR';
1480 }
1481 $val2 = '[' . join(',',@list) . ']';
1482 }
1483 # generate tag info if we will use it later
1484 if (not $tagInfo and defined $val and $unknown) {
1485 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1486 }
1487 }
1488 $exifTool->VerboseInfo($tag, $tagInfo,
1489 Value => $val2 || $val,
1490 Extra => $extra,
1491 Index => $index++,
1492 );
1493 next unless defined $val;
1494 }
1495 unless ($tagInfo) {
1496 # add any tag found in Info directory to table
1497 next unless $unknown;
1498 $tagInfo = NewPDFTag($tagTablePtr, $tag);
1499 }
1500 unless ($$tagInfo{SubDirectory}) {
1501 # fetch object if necessary
1502 # (OS X 10.6 writes indirect objects in the Info dictionary!)
1503 if (ref $val eq 'SCALAR') {
1504 my $prevFetched = $lastFetched;
1505 # (note: fetching the same object multiple times is OK here)
1506 $val = FetchObject($exifTool, $$val, $xref, $tag);
1507 next unless defined $val;
1508 $val = ReadPDFValue($val);
1509 # set flag to re-encrypt if necessary if rewritten
1510 $$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
1511 $lastFetched = $prevFetched; # restore last fetched object reference
1512 } else {
1513 $val = ReadPDFValue($val);
1514 }
1515 my $format = $$tagInfo{Format} || $$tagInfo{Writable} || 'string';
1516 $val = ConvertPDFDate($val) if $format eq 'date';
1517 # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
1518 # unless this is binary data (hex-encoded strings would not have been converted)
1519 if (ref $val) {
1520 next if ref $val ne 'ARRAY';
1521 my $v;
1522 foreach $v (@$val) {
1523 $exifTool->FoundTag($tagInfo, $v);
1524 }
1525 } else {
1526 if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
1527 # text string is already in Unicode if it starts with "\xfe\xff",
1528 # otherwise we must first convert from PDFDocEncoding
1529 $val = $exifTool->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
1530 }
1531 if ($$tagInfo{List}) {
1532 # separate tokens in comma or whitespace delimited lists
1533 my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
1534 foreach $val (@values) {
1535 $exifTool->FoundTag($tagInfo, $val);
1536 }
1537 } else {
1538 # a simple tag value
1539 $exifTool->FoundTag($tagInfo, $val);
1540 }
1541 }
1542 next;
1543 }
1544 # process the subdirectory
1545 my @subDicts;
1546 if (ref $val eq 'ARRAY') {
1547 @subDicts = @{$val};
1548 } else {
1549 @subDicts = ( $val );
1550 }
1551 # loop through all values of this tag
1552 for (;;) {
1553 my $subDict = shift @subDicts or last;
1554 # save last fetched object in case we fetch another one here
1555 my $prevFetched = $lastFetched;
1556 if (ref $subDict eq 'SCALAR') {
1557 # only fetch once (other copies are obsolete)
1558 next if $fetched{$$subDict};
1559 # load dictionary via an indirect reference
1560 $fetched{$$subDict} = 1;
1561 my $obj = FetchObject($exifTool, $$subDict, $xref, $tag);
1562 unless (defined $obj) {
1563 unless (defined $lastOffset) {
1564 $exifTool->Warn("Error reading $tag object ($$subDict)");
1565 }
1566 next;
1567 }
1568 $subDict = $obj;
1569 }
1570 if (ref $subDict eq 'ARRAY') {
1571 # convert array of key/value pairs to a hash
1572 next if @$subDict < 2;
1573 my %hash = ( _tags => [] );
1574 while (@$subDict >= 2) {
1575 my $key = shift @$subDict;
1576 $key =~ s/^\///;
1577 push @{$hash{_tags}}, $key;
1578 $hash{$key} = shift @$subDict;
1579 }
1580 $subDict = \%hash;
1581 } else {
1582 next unless ref $subDict eq 'HASH';
1583 }
1584 # set flag to re-crypt all strings when rewriting if the dictionary
1585 # came from an encrypted stream
1586 $$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
1587 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1588 if (not $verbose) {
1589 my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict;
1590 &$proc($exifTool, $subTablePtr, $subDict, $xref, $nesting);
1591 } elsif ($next) {
1592 # handle 'Next' links at this level to avoid deep recursion
1593 undef $next;
1594 $index = 0;
1595 $tagTablePtr = $subTablePtr;
1596 $dict = $subDict;
1597 @tags = @{$$subDict{_tags}};
1598 $exifTool->VerboseDir($tag, scalar(@tags));
1599 } else {
1600 my $oldIndent = $exifTool->{INDENT};
1601 my $oldDir = $exifTool->{DIR_NAME};
1602 $exifTool->{INDENT} .= '| ';
1603 $exifTool->{DIR_NAME} = $tag;
1604 $exifTool->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
1605 ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting);
1606 $exifTool->{INDENT} = $oldIndent;
1607 $exifTool->{DIR_NAME} = $oldDir;
1608 }
1609 $lastFetched = $prevFetched;
1610 }
1611 }
1612#
1613# extract information from joined streams if necessary
1614#
1615
1616 if (%join) {
1617 my ($tag, $i);
1618 foreach $tag (sort keys %join) {
1619 my $list = $join{$tag};
1620 last unless defined $$list[1] and $$list[1] =~ /^%.*?([\x0d\x0a]*)/;
1621 my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
1622 for ($i=1; defined $$list[$i]; ++$i) {
1623 $buff .= $$list[$i];
1624 undef $$list[$i]; # free memory
1625 }
1626 $exifTool->HandleTag($tagTablePtr, $tag, $buff);
1627 }
1628 }
1629#
1630# extract information from stream object if it exists (ie. Metadata stream)
1631#
1632 return unless $$dict{_stream};
1633 my $tag = '_stream';
1634 # add Subtype (if it exists) to stream name and remove leading '/'
1635 ($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
1636 return unless $$tagTablePtr{$tag};
1637 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
1638 # decode stream if necessary
1639 DecodeStream($exifTool, $dict) or return;
1640 # extract information from stream
1641 my %dirInfo = (
1642 DataPt => \$$dict{_stream},
1643 DataLen => length $$dict{_stream},
1644 DirStart => 0,
1645 DirLen => length $$dict{_stream},
1646 Parent => 'PDF',
1647 );
1648 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1649 unless ($exifTool->ProcessDirectory(\%dirInfo, $subTablePtr)) {
1650 $exifTool->Warn("Error processing $$tagInfo{Name} information");
1651 }
1652}
1653
1654#------------------------------------------------------------------------------
1655# Extract information from PDF file
1656# Inputs: 0) ExifTool object reference, 1) dirInfo reference
1657# Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
1658sub ReadPDF($$)
1659{
1660 my ($exifTool, $dirInfo) = @_;
1661 my $raf = $$dirInfo{RAF};
1662 my $verbose = $exifTool->Options('Verbose');
1663 my ($buff, $encrypt, $id);
1664#
1665# validate PDF file
1666#
1667 # (linearization dictionary must be in the first 1024 bytes of the file)
1668 $raf->Read($buff, 1024) >= 8 or return 0;
1669 $buff =~ /^%PDF-(\d+\.\d+)/ or return 0;
1670 $pdfVer = $1;
1671 $exifTool->SetFileType(); # set the FileType tag
1672 $exifTool->Warn("May not be able to read a PDF version $pdfVer file") if $pdfVer >= 2.0;
1673 # store PDFVersion tag
1674 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
1675 $exifTool->HandleTag($tagTablePtr, 'Version', $pdfVer);
1676 $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
1677#
1678# check for a linearized PDF (only if reading)
1679#
1680 my $capture = $$exifTool{PDF_CAPTURE};
1681 unless ($capture) {
1682 my $lin = 'false';
1683 if ($buff =~ /<</g) {
1684 $buff = substr($buff, pos($buff) - 2);
1685 my $dict = ExtractObject($exifTool, \$buff);
1686 if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
1687 if (not $$exifTool{VALUE}{FileSize}) {
1688 undef $lin; # can't determine if it is linearized
1689 } elsif ($$dict{L} == $$exifTool{VALUE}{FileSize}) {
1690 $lin = 'true';
1691 }
1692 }
1693 }
1694 $exifTool->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
1695 }
1696#
1697# read the xref tables referenced from startxref at the end of the file
1698#
1699 my @xrefOffsets;
1700 $raf->Seek(0, 2) or return -2;
1701 # the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
1702 my $len = $raf->Tell();
1703 $len = 1024 if $len > 1024;
1704 $raf->Seek(-$len, 2) or return -2;
1705 $raf->Read($buff, $len) == $len or return -3;
1706 # find the last xref table in the file (may be multiple %%EOF marks)
1707 $buff =~ /.*startxref *(\x0d\x0a|\x0d|\x0a)\s*?(\d+)\s+%%EOF/s or return -4;
1708 local $/ = $1; # set input record separator
1709 push @xrefOffsets, $2, 'Main';
1710 my (%xref, @mainDicts, %loaded, $mainFree);
1711 # initialize variables to capture when rewriting
1712 if ($capture) {
1713 $capture->{startxref} = $2;
1714 $capture->{xref} = \%xref;
1715 $capture->{newline} = $/;
1716 $capture->{mainFree} = $mainFree = { };
1717 }
1718 while (@xrefOffsets) {
1719 my $offset = shift @xrefOffsets;
1720 my $type = shift @xrefOffsets;
1721 next if $loaded{$offset}; # avoid infinite recursion
1722 unless ($raf->Seek($offset, 0)) {
1723 %loaded or return -5;
1724 $exifTool->Warn('Bad offset for secondary xref table');
1725 next;
1726 }
1727 # Note: care must be taken because ReadLine may read more than we want if
1728 # the newline sequence for this table is different than the rest of the file
1729 unless ($raf->ReadLine($buff)) {
1730 %loaded or return -6;
1731 $exifTool->Warn('Bad offset for secondary xref table');
1732 next;
1733 }
1734 my $loadXRefStream;
1735 if ($buff =~ s/^xref\s+//s) {
1736 # load xref table
1737 for (;;) {
1738 # read another line if necessary (skipping blank lines)
1739 $raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
1740 last if $buff =~ s/^\s*trailer\s+//s;
1741 $buff =~ s/\s*(\d+)\s+(\d+)\s+//s or return -4;
1742 my ($start, $num) = ($1, $2);
1743 $raf->Seek(-length($buff), 1) or return -4;
1744 my $i;
1745 for ($i=0; $i<$num; ++$i) {
1746 $raf->Read($buff, 20) == 20 or return -6;
1747 $buff =~ /^\s*(\d{10}) (\d{5}) (f|n)/s or return -4;
1748 my $num = $start + $i;
1749 # save offset for newest copy of all objects
1750 # (or next object number for free objects)
1751 unless (defined $xref{$num}) {
1752 my ($offset, $gen) = (int($1), int($2));
1753 $xref{$num} = $offset;
1754 if ($3 eq 'f') {
1755 # save free objects in last xref table for rewriting
1756 $$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
1757 next;
1758 }
1759 # also save offset keyed by object reference string
1760 $xref{"$num $gen R"} = $offset;
1761 }
1762 }
1763 # (I have a sample from Adobe which has an empty xref table)
1764 # %xref or return -4; # xref table may not be empty
1765 $buff = '';
1766 }
1767 undef $mainFree; # only do this for the last xref table
1768 } elsif ($buff =~ s/^(\d+)\s+(\d+)\s+obj//s) {
1769 # this is a PDF-1.5 cross-reference stream dictionary
1770 $loadXRefStream = 1;
1771 } else {
1772 %loaded or return -4;
1773 $exifTool->Warn('Invalid secondary xref table');
1774 next;
1775 }
1776 my $mainDict = ExtractObject($exifTool, \$buff, $raf, \%xref);
1777 unless (ref $mainDict eq 'HASH') {
1778 %loaded or return -8;
1779 $exifTool->Warn('Error loading secondary dictionary');
1780 next;
1781 }
1782 if ($loadXRefStream) {
1783 # decode and save our XRef stream from PDF-1.5 file
1784 # (but parse it later as required to save time)
1785 # Note: this technique can potentially result in an old object
1786 # being used if the file was incrementally updated and an older
1787 # object from an xref table was replaced by a newer object in an
1788 # xref stream. But doing so isn't a good idea (if allowed at all)
1789 # because a PDF 1.4 consumer would also make this same mistake.
1790 if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
1791 @{$$mainDict{W}} > 2 and $$mainDict{Size} and
1792 DecodeStream($exifTool, $mainDict))
1793 {
1794 # create Index entry if it doesn't exist
1795 $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
1796 # create '_entry_size' entry for internal use
1797 my $w = $$mainDict{W};
1798 my $size = 0;
1799 foreach (@$w) { $size += $_; }
1800 $$mainDict{_entry_size} = $size;
1801 # save this stream dictionary to use later if required
1802 $xref{dicts} = [] unless $xref{dicts};
1803 push @{$xref{dicts}}, $mainDict;
1804 } else {
1805 %loaded or return -9;
1806 $exifTool->Warn('Invalid xref stream in secondary dictionary');
1807 }
1808 }
1809 $loaded{$offset} = 1;
1810 # load XRef stream in hybrid file if it exists
1811 push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
1812 $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
1813 if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
1814 $id = ReadPDFValue($mainDict->{ID}->[0]);
1815 }
1816 push @mainDicts, $mainDict, $type;
1817 # load previous xref table if it exists
1818 push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
1819 }
1820#
1821# extract encryption information if necessary
1822#
1823 if ($encrypt) {
1824 if (ref $encrypt eq 'SCALAR') {
1825 $encrypt = FetchObject($exifTool, $$encrypt, \%xref, 'Encrypt');
1826 }
1827 # generate Encryption tag information
1828 my $err = DecryptInit($exifTool, $encrypt, $id);
1829 if ($err) {
1830 $exifTool->Warn($err);
1831 $$capture{Error} = $err if $capture;
1832 return -1;
1833 }
1834 }
1835#
1836# extract the information beginning with each of the main dictionaries
1837#
1838 while (@mainDicts) {
1839 my $dict = shift @mainDicts;
1840 my $type = shift @mainDicts;
1841 if ($verbose) {
1842 my $n = scalar(@{$$dict{_tags}});
1843 $exifTool->VPrint(0, "PDF dictionary with $n entries:\n");
1844 }
1845 ProcessDict($exifTool, $tagTablePtr, $dict, \%xref, 0, $type);
1846 }
1847 # handle any decryption errors
1848 if ($encrypt) {
1849 my $err = $$encrypt{_error};
1850 if ($err) {
1851 $exifTool->Warn($err);
1852 $$capture{Error} = $err if $capture;
1853 return -1;
1854 }
1855 }
1856 return 1;
1857}
1858
1859#------------------------------------------------------------------------------
1860# ReadPDF() warning strings for each error return value
1861my %pdfWarning = (
1862 # -1 is reserved as error return value with no associated warning
1863 -2 => 'Error seeking in file',
1864 -3 => 'Error reading file',
1865 -4 => 'Invalid xref table',
1866 -5 => 'Invalid xref offset',
1867 -6 => 'Error reading xref table',
1868 -7 => 'Error reading trailer',
1869 -8 => 'Error reading main dictionary',
1870 -9 => 'Invalid xref stream in main dictionary',
1871);
1872
1873#------------------------------------------------------------------------------
1874# Extract information from PDF file
1875# Inputs: 0) ExifTool object reference, 1) dirInfo reference
1876# Returns: 1 if this was a valid PDF file
1877sub ProcessPDF($$)
1878{
1879 my ($exifTool, $dirInfo) = @_;
1880
1881 undef $cryptInfo; # (must not delete after returning so writer can use it)
1882 undef $cryptStream;
1883 undef $cryptString;
1884 my $result = ReadPDF($exifTool, $dirInfo);
1885 if ($result < 0) {
1886 $exifTool->Warn($pdfWarning{$result}) if $pdfWarning{$result};
1887 $result = 1;
1888 }
1889 # clean up and return
1890 undef %streamObjs;
1891 undef %fetched;
1892 return $result;
1893}
1894
18951; # end
1896
1897
1898__END__
1899
1900=head1 NAME
1901
1902Image::ExifTool::PDF - Read PDF meta information
1903
1904=head1 SYNOPSIS
1905
1906This module is loaded automatically by Image::ExifTool when required.
1907
1908=head1 DESCRIPTION
1909
1910This code reads meta information from PDF (Adobe Portable Document Format)
1911files. It supports object streams introduced in PDF-1.5 but only with a
1912limited set of Filter and Predictor algorithms, however all standard
1913encryption methods through PDF-1.7 extension level 3 are supported,
1914including AESV2 (AES-128) and AESV3 (AES-256).
1915
1916=head1 AUTHOR
1917
1918Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
1919
1920This library is free software; you can redistribute it and/or modify it
1921under the same terms as Perl itself.
1922
1923=head1 REFERENCES
1924
1925=over 4
1926
1927=item L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
1928
1929=item L<Crypt::RC4|Crypt::RC4>
1930
1931=item L<http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf>
1932
1933=item L<http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf>
1934
1935=item L<http://tools.ietf.org/search/rfc3454>
1936
1937=item L<http://www.armware.dk/RFC/rfc/rfc4013.html>
1938
1939=back
1940
1941=head1 SEE ALSO
1942
1943L<Image::ExifTool::TagNames/PDF Tags>,
1944L<Image::ExifTool(3pm)|Image::ExifTool>
1945
1946=cut
Note: See TracBrowser for help on using the repository browser.