Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/PDF.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago
An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)
File size: 72.8 KB

Line
1	#------------------------------------------------------------------------------
2	# File: PDF.pm
3	#
4	# Description: Read PDF meta information
5	#
6	# Revisions: 07/11/2005 - P. Harvey Created
7	# 07/25/2005 - P. Harvey Add support for encrypted documents
8	#
9	# References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
10	# 2) http://search.cpan.org/dist/Crypt-RC4/
11	# 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
12	# 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
13	# 5) http://tools.ietf.org/search/rfc3454
14	# 6) http://www.armware.dk/RFC/rfc/rfc4013.html
15	#------------------------------------------------------------------------------
16
17	package Image::ExifTool::PDF;
18
19	use strict;
20	use vars qw($VERSION $AUTOLOAD $lastFetched);
21	use Image::ExifTool qw(:DataAccess :Utils);
22	require Exporter;
23
24	$VERSION = '1.29';
25
26	sub FetchObject($$$$);
27	sub ExtractObject($$;$$);
28	sub ReadToNested($;$);
29	sub ProcessDict($$$$;$$);
30	sub ReadPDFValue($);
31	sub CheckPDF($$$);
32
33	# $lastFetched - last fetched object reference (used for decryption)
34	# (undefined if fetched object was already decrypted, ie. object from stream)
35
36	my $cryptInfo; # encryption object reference (plus additional information)
37	my $cryptString; # flag that strings are encrypted
38	my $cryptStream; # flag that streams are encrypted
39	my $lastOffset; # last fetched object offset
40	my %streamObjs; # hash of stream objects
41	my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
42	my $pdfVer; # version of PDF file being processed
43
44	# tags in main PDF directories
45	%Image::ExifTool::PDF::Main = (
46	GROUPS => { 2 => 'Document' },
47	VARS => { CAPTURE => ['Main','Prev'] },
48	Info => {
49	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
50	},
51	Root => {
52	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
53	},
54	Encrypt => {
55	NoProcess => 1, # don't process normally (processed in advance)
56	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
57	},
58	_linearized => {
59	Name => 'Linearized',
60	Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
61	PrintConv => { 'true' => 'Yes', 'false' => 'No' },
62	},
63	);
64
65	# tags in PDF Info directory
66	%Image::ExifTool::PDF::Info = (
67	GROUPS => { 2 => 'Document' },
68	VARS => { CAPTURE => ['Info'] },
69	EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
70	WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
71	CHECK_PROC => \&CheckPDF,
72	WRITABLE => 'string',
73	NOTES => q{
74	As well as the tags listed below, the PDF specification allows for
75	user-defined tags to exist in the Info dictionary. These tags, which should
76	have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
77	object, are also extracted by ExifTool.
78
79	B<Writable> specifies the value format, and may be C<string>, C<date>,
80	C<integer>, C<real>, C<boolean> or C<name> for PDF tags.
81	},
82	Title => { },
83	Author => { Groups => { 2 => 'Author' } },
84	Subject => { },
85	Keywords => { List => 'string' }, # this is a string list
86	Creator => { },
87	Producer => { },
88	CreationDate => {
89	Name => 'CreateDate',
90	Writable => 'date',
91	Groups => { 2 => 'Time' },
92	Shift => 'Time',
93	PrintConv => '$self->ConvertDateTime($val)',
94	PrintConvInv => '$self->InverseDateTime($val)',
95	},
96	ModDate => {
97	Name => 'ModifyDate',
98	Writable => 'date',
99	Groups => { 2 => 'Time' },
100	Shift => 'Time',
101	PrintConv => '$self->ConvertDateTime($val)',
102	PrintConvInv => '$self->InverseDateTime($val)',
103	},
104	Trapped => {
105	Protected => 1,
106	# remove leading '/' from '/True' or '/False'
107	ValueConv => '$val=~s{^/}{}; $val',
108	ValueConvInv => '"/$val"',
109	},
110	'AAPL:Keywords' => { #PH
111	Name => 'AppleKeywords',
112	List => 'array', # this is an array of values
113	Notes => q{
114	keywords written by Apple utilities, although they seem to use PDF:Keywords
115	when reading
116	},
117	},
118	);
119
120	# tags in the PDF Root document catalog
121	%Image::ExifTool::PDF::Root = (
122	GROUPS => { 2 => 'Document' },
123	# note: can't capture previous versions of Root since they are not parsed
124	VARS => { CAPTURE => ['Root'] },
125	NOTES => 'This is the PDF document catalog.',
126	MarkInfo => {
127	SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
128	},
129	Metadata => {
130	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
131	},
132	Pages => {
133	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
134	},
135	Perms => {
136	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
137	},
138	Lang => 'Language',
139	PageLayout => { },
140	PageMode => { },
141	Version => 'PDFVersion',
142	);
143
144	# tags extracted from the PDF Encrypt dictionary
145	%Image::ExifTool::PDF::Encrypt = (
146	GROUPS => { 2 => 'Document' },
147	NOTES => 'Tags extracted from the document Encrypt dictionary.',
148	Filter => {
149	Name => 'Encryption',
150	Notes => q{
151	extracted value is actually a combination of the Filter, SubFilter, V, R and
152	Length information from the Encrypt dictionary
153	},
154	},
155	P => {
156	Name => 'UserAccess',
157	ValueConv => '$val & 0x0f3c', # ignore reserved bits
158	PrintConvColumns => 2,
159	PrintConv => { BITMASK => {
160	2 => 'Print',
161	3 => 'Modify',
162	4 => 'Copy',
163	5 => 'Annotate',
164	8 => 'Fill forms',
165	9 => 'Extract',
166	10 => 'Assemble',
167	11 => 'Print high-res',
168	}},
169	},
170	);
171
172	# tags in PDF Pages directory
173	%Image::ExifTool::PDF::Pages = (
174	GROUPS => { 2 => 'Document' },
175	Count => 'PageCount',
176	Kids => {
177	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
178	},
179	);
180
181	# tags in PDF Perms directory
182	%Image::ExifTool::PDF::Perms = (
183	NOTES => 'Additional document permissions imposed by digital signatures.',
184	DocMDP => {
185	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
186	},
187	FieldMDP => {
188	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
189	},
190	UR3 => {
191	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
192	},
193	);
194
195	# tags in PDF Kids directory
196	%Image::ExifTool::PDF::Kids = (
197	Metadata => {
198	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
199	},
200	PieceInfo => {
201	SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
202	},
203	Resources => {
204	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
205	},
206	);
207
208	# tags in PDF Resources directory
209	%Image::ExifTool::PDF::Resources = (
210	ColorSpace => {
211	SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
212	},
213	);
214
215	# tags in PDF ColorSpace directory
216	%Image::ExifTool::PDF::ColorSpace = (
217	DefaultRGB => {
218	SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
219	},
220	);
221
222	# tags in PDF DefaultRGB directory
223	%Image::ExifTool::PDF::DefaultRGB = (
224	ICCBased => {
225	SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
226	},
227	);
228
229	# tags in PDF ICCBased directory
230	%Image::ExifTool::PDF::ICCBased = (
231	_stream => {
232	SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
233	},
234	);
235
236	# tags in PDF PieceInfo directory
237	%Image::ExifTool::PDF::PieceInfo = (
238	AdobePhotoshop => {
239	SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
240	},
241	Illustrator => {
242	# assume this is an illustrator file if it contains this directory
243	# and doesn't have a ".PDF" extension
244	Condition => q{
245	$self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
246	return 1;
247	},
248	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
249	},
250	);
251
252	# tags in PDF AdobePhotoshop directory
253	%Image::ExifTool::PDF::AdobePhotoshop = (
254	Private => {
255	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
256	},
257	);
258
259	# tags in PDF Illustrator directory
260	%Image::ExifTool::PDF::Illustrator = (
261	Private => {
262	SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
263	},
264	);
265
266	# tags in PDF Private directory
267	%Image::ExifTool::PDF::Private = (
268	ImageResources => {
269	SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
270	},
271	);
272
273	# tags in PDF AI Private directory
274	%Image::ExifTool::PDF::AIPrivate = (
275	GROUPS => { 2 => 'Document' },
276	EXTRACT_UNKNOWN => 0, # extract known but numbered tags
277	AIMetaData => {
278	SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
279	},
280	AIPrivateData => {
281	Notes => q{
282	the ExtractEmbedded option enables information to be extracted from embedded
283	PostScript documents in the AIPrivateData stream
284	},
285	SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
286	},
287	RoundTripVersion => { },
288	ContainerVersion => { },
289	CreatorVersion => { },
290	);
291
292	# tags in PDF AIMetaData directory
293	%Image::ExifTool::PDF::AIMetaData = (
294	_stream => {
295	SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
296	},
297	);
298
299	# tags in PDF ImageResources directory
300	%Image::ExifTool::PDF::ImageResources = (
301	_stream => {
302	SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
303	},
304	);
305
306	# tags in PDF MarkInfo directory
307	%Image::ExifTool::PDF::MarkInfo = (
308	GROUPS => { 2 => 'Document' },
309	Marked => {
310	Name => 'TaggedPDF',
311	Notes => "not a Tagged PDF if this tag is missing",
312	PrintConv => { 'true' => 'Yes', 'false' => 'No' },
313	},
314	);
315
316	# tags in PDF Metadata directory
317	%Image::ExifTool::PDF::Metadata = (
318	GROUPS => { 2 => 'Document' },
319	XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
320	Name => 'XMP',
321	SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
322	},
323	);
324
325	# tags in PDF signature directories (DocMDP, FieldMDP or UR3)
326	%Image::ExifTool::PDF::Signature = (
327	GROUPS => { 2 => 'Document' },
328	ContactInfo => 'SignerContactInfo',
329	Location => 'SigningLocation',
330	M => {
331	Name => 'SigningDate',
332	Format => 'date',
333	Groups => { 2 => 'Time' },
334	PrintConv => '$self->ConvertDateTime($val)',
335	},
336	Name => 'SigningAuthority',
337	Reason => 'SigningReason',
338	Reference => {
339	SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
340	},
341	Prop_AuthTime => {
342	Name => 'AuthenticationTime',
343	PrintConv => 'ConvertTimeSpan($val) . " ago"',
344	},
345	Prop_AuthType => 'AuthenticationType',
346	);
347
348	# tags in PDF Reference directory
349	%Image::ExifTool::PDF::Reference = (
350	TransformParams => {
351	SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
352	},
353	);
354
355	# tags in PDF TransformParams directory
356	%Image::ExifTool::PDF::TransformParams = (
357	GROUPS => { 2 => 'Document' },
358	Annots => {
359	Name => 'AnnotationUsageRights',
360	Notes => 'UR3 signatures only',
361	List => 1,
362	},
363	Document => {
364	Name => 'DocumentUsageRights',
365	Notes => 'UR3 signatures only',
366	List => 1,
367	},
368	Form => {
369	Name => 'FormUsageRights',
370	Notes => 'UR3 signatures only',
371	List => 1,
372	},
373	Signature => {
374	Name => 'SignatureUsageRights',
375	Notes => 'UR3 signatures only',
376	List => 1,
377	},
378	EF => {
379	Name => 'EmbeddedFileUsageRights',
380	Notes => 'UR3 signatures only',
381	List => 1,
382	},
383	Msg => {
384	Name => 'UsageRightsMessage',
385	Notes => 'UR3 signatures only',
386	},
387	P => {
388	Name => 'ModificationPermissions',
389	Notes => q{
390	1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
391	false
392	},
393	PrintConv => {
394	1 => 'No changes permitted',
395	2 => 'Fill forms, Create page templates, Sign',
396	3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
397	'true' => 'Restrict al applications to reader permissions',
398	'false' => 'Do not restrict applications to reader permissions',
399	},
400	},
401	Action => {
402	Name => 'FieldPermissions',
403	Notes => 'FieldMDP signatures only',
404	PrintConv => {
405	'All' => 'Disallow changes to all form fields',
406	'Include' => 'Disallow changes to specified form fields',
407	'Exclude' => 'Allow changes to specified form fields',
408	},
409	},
410	Fields => {
411	Notes => 'FieldMDP signatures only',
412	Name => 'FormFields',
413	List => 1,
414	},
415	);
416
417	# unknown tags for use in verbose option
418	%Image::ExifTool::PDF::Unknown = (
419	GROUPS => { 2 => 'Unknown' },
420	);
421
422	#------------------------------------------------------------------------------
423	# AutoLoad our writer routines when necessary
424	#
425	sub AUTOLOAD
426	{
427	return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
428	}
429
430	#------------------------------------------------------------------------------
431	# Convert from PDF to EXIF-style date/time
432	# Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
433	# Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
434	sub ConvertPDFDate($)
435	{
436	my $date = shift;
437	# remove optional 'D:' prefix
438	$date =~ s/^D://;
439	# fill in default values if necessary
440	# YYYYmmddHHMMSS
441	my $default = '00000101000000';
442	if (length $date < length $default) {
443	$date .= substr($default, length $date);
444	}
445	$date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
446	$date = "$1:$2:$3 $4:$5:$6";
447	if ($7) {
448	my $tz = $7;
449	if ($tz =~ /^\s*Z/i) {
450	# ignore any "HH'mm'" after the Z (OS X 10.6 does this)
451	$date .= 'Z';
452	# tolerate some improper formatting in timezone specification
453	} elsif ($tz =~ /^\s([-+])\s(\d+)[': ]+(\d*)/) {
454	$date .= $1 . $2 . ':' . ($3 \|\| '00');
455	}
456	}
457	return $date;
458	}
459
460	#------------------------------------------------------------------------------
461	# Locate any object in the XRef tables (including compressed objects)
462	# Inputs: 0) XRef reference, 1) object reference string (or free object number)
463	# Returns: offset to object in file or compressed object reference string,
464	# 0 if object is free, or undefined on error
465	sub LocateAnyObject($$)
466	{
467	my ($xref, $ref) = @_;
468	return undef unless $xref;
469	return $$xref{$ref} if exists $$xref{$ref};
470	# get the object number
471	return undef unless $ref =~ /^(\d+)/;
472	my $objNum = $1;
473	# return 0 if the object number has been reused (old object is free)
474	return 0 if defined $$xref{$objNum};
475	#
476	# scan our XRef stream dictionaries for this object
477	#
478	return undef unless $$xref{dicts};
479	my $dict;
480	foreach $dict (@{$$xref{dicts}}) {
481	# quick check to see if the object is in the range for this xref stream
482	next if $objNum >= $$dict{Size};
483	my $index = $$dict{Index};
484	next if $objNum < $$index[0];
485	# scan the tables for the specified object
486	my $size = $$dict{_entry_size};
487	my $num = scalar(@$index) / 2;
488	my $tot = 0;
489	my $i;
490	for ($i=0; $i<$num; ++$i) {
491	my $start = $$index[$i*2];
492	my $count = $$index[$i*2+1];
493	# table is in ascending order, so quit if we have passed the object
494	last if $objNum < $start;
495	if ($objNum < $start + $count) {
496	my $offset = $size * ($objNum - $start + $tot);
497	last if $offset + $size > length $$dict{_stream};
498	my @c = unpack("x$offset C$size", $$dict{_stream});
499	# extract values from this table entry
500	# (can be 1, 2, 3, 4, etc.. bytes per value)
501	my (@t, $j, $k);
502	my $w = $$dict{W};
503	for ($j=0; $j<3; ++$j) {
504	# use default value if W entry is 0 (as per spec)
505	# - 0th element defaults to 1, others default to 0
506	$$w[$j] or $t[$j] = ($j ? 0 : 1), next;
507	$t[$j] = shift(@c);
508	for ($k=1; $k < $$w[$j]; ++$k) {
509	$t[$j] = 256 * $t[$j] + shift(@c);
510	}
511	}
512	# by default, use "o g R" as the xref key
513	# (o = object number, g = generation number)
514	my $ref2 = "$objNum $t[2] R";
515	if ($t[0] == 1) {
516	# normal object reference:
517	# $t[1]=offset of object from start, $t[2]=generation number
518	$$xref{$ref2} = $t[1];
519	} elsif ($t[0] == 2) {
520	# compressed object reference:
521	# $t[1]=stream object number, $t[2]=index of object in stream
522	$ref2 = "$objNum 0 R";
523	$$xref{$ref2} = "I$t[2] $t[1] 0 R";
524	} elsif ($t[0] == 0) {
525	# free object:
526	# $t[1]=next free object in linked list, $t[2]=generation number
527	$$xref{$ref2} = 0;
528	} else {
529	# treat as a null object
530	$$xref{$ref2} = undef;
531	}
532	$$xref{$objNum} = $t[1]; # remember offsets by object number too
533	return $$xref{$ref} if $ref eq $ref2;
534	return 0; # object is free or was reused
535	}
536	$tot += $count;
537	}
538	}
539	return undef;
540	}
541
542	#------------------------------------------------------------------------------
543	# Locate a regular object in the XRef tables (does not include compressed objects)
544	# Inputs: 0) XRef reference, 1) object reference string (or free object number)
545	# Returns: offset to object in file, 0 if object is free,
546	# or undef on error or if object was compressed
547	sub LocateObject($$)
548	{
549	my ($xref, $ref) = @_;
550	my $offset = LocateAnyObject($xref, $ref);
551	return undef if $offset and $offset =~ /^I/;
552	return $offset;
553	}
554
555	#------------------------------------------------------------------------------
556	# Fetch indirect object from file (from inside a stream if required)
557	# Inputs: 0) ExifTool object reference, 1) object reference string,
558	# 2) xref lookup, 3) object name (for warning messages)
559	# Returns: object data or undefined on error
560	# Notes: sets $lastFetched to the object reference, or undef if the object
561	# was extracted from an encrypted stream
562	sub FetchObject($$$$)
563	{
564	my ($exifTool, $ref, $xref, $tag) = @_;
565	$lastFetched = $ref; # save this for decoding if necessary
566	my $offset = LocateAnyObject($xref, $ref);
567	$lastOffset = $offset;
568	unless ($offset) {
569	$exifTool->Warn("Bad $tag reference") unless defined $offset;
570	return undef;
571	}
572	my ($data, $obj);
573	if ($offset =~ s/^I(\d+) //) {
574	my $index = $1; # object index in stream
575	my ($objNum) = split ' ', $ref; # save original object number
576	$ref = $offset; # now a reference to the containing stream object
577	$obj = $streamObjs{$ref};
578	unless ($obj) {
579	# don't try to load the same object stream twice
580	return undef if defined $obj;
581	$streamObjs{$ref} = '';
582	# load the parent object stream
583	$obj = FetchObject($exifTool, $ref, $xref, $tag);
584	# make sure it contains everything we need
585	return undef unless defined $obj and ref($obj) eq 'HASH';
586	return undef unless $$obj{First} and $$obj{N};
587	return undef unless DecodeStream($exifTool, $obj);
588	# add a special '_table' entry to this dictionary which contains
589	# the list of object number/offset pairs from the stream header
590	my $num = $$obj{N} * 2;
591	my @table = split ' ', $$obj{_stream}, $num;
592	return undef unless @table == $num;
593	# remove everything before first object in stream
594	$$obj{_stream} = substr($$obj{_stream}, $$obj{First});
595	$table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
596	$$obj{_table} = \@table;
597	# save the object stream so we don't have to re-load it later
598	$streamObjs{$ref} = $obj;
599	}
600	# verify that we have the specified object
601	my $i = 2 * $index;
602	my $table = $$obj{_table};
603	unless ($index < $$obj{N} and $$table[$i] == $objNum) {
604	$exifTool->Warn("Bad index for stream object $tag");
605	return undef;
606	}
607	# extract the object at the specified index in the stream
608	# (offsets in table are in sequential order, so we can subract from
609	# the next offset to get the object length)
610	$offset = $$table[$i + 1];
611	my $len = ($$table[$i + 3] \|\| length($$obj{_stream})) - $offset;
612	$data = substr($$obj{_stream}, $offset, $len);
613	# avoid re-decrypting data in already decrypted streams
614	undef $lastFetched if $cryptStream;
615	return ExtractObject($exifTool, \$data);
616	}
617	my $raf = $exifTool->{RAF};
618	$raf->Seek($offset, 0) or $exifTool->Warn("Bad $tag offset"), return undef;
619	# verify that we are reading the expected object
620	$raf->ReadLine($data) or $exifTool->Warn("Error reading $tag data"), return undef;
621	($obj = $ref) =~ s/R/obj/;
622	unless ($data =~ s/^$obj//) {
623	$exifTool->Warn("$tag object ($obj) not found at $offset");
624	return undef;
625	}
626	return ExtractObject($exifTool, \$data, $raf, $xref);
627	}
628
629	#------------------------------------------------------------------------------
630	# Convert PDF value to something readable
631	# Inputs: 0) PDF object data
632	# Returns: converted object
633	sub ReadPDFValue($)
634	{
635	my $str = shift;
636	# decode all strings in an array
637	if (ref $str eq 'ARRAY') {
638	# create new list to not alter the original data when rewriting
639	my ($val, @vals);
640	foreach $val (@$str) {
641	push @vals, ReadPDFValue($val);
642	}
643	return \@vals;
644	}
645	length $str or return $str;
646	my $delim = substr($str, 0, 1);
647	if ($delim eq '(') { # literal string
648	$str = $1 if $str =~ /.?\((.)\)/s; # remove brackets
649	# decode escape sequences in literal strings
650	while ($str =~ /\\(.)/sg) {
651	my $n = pos($str) - 2;
652	my $c = $1;
653	my $r;
654	if ($c =~ /[0-7]/) {
655	# get up to 2 more octal digits
656	$c .= $1 if $str =~ /\G([0-7]{1,2})/g;
657	# convert octal escape code
658	$r = chr(oct($c) & 0xff);
659	} elsif ($c eq "\x0d") {
660	# the string is continued if the line ends with '\'
661	# (also remove "\x0d\x0a")
662	$c .= $1 if $str =~ /\G(\x0a)/g;
663	$r = '';
664	} elsif ($c eq "\x0a") {
665	$r = '';
666	} else {
667	# convert escaped characters
668	($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
669	}
670	substr($str, $n, length($c)+1) = $r;
671	# continue search after this character
672	pos($str) = $n + length($r);
673	}
674	Crypt(\$str, $lastFetched) if $cryptString;
675	} elsif ($delim eq '<') { # hex string
676	# decode hex data
677	$str =~ tr/0-9A-Fa-f//dc;
678	$str .= '0' if length($str) & 0x01; # (by the spec)
679	$str = pack('H*', $str);
680	Crypt(\$str, $lastFetched) if $cryptString;
681	} elsif ($delim eq '/') { # name
682	$str = substr($str, 1);
683	# convert escape codes (PDF 1.2 or later)
684	$str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
685	}
686	return $str;
687	}
688
689	#------------------------------------------------------------------------------
690	# Extract PDF object from combination of buffered data and file
691	# Inputs: 0) ExifTool object reference, 1) data reference,
692	# 2) optional raf reference, 3) optional xref table
693	# Returns: converted PDF object or undef on error
694	# a) dictionary object --> hash reference
695	# b) array object --> array reference
696	# c) indirect reference --> scalar reference
697	# d) string, name, integer, boolean, null --> scalar value
698	# - updates $$dataPt on return to contain unused data
699	# - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
700	# the stream data and a list of the tags (not including '_stream' and '_tags')
701	# in their original order
702	sub ExtractObject($$;$$)
703	{
704	my ($exifTool, $dataPt, $raf, $xref) = @_;
705	my (@tags, $data, $objData);
706	my $dict = { };
707	my $delim;
708
709	for (;;) {
710	if ($$dataPt =~ /^\s*(<{1,2}\|\[\|\()/s) {
711	$delim = $1;
712	$$dataPt =~ s/^\s+//; # remove leading white space
713	$objData = ReadToNested($dataPt, $raf);
714	return undef unless defined $objData;
715	last;
716	} elsif ($$dataPt =~ s{^\s(\S[^[(/<>\s])\s*}{}s) {
717	#
718	# extract boolean, numerical, string, name, null object or indirect reference
719	#
720	$objData = $1;
721	# look for an indirect reference
722	if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
723	$objData .= "$1 R";
724	$objData = \$objData; # return scalar reference
725	}
726	return $objData; # return simple scalar or scalar reference
727	}
728	$raf and $raf->ReadLine($data) or return undef;
729	$$dataPt .= $data;
730	}
731	#
732	# return literal string or hex string without parsing
733	#
734	if ($delim eq '(' or $delim eq '<') {
735	return $objData;
736	#
737	# extract array
738	#
739	} elsif ($delim eq '[') {
740	$objData =~ /.?\[(.)\]/s or return undef;
741	my $data = $1; # brackets removed
742	my @list;
743	for (;;) {
744	last unless $data =~ m{\s(\S[^[(/<>\s])}sg;
745	my $val = $1;
746	if ($val =~ /^(<{1,2}\|\[\|\()/) {
747	my $pos = pos($data) - length($val);
748	# nested dict, array, literal string or hex string
749	my $buff = substr($data, $pos);
750	$val = ReadToNested(\$buff);
751	last unless defined $val;
752	pos($data) = $pos + length($val);
753	$val = ExtractObject($exifTool, \$val);
754	} elsif ($val =~ /^\d/) {
755	my $pos = pos($data);
756	if ($data =~ /\G\s+(\d+)\s+R/g) {
757	$val = \ "$val $1 R"; # make a reference
758	} else {
759	pos($data) = $pos;
760	}
761	}
762	push @list, $val;
763	}
764	return \@list;
765	}
766	#
767	# extract dictionary
768	#
769	# Note: entries are not necessarily separated by whitespace (doh!)
770	# ie) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
771	# Also, they may be separated by a comment (ie. "/Tag%comment\nValue"),
772	# but comments have already been removed
773	while ($objData =~ m{(\s)/([^/[\]()<>{}\s]+)\s(\S[^[(/<>\s]*)}sg) {
774	my $tag = $2;
775	my $val = $3;
776	if ($val =~ /^(<{1,2}\|\[\|\()/) {
777	# nested dict, array, literal string or hex string
778	$objData = substr($objData, pos($objData)-length($val));
779	$val = ReadToNested(\$objData, $raf);
780	last unless defined $val;
781	$val = ExtractObject($exifTool, \$val);
782	pos($objData) = 0;
783	} elsif ($val =~ /^\d/) {
784	my $pos = pos($objData);
785	if ($objData =~ /\G\s+(\d+)\s+R/sg) {
786	$val = \ "$val $1 R"; # make a reference
787	} else {
788	pos($objData) = $pos;
789	}
790	}
791	if ($$dict{$tag}) {
792	# duplicate dictionary entries are not allowed
793	$exifTool->Warn('Duplicate $tag entry in dictionary (ignored)');
794	} else {
795	# save the entry
796	push @tags, $tag;
797	$$dict{$tag} = $val;
798	}
799	}
800	return undef unless @tags;
801	$$dict{_tags} = \@tags;
802	return $dict unless $raf; # direct objects can not have streams
803	#
804	# extract the stream object
805	#
806	# dictionary must specify stream Length
807	my $length = $$dict{Length} or return $dict;
808	if (ref $length) {
809	$length = $$length;
810	my $oldpos = $raf->Tell();
811	# get the location of the object specifying the length
812	# (compressed objects are not allowed)
813	my $offset = LocateObject($xref, $length) or return $dict;
814	$offset or $exifTool->Warn('Bad Length object'), return $dict;
815	$raf->Seek($offset, 0) or $exifTool->Warn('Bad Length offset'), return $dict;
816	# verify that we are reading the expected object
817	$raf->ReadLine($data) or $exifTool->Warn('Error reading Length data'), return $dict;
818	$length =~ s/R/obj/;
819	unless ($data =~ /^$length/) {
820	$exifTool->Warn("Length object ($length) not found at $offset");
821	return $dict;
822	}
823	$raf->ReadLine($data) or $exifTool->Warn('Error reading stream Length'), return $dict;
824	$data =~ /(\d+)/ or $exifTool->Warn('Stream length not found'), return $dict;
825	$length = $1;
826	$raf->Seek($oldpos, 0); # restore position to start of stream
827	}
828	# extract the trailing stream data
829	for (;;) {
830	# find the stream token
831	if ($$dataPt =~ /(\S+)/) {
832	last unless $1 eq 'stream';
833	# read an extra line because it may contain our \x0a
834	$$dataPt .= $data if $raf->ReadLine($data);
835	# remove our stream header
836	$$dataPt =~ s/^\s*stream(\x0a\|\x0d\x0a)//s;
837	my $more = $length - length($$dataPt);
838	if ($more > 0) {
839	unless ($raf->Read($data, $more) == $more) {
840	$exifTool->Warn('Error reading stream data');
841	$$dataPt = '';
842	return $dict;
843	}
844	$$dict{_stream} = $$dataPt . $data;
845	$$dataPt = '';
846	} elsif ($more < 0) {
847	$$dict{_stream} = substr($$dataPt, 0, $length);
848	$$dataPt = substr($$dataPt, $length);
849	} else {
850	$$dict{_stream} = $$dataPt;
851	$$dataPt = '';
852	}
853	last;
854	}
855	$raf->ReadLine($data) or last;
856	$$dataPt .= $data;
857	}
858	return $dict;
859	}
860
861	#------------------------------------------------------------------------------
862	# Read to nested delimiter
863	# Inputs: 0) data reference, 1) optional raf reference
864	# Returns: data up to and including matching delimiter (or undef on error)
865	# - updates data reference with trailing data
866	# - unescapes characters in literal strings
867	my %closingDelim = ( # lookup for matching delimiter
868	'(' => ')',
869	'[' => ']',
870	'<' => '>',
871	'<<' => '>>',
872	);
873	sub ReadToNested($;$)
874	{
875	my ($dataPt, $raf) = @_;
876	my @delim = (''); # closing delimiter list, most deeply nested first
877	pos($$dataPt) = 0; # begin at start of data
878	for (;;) {
879	unless ($$dataPt =~ /(\\*)(\(\|\)\|<{1,2}\|>{1,2}\|\[\|\]\|%)/g) {
880	# must read some more data
881	my $buff;
882	last unless $raf and $raf->ReadLine($buff);
883	$$dataPt .= $buff;
884	pos($$dataPt) = length($$dataPt) - length($buff);
885	next;
886	}
887	# are we in a literal string?
888	if ($delim[0] eq ')') {
889	# ignore escaped delimiters (preceeded by odd number of \'s)
890	next if length($1) & 0x01;
891	# ignore all delimiters but unescaped braces
892	next unless $2 eq '(' or $2 eq ')';
893	} elsif ($2 eq '%') {
894	# ignore the comment
895	my $pos = pos($$dataPt) - 1;
896	# remove everything from '%' up to but not including newline
897	$$dataPt =~ /.*/g;
898	my $end = pos($$dataPt);
899	$$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
900	pos($$dataPt) = $pos;
901	next;
902	}
903	if ($closingDelim{$2}) {
904	# push the corresponding closing delimiter
905	unshift @delim, $closingDelim{$2};
906	next;
907	}
908	unless ($2 eq $delim[0]) {
909	# handle the case where we find a ">>>" and interpret it
910	# as ">> >" instead of "> >>"
911	next unless $2 eq '>>' and $delim[0] eq '>';
912	pos($$dataPt) = pos($$dataPt) - 1;
913	}
914	shift @delim; # remove from nesting list
915	next if $delim[0]; # keep going if we have more nested delimiters
916	my $pos = pos($$dataPt);
917	my $buff = substr($$dataPt, 0, $pos);
918	$$dataPt = substr($$dataPt, $pos);
919	return $buff; # success!
920	}
921	return undef; # didn't find matching delimiter
922	}
923
924	#------------------------------------------------------------------------------
925	# Decode filtered stream
926	# Inputs: 0) ExifTool object reference, 1) dictionary reference
927	# Returns: true if stream has been decoded OK
928	sub DecodeStream($$)
929	{
930	my ($exifTool, $dict) = @_;
931
932	return 0 unless $$dict{_stream}; # no stream to decode
933
934	# get list of filters
935	my (@filters, @decodeParms, $filter);
936	if (ref $$dict{Filter} eq 'ARRAY') {
937	@filters = @{$$dict{Filter}};
938	} elsif (defined $$dict{Filter}) {
939	@filters = ($$dict{Filter});
940	}
941	# apply decryption first if required (and if the default encryption
942	# has not been overridden by a Crypt filter. Note: the Crypt filter
943	# must be first in the Filter array: ref 3, page 38)
944	unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
945	CryptStream($dict, $lastFetched);
946	}
947	return 1 unless $$dict{Filter}; # Filter entry is mandatory
948	return 0 if defined $$dict{_filtered}; # avoid double-filtering
949	$$dict{_filtered} = 1; # set flag to prevent double-filtering
950
951	# get array of DecodeParms dictionaries
952	if (ref $$dict{DecodeParms} eq 'ARRAY') {
953	@decodeParms = @{$$dict{DecodeParms}};
954	} else {
955	@decodeParms = ($$dict{DecodeParms});
956	}
957	foreach $filter (@filters) {
958	my $decodeParms = shift @decodeParms;
959
960	if ($filter eq '/FlateDecode') {
961	if (eval 'require Compress::Zlib') {
962	my $inflate = Compress::Zlib::inflateInit();
963	my ($buff, $stat);
964	$inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
965	if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
966	$$dict{_stream} = $buff;
967	} else {
968	$exifTool->Warn('Error inflating stream');
969	return 0;
970	}
971	} else {
972	$exifTool->WarnOnce('Install Compress::Zlib to process filtered streams');
973	return 0;
974	}
975	# apply anti-predictor if necessary
976	next unless ref $decodeParms eq 'HASH';
977	my $pre = $$decodeParms{Predictor};
978	next unless $pre and $pre != 1;
979	if ($pre != 12) {
980	# currently only support 'up' prediction
981	$exifTool->WarnOnce("FlateDecode Predictor $pre not currently supported");
982	return 0;
983	}
984	my $cols = $$decodeParms{Columns};
985	unless ($cols) {
986	# currently only support 'up' prediction
987	$exifTool->WarnOnce('No Columns for decoding stream');
988	return 0;
989	}
990	my @bytes = unpack('C*', $$dict{_stream});
991	my @pre = (0) x $cols; # initialize predictor array
992	my $buff = '';
993	while (@bytes > $cols) {
994	unless (($_ = shift @bytes) == 2) {
995	$exifTool->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG)
996	return 0;
997	}
998	foreach (@pre) {
999	$_ = ($_ + shift(@bytes)) & 0xff;
1000	}
1001	$buff .= pack('C*', @pre);
1002	}
1003	$$dict{_stream} = $buff;
1004
1005	} elsif ($filter eq '/Crypt') {
1006
1007	# (we shouldn't have to check the _decrypted flag since we
1008	# already checked the _filtered flag, but what the heck...)
1009	next if defined $$dict{_decrypted};
1010	# assume Identity filter (the default) if DecodeParms are missing
1011	next unless ref $decodeParms eq 'HASH';
1012	my $name = $$decodeParms{Name};
1013	next unless defined $name or $name eq 'Identity';
1014	if ($name ne 'StdCF') {
1015	$exifTool->WarnOnce("Unsupported Crypt Filter $name");
1016	return 0;
1017	}
1018	unless ($cryptInfo) {
1019	$exifTool->WarnOnce('Missing Encrypt StdCF entry');
1020	return 0;
1021	}
1022	# decrypt the stream manually because we want to:
1023	# 1) ignore $cryptStream (StmF) setting
1024	# 2) ignore EncryptMetadata setting (I can't find mention of how to
1025	# reconcile this in the spec., but this would make sense)
1026	# 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
1027	# 4) set _decrypted flag so we will recrypt according to StmF when
1028	# writing (since we don't yet write Filter'd streams)
1029	Crypt(\$$dict{_stream}, 'none');
1030	$$dict{_decrypted} = ($cryptStream ? 1 : 0);
1031
1032	} elsif ($filter ne '/Identity') {
1033
1034	$exifTool->WarnOnce("Unsupported Filter $filter");
1035	return 0;
1036	}
1037	}
1038	return 1;
1039	}
1040
1041	#------------------------------------------------------------------------------
1042	# Initialize state for RC4 en/decryption (ref 2)
1043	# Inputs: 0) RC4 key string
1044	# Returns: RC4 key hash reference
1045	sub RC4Init($)
1046	{
1047	my @key = unpack('C*', shift);
1048	my @state = (0 .. 255);
1049	my ($i, $j) = (0, 0);
1050	while ($i < 256) {
1051	my $st = $state[$i];
1052	$j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
1053	$state[$i++] = $state[$j];
1054	$state[$j] = $st;
1055	}
1056	return { State => \@state, XY => [ 0, 0 ] };
1057	}
1058
1059	#------------------------------------------------------------------------------
1060	# Apply RC4 en/decryption (ref 2)
1061	# Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
1062	# - can call this method directly with a key string, or with with the key
1063	# reference returned by RC4Init
1064	# - RC4 is a symmetric algorithm, so encryption is the same as decryption
1065	sub RC4Crypt($$)
1066	{
1067	my ($dataPt, $key) = @_;
1068	$key = RC4Init($key) unless ref $key eq 'HASH';
1069	my $state = $$key{State};
1070	my ($x, $y) = @{$$key{XY}};
1071
1072	my @data = unpack('C*', $$dataPt);
1073	foreach (@data) {
1074	$x = ($x + 1) & 0xff;
1075	my $stx = $$state[$x];
1076	$y = ($stx + $y) & 0xff;
1077	my $sty = $$state[$x] = $$state[$y];
1078	$$state[$y] = $stx;
1079	$_ ^= $$state[($stx + $sty) & 0xff];
1080	}
1081	$$key{XY} = [ $x, $y ];
1082	$$dataPt = pack('C*', @data);
1083	}
1084
1085	#------------------------------------------------------------------------------
1086	# Initialize decryption
1087	# Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
1088	# 2) ID from file trailer dictionary
1089	# Returns: error string or undef on success (and sets $cryptInfo)
1090	sub DecryptInit($$$)
1091	{
1092	local $_;
1093	my ($exifTool, $encrypt, $id) = @_;
1094
1095	undef $cryptInfo;
1096	unless ($encrypt and ref $encrypt eq 'HASH') {
1097	return 'Error loading Encrypt object';
1098	}
1099	my $filt = $$encrypt{Filter};
1100	unless ($filt and $filt =~ s/^\///) {
1101	return 'Encrypt dictionary has no Filter!';
1102	}
1103	# extract some interesting tags
1104	my $ver = $$encrypt{V} \|\| 0;
1105	my $rev = $$encrypt{R} \|\| 0;
1106	my $enc = "$filt V$ver";
1107	$enc .= ".$rev" if $filt eq 'Standard';
1108	$enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
1109	$enc .= ' (' . ($$encrypt{Length} \|\| 40) . '-bit)' if $filt eq 'Standard';
1110	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
1111	$exifTool->HandleTag($tagTablePtr, 'Filter', $enc);
1112	if ($filt ne 'Standard') {
1113	return "Encryption filter $filt not currently supported";
1114	} elsif (not defined $$encrypt{R}) {
1115	return 'Standard security handler missing revision';
1116	}
1117	unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
1118	return 'Incomplete Encrypt specification';
1119	}
1120	$exifTool->HandleTag($tagTablePtr, 'P', $$encrypt{P});
1121
1122	my %parm; # optional parameters extracted from Encrypt dictionary
1123
1124	if ($ver == 1 or $ver == 2) {
1125	$cryptString = $cryptStream = 1;
1126	} elsif ($ver == 4 or $ver == 5) {
1127	# initialize our $cryptString and $cryptStream flags
1128	foreach ('StrF', 'StmF') {
1129	my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
1130	$$flagPt = $$encrypt{$_};
1131	undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
1132	return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
1133	}
1134	if ($cryptString or $cryptStream) {
1135	return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
1136	ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
1137	my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
1138	unless ($cryptMeth =~ /^\/(V2\|AESV2\|AESV3)$/) {
1139	return "Unsupported encryption method $cryptMeth";
1140	}
1141	# set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
1142	$$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2\|AESV3)$/;
1143	}
1144	if ($ver == 5) {
1145	# validate OE and UE entries
1146	foreach ('OE', 'UE') {
1147	return "Missing Encrypt $_ entry" unless $$encrypt{$_};
1148	$parm{$_} = ReadPDFValue($$encrypt{$_});
1149	return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
1150	}
1151	require Image::ExifTool::AES; # will need this later
1152	}
1153	} else {
1154	return "Encryption version $ver not currently supported";
1155	}
1156	$id or return "Can't decrypt (no document ID)";
1157
1158	# make sure we have the necessary libraries available
1159	if ($ver < 5) {
1160	unless (eval 'require Digest::MD5') {
1161	return "Install Digest::MD5 to process encrypted PDF";
1162	}
1163	} else {
1164	unless (eval 'require Digest::SHA') {
1165	return "Install Digest::SHA to process AES-256 encrypted PDF";
1166	}
1167	}
1168
1169	# calculate file-level en/decryption key
1170	my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
1171	"\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
1172	my $o = ReadPDFValue($$encrypt{O});
1173	my $u = ReadPDFValue($$encrypt{U});
1174
1175	# set flag indicating whether metadata is encrypted
1176	# (in version 4 and higher, metadata streams may not be encrypted)
1177	if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
1178	$$encrypt{_meta} = 1;
1179	}
1180	# try no password first, then try provided password if available
1181	my ($try, $key);
1182	for ($try=0; ; ++$try) {
1183	my $password;
1184	if ($try == 0) {
1185	$password = '';
1186	} elsif ($try == 1) {
1187	$password = $exifTool->Options('Password');
1188	return 'Document is password protected (use Password option)' unless defined $password;
1189	# make sure there is no UTF-8 flag on the password
1190	if ($] >= 5.006 and (eval 'require Encode; Encode::is_utf8($password)' or $@)) {
1191	# repack by hand if Encode isn't available
1192	$password = $@ ? pack('C',unpack('U0C',$password)) : Encode::encode('utf8',$password);
1193	}
1194	} else {
1195	return 'Incorrect password';
1196	}
1197	if ($ver < 5) {
1198	if (length $password) {
1199	# password must be encoding in PDFDocEncoding (ref iso32000)
1200	$password = $exifTool->Encode($password, 'PDFDoc');
1201	# truncate or pad the password to exactly 32 bytes
1202	if (length($password) > 32) {
1203	$password = substr($password, 0, 32);
1204	} elsif (length($password) < 32) {
1205	$password .= substr($pad, 0, 32-length($password));
1206	}
1207	} else {
1208	$password = $pad;
1209	}
1210	$key = $password . $o . pack('V', $$encrypt{P}) . $id;
1211	my $rep = 1;
1212	if ($rev == 3 or $rev == 4) {
1213	# must add this if metadata not encrypted
1214	$key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
1215	$rep += 50; # repeat MD5 50 more times if revision is 3 or greater
1216	}
1217	my ($len, $i, $dat);
1218	if ($ver == 1) {
1219	$len = 5;
1220	} else {
1221	$len = $$encrypt{Length} \|\| 40;
1222	$len >= 40 or return 'Bad Encrypt Length';
1223	$len = int($len / 8);
1224	}
1225	for ($i=0; $i<$rep; ++$i) {
1226	$key = substr(Digest::MD5::md5($key), 0, $len);
1227	}
1228	# decrypt U to see if a user password is required
1229	if ($rev >= 3) {
1230	$dat = Digest::MD5::md5($pad . $id);
1231	RC4Crypt(\$dat, $key);
1232	for ($i=1; $i<=19; ++$i) {
1233	my @key = unpack('C*', $key);
1234	foreach (@key) { $_ ^= $i; }
1235	RC4Crypt(\$dat, pack('C*', @key));
1236	}
1237	$dat .= substr($u, 16);
1238	} else {
1239	$dat = $pad;
1240	RC4Crypt(\$dat, $key);
1241	}
1242	last if $dat eq $u; # all done if this was the correct key
1243	} else {
1244	return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
1245	if (length $password) {
1246	# Note: this should be good for passwords containing reasonable characters,
1247	# but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
1248	# of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
1249	$password = $exifTool->Encode($password, 'UTF8');
1250	$password = substr($password, 0, 127) if length($password) > 127;
1251	}
1252	# test for the owner password
1253	my $sha = Digest::SHA::sha256($password, substr($o,32,8), substr($u,0,48));
1254	if ($sha eq substr($o, 0, 32)) {
1255	$key = Digest::SHA::sha256($password, substr($o,40,8), substr($u,0,48));
1256	my $dat = ("\0" x 16) . $parm{OE};
1257	# decrypt with no padding
1258	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1259	return $err if $err;
1260	$key = $dat; # use this as the file decryption key
1261	last;
1262	}
1263	# test for the user password
1264	$sha = Digest::SHA::sha256($password, substr($u,32,8));
1265	if ($sha eq substr($u, 0, 32)) {
1266	$key = Digest::SHA::sha256($password, substr($u,40,8));
1267	my $dat = ("\0" x 16) . $parm{UE};
1268	my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
1269	return $err if $err;
1270	$key = $dat; # use this as the file decryption key
1271	last;
1272	}
1273	}
1274	}
1275	$$encrypt{_key} = $key; # save the file-level encryption key
1276	$cryptInfo = $encrypt; # save reference to the file-level Encrypt object
1277	return undef; # success!
1278	}
1279
1280	#------------------------------------------------------------------------------
1281	# Decrypt/Encrypt data
1282	# Inputs: 0) data ref
1283	# 1) PDF object reference to use as crypt key extension (may be 'none' to
1284	# avoid extending the encryption key, as for streams with Crypt Filter)
1285	# 2) encrypt flag (false for decryption)
1286	sub Crypt($$;$)
1287	{
1288	return unless $cryptInfo;
1289	my ($dataPt, $keyExt, $encrypt) = @_;
1290	# do not decrypt if the key extension object is undefined
1291	# (this doubles as a flag to disable decryption/encryption)
1292	return unless defined $keyExt;
1293	my $key = $$cryptInfo{_key};
1294	# apply the necessary crypt key extension
1295	unless ($$cryptInfo{_aesv3}) {
1296	unless ($keyExt eq 'none') {
1297	# extend crypt key using object and generation number
1298	unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
1299	$$cryptInfo{_error} = 'Invalid object reference for encryption';
1300	return;
1301	}
1302	$key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
1303	}
1304	# add AES-128 salt if necessary (this little gem is conveniently
1305	# omitted from the Adobe PDF 1.6 documentation, causing me to
1306	# waste 12 hours trying to figure out why this wasn't working --
1307	# it appears in ISO32000 though, so I should have been using that)
1308	$key .= 'sAlT' if $$cryptInfo{_aesv2};
1309	my $len = length($key);
1310	$key = Digest::MD5::md5($key); # get 16-byte MD5 digest
1311	$key = substr($key, 0, $len) if $len < 16; # trim if necessary
1312	}
1313	# perform the decryption/encryption
1314	if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
1315	require Image::ExifTool::AES;
1316	my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
1317	$err and $$cryptInfo{_error} = $err;
1318	} else {
1319	RC4Crypt($dataPt, $key);
1320	}
1321	}
1322
1323	#------------------------------------------------------------------------------
1324	# Decrypt/Encrypt stream data
1325	# Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
1326	sub CryptStream($$)
1327	{
1328	return unless $cryptStream;
1329	my ($dict, $keyExt) = @_;
1330	my $type = $$dict{Type} \|\| '';
1331	# XRef streams are not encrypted (ref 3, page 50),
1332	# and Metadata may or may not be encrypted
1333	if ($cryptInfo and $type ne '/XRef' and
1334	($$cryptInfo{_meta} or $type ne '/Metadata'))
1335	{
1336	Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
1337	# toggle _decrypted flag
1338	$$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
1339	} else {
1340	$$dict{_decrypted} = 0; # stream should never be encrypted
1341	}
1342	}
1343
1344	#------------------------------------------------------------------------------
1345	# Generate a new PDF tag (based on its ID) and add it to a tag table
1346	# Inputs: 0) tag table ref, 1) tag ID
1347	# Returns: tag info ref
1348	sub NewPDFTag($$)
1349	{
1350	my ($tagTablePtr, $tag) = @_;
1351	my $name = $tag;
1352	# translate URL-like escape sequences
1353	$name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
1354	$name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
1355	$name =~ s/(^\|_)([a-z])/\U$2/g; # start words with upper case
1356	my $tagInfo = { Name => $name };
1357	Image::ExifTool::AddTagToTable($tagTablePtr, $tag, $tagInfo);
1358	return $tagInfo;
1359	}
1360
1361	#------------------------------------------------------------------------------
1362	# Process PDF dictionary extract tag values
1363	# Inputs: 0) ExifTool object reference, 1) tag table reference
1364	# 2) dictionary reference, 3) cross-reference table reference,
1365	# 4) nesting depth, 5) dictionary capture type
1366	sub ProcessDict($$$$;$$)
1367	{
1368	my ($exifTool, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
1369	my $verbose = $exifTool->Options('Verbose');
1370	my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
1371	my $embedded = (defined $unknown and not $unknown and $exifTool->Options('ExtractEmbedded'));
1372	my @tags = @{$$dict{_tags}};
1373	my ($next, %join);
1374	my $index = 0;
1375
1376	$nesting = ($nesting \|\| 0) + 1;
1377	if ($nesting > 50) {
1378	$exifTool->WarnOnce('Nesting too deep (directory ignored)');
1379	return;
1380	}
1381	# save entire dictionary for rewriting if specified
1382	if ($$exifTool{PDF_CAPTURE} and $$tagTablePtr{VARS} and
1383	$tagTablePtr->{VARS}->{CAPTURE})
1384	{
1385	my $name;
1386	foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
1387	next if $exifTool->{PDF_CAPTURE}->{$name};
1388	# make sure we load the right type if indicated
1389	next if $type and $type ne $name;
1390	$exifTool->{PDF_CAPTURE}->{$name} = $dict;
1391	last;
1392	}
1393	}
1394	#
1395	# extract information from all tags in the dictionary
1396	#
1397	for (;;) {
1398	my ($tag, $tagInfo);
1399	if (@tags) {
1400	$tag = shift @tags;
1401	} elsif (defined $next and not $next) {
1402	$tag = 'Next';
1403	$next = 1;
1404	} else {
1405	last;
1406	}
1407	my $val = $$dict{$tag};
1408	if ($$tagTablePtr{$tag}) {
1409	$tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
1410	undef $tagInfo if $$tagInfo{NoProcess};
1411	} elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
1412	$$tagTablePtr{$1} and ref $val eq 'SCALAR' and not $fetched{$$val})
1413	{
1414	my ($name, $num) = ($1, $2);
1415	$join{$name} = [] unless $join{$name};
1416	$fetched{$$val} = 1;
1417	my $obj = FetchObject($exifTool, $$val, $xref, $tag);
1418	next unless ref $obj eq 'HASH' and $$obj{_stream};
1419	# save all the stream data to join later
1420	$join{$name}->[$num] = $$obj{_stream};
1421	}
1422	if ($verbose) {
1423	my ($val2, $extra);
1424	if (ref $val eq 'SCALAR') {
1425	$extra = ", indirect object ($$val)";
1426	if ($fetched{$$val}) {
1427	$val2 = "ref($$val)";
1428	} elsif ($tag eq 'Next' and not $next) {
1429	# handle 'Next' links after all others
1430	$next = 0;
1431	next;
1432	} else {
1433	$fetched{$$val} = 1;
1434	$val = FetchObject($exifTool, $$val, $xref, $tag);
1435	unless (defined $val) {
1436	my $str;
1437	if (defined $lastOffset) {
1438	$val2 = '<free>';
1439	$str = 'Object was freed';
1440	} else {
1441	$val2 = '<err>';
1442	$str = 'Error reading object';
1443	}
1444	$exifTool->VPrint(0, "$$exifTool{INDENT}${str}:\n");
1445	}
1446	}
1447	} elsif (ref $val eq 'HASH') {
1448	$extra = ', direct dictionary';
1449	} elsif (ref $val eq 'ARRAY') {
1450	$extra = ', direct array of ' . scalar(@$val) . ' objects';
1451	} else {
1452	$extra = ', direct object';
1453	}
1454	my $isSubdir;
1455	if (ref $val eq 'HASH') {
1456	$isSubdir = 1;
1457	} elsif (ref $val eq 'ARRAY') {
1458	# recurse into objects in arrays only if they are lists of
1459	# dictionaries or indirect objects which could be dictionaries
1460	$isSubdir = 1 if @$val;
1461	foreach (@$val) {
1462	next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
1463	undef $isSubdir;
1464	last;
1465	}
1466	}
1467	if ($isSubdir) {
1468	# create bogus subdirectory to recurse into this dict
1469	$tagInfo or $tagInfo = {
1470	Name => $tag,
1471	SubDirectory => {
1472	TagTable => 'Image::ExifTool::PDF::Unknown',
1473	},
1474	};
1475	} else {
1476	if (ref $val eq 'ARRAY') {
1477	my @list = @$val;
1478	foreach (@list) {
1479	$_ = "ref($$_)" if ref $_ eq 'SCALAR';
1480	}
1481	$val2 = '[' . join(',',@list) . ']';
1482	}
1483	# generate tag info if we will use it later
1484	if (not $tagInfo and defined $val and $unknown) {
1485	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1486	}
1487	}
1488	$exifTool->VerboseInfo($tag, $tagInfo,
1489	Value => $val2 \|\| $val,
1490	Extra => $extra,
1491	Index => $index++,
1492	);
1493	next unless defined $val;
1494	}
1495	unless ($tagInfo) {
1496	# add any tag found in Info directory to table
1497	next unless $unknown;
1498	$tagInfo = NewPDFTag($tagTablePtr, $tag);
1499	}
1500	unless ($$tagInfo{SubDirectory}) {
1501	# fetch object if necessary
1502	# (OS X 10.6 writes indirect objects in the Info dictionary!)
1503	if (ref $val eq 'SCALAR') {
1504	my $prevFetched = $lastFetched;
1505	# (note: fetching the same object multiple times is OK here)
1506	$val = FetchObject($exifTool, $$val, $xref, $tag);
1507	next unless defined $val;
1508	$val = ReadPDFValue($val);
1509	# set flag to re-encrypt if necessary if rewritten
1510	$$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
1511	$lastFetched = $prevFetched; # restore last fetched object reference
1512	} else {
1513	$val = ReadPDFValue($val);
1514	}
1515	my $format = $$tagInfo{Format} \|\| $$tagInfo{Writable} \|\| 'string';
1516	$val = ConvertPDFDate($val) if $format eq 'date';
1517	# convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
1518	# unless this is binary data (hex-encoded strings would not have been converted)
1519	if (ref $val) {
1520	next if ref $val ne 'ARRAY';
1521	my $v;
1522	foreach $v (@$val) {
1523	$exifTool->FoundTag($tagInfo, $v);
1524	}
1525	} else {
1526	if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
1527	# text string is already in Unicode if it starts with "\xfe\xff",
1528	# otherwise we must first convert from PDFDocEncoding
1529	$val = $exifTool->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
1530	}
1531	if ($$tagInfo{List}) {
1532	# separate tokens in comma or whitespace delimited lists
1533	my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
1534	foreach $val (@values) {
1535	$exifTool->FoundTag($tagInfo, $val);
1536	}
1537	} else {
1538	# a simple tag value
1539	$exifTool->FoundTag($tagInfo, $val);
1540	}
1541	}
1542	next;
1543	}
1544	# process the subdirectory
1545	my @subDicts;
1546	if (ref $val eq 'ARRAY') {
1547	@subDicts = @{$val};
1548	} else {
1549	@subDicts = ( $val );
1550	}
1551	# loop through all values of this tag
1552	for (;;) {
1553	my $subDict = shift @subDicts or last;
1554	# save last fetched object in case we fetch another one here
1555	my $prevFetched = $lastFetched;
1556	if (ref $subDict eq 'SCALAR') {
1557	# only fetch once (other copies are obsolete)
1558	next if $fetched{$$subDict};
1559	# load dictionary via an indirect reference
1560	$fetched{$$subDict} = 1;
1561	my $obj = FetchObject($exifTool, $$subDict, $xref, $tag);
1562	unless (defined $obj) {
1563	unless (defined $lastOffset) {
1564	$exifTool->Warn("Error reading $tag object ($$subDict)");
1565	}
1566	next;
1567	}
1568	$subDict = $obj;
1569	}
1570	if (ref $subDict eq 'ARRAY') {
1571	# convert array of key/value pairs to a hash
1572	next if @$subDict < 2;
1573	my %hash = ( _tags => [] );
1574	while (@$subDict >= 2) {
1575	my $key = shift @$subDict;
1576	$key =~ s/^\///;
1577	push @{$hash{_tags}}, $key;
1578	$hash{$key} = shift @$subDict;
1579	}
1580	$subDict = \%hash;
1581	} else {
1582	next unless ref $subDict eq 'HASH';
1583	}
1584	# set flag to re-crypt all strings when rewriting if the dictionary
1585	# came from an encrypted stream
1586	$$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
1587	my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1588	if (not $verbose) {
1589	my $proc = $$subTablePtr{PROCESS_PROC} \|\| \&ProcessDict;
1590	&$proc($exifTool, $subTablePtr, $subDict, $xref, $nesting);
1591	} elsif ($next) {
1592	# handle 'Next' links at this level to avoid deep recursion
1593	undef $next;
1594	$index = 0;
1595	$tagTablePtr = $subTablePtr;
1596	$dict = $subDict;
1597	@tags = @{$$subDict{_tags}};
1598	$exifTool->VerboseDir($tag, scalar(@tags));
1599	} else {
1600	my $oldIndent = $exifTool->{INDENT};
1601	my $oldDir = $exifTool->{DIR_NAME};
1602	$exifTool->{INDENT} .= '\| ';
1603	$exifTool->{DIR_NAME} = $tag;
1604	$exifTool->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
1605	ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting);
1606	$exifTool->{INDENT} = $oldIndent;
1607	$exifTool->{DIR_NAME} = $oldDir;
1608	}
1609	$lastFetched = $prevFetched;
1610	}
1611	}
1612	#
1613	# extract information from joined streams if necessary
1614	#
1615
1616	if (%join) {
1617	my ($tag, $i);
1618	foreach $tag (sort keys %join) {
1619	my $list = $join{$tag};
1620	last unless defined $$list[1] and $$list[1] =~ /^%.?([\x0d\x0a])/;
1621	my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
1622	for ($i=1; defined $$list[$i]; ++$i) {
1623	$buff .= $$list[$i];
1624	undef $$list[$i]; # free memory
1625	}
1626	$exifTool->HandleTag($tagTablePtr, $tag, $buff);
1627	}
1628	}
1629	#
1630	# extract information from stream object if it exists (ie. Metadata stream)
1631	#
1632	return unless $$dict{_stream};
1633	my $tag = '_stream';
1634	# add Subtype (if it exists) to stream name and remove leading '/'
1635	($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
1636	return unless $$tagTablePtr{$tag};
1637	my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag);
1638	# decode stream if necessary
1639	DecodeStream($exifTool, $dict) or return;
1640	# extract information from stream
1641	my %dirInfo = (
1642	DataPt => \$$dict{_stream},
1643	DataLen => length $$dict{_stream},
1644	DirStart => 0,
1645	DirLen => length $$dict{_stream},
1646	Parent => 'PDF',
1647	);
1648	my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
1649	unless ($exifTool->ProcessDirectory(\%dirInfo, $subTablePtr)) {
1650	$exifTool->Warn("Error processing $$tagInfo{Name} information");
1651	}
1652	}
1653
1654	#------------------------------------------------------------------------------
1655	# Extract information from PDF file
1656	# Inputs: 0) ExifTool object reference, 1) dirInfo reference
1657	# Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
1658	sub ReadPDF($$)
1659	{
1660	my ($exifTool, $dirInfo) = @_;
1661	my $raf = $$dirInfo{RAF};
1662	my $verbose = $exifTool->Options('Verbose');
1663	my ($buff, $encrypt, $id);
1664	#
1665	# validate PDF file
1666	#
1667	# (linearization dictionary must be in the first 1024 bytes of the file)
1668	$raf->Read($buff, 1024) >= 8 or return 0;
1669	$buff =~ /^%PDF-(\d+\.\d+)/ or return 0;
1670	$pdfVer = $1;
1671	$exifTool->SetFileType(); # set the FileType tag
1672	$exifTool->Warn("May not be able to read a PDF version $pdfVer file") if $pdfVer >= 2.0;
1673	# store PDFVersion tag
1674	my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
1675	$exifTool->HandleTag($tagTablePtr, 'Version', $pdfVer);
1676	$tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
1677	#
1678	# check for a linearized PDF (only if reading)
1679	#
1680	my $capture = $$exifTool{PDF_CAPTURE};
1681	unless ($capture) {
1682	my $lin = 'false';
1683	if ($buff =~ /<</g) {
1684	$buff = substr($buff, pos($buff) - 2);
1685	my $dict = ExtractObject($exifTool, \$buff);
1686	if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
1687	if (not $$exifTool{VALUE}{FileSize}) {
1688	undef $lin; # can't determine if it is linearized
1689	} elsif ($$dict{L} == $$exifTool{VALUE}{FileSize}) {
1690	$lin = 'true';
1691	}
1692	}
1693	}
1694	$exifTool->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
1695	}
1696	#
1697	# read the xref tables referenced from startxref at the end of the file
1698	#
1699	my @xrefOffsets;
1700	$raf->Seek(0, 2) or return -2;
1701	# the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
1702	my $len = $raf->Tell();
1703	$len = 1024 if $len > 1024;
1704	$raf->Seek(-$len, 2) or return -2;
1705	$raf->Read($buff, $len) == $len or return -3;
1706	# find the last xref table in the file (may be multiple %%EOF marks)
1707	$buff =~ /.startxref (\x0d\x0a\|\x0d\|\x0a)\s*?(\d+)\s+%%EOF/s or return -4;
1708	local $/ = $1; # set input record separator
1709	push @xrefOffsets, $2, 'Main';
1710	my (%xref, @mainDicts, %loaded, $mainFree);
1711	# initialize variables to capture when rewriting
1712	if ($capture) {
1713	$capture->{startxref} = $2;
1714	$capture->{xref} = \%xref;
1715	$capture->{newline} = $/;
1716	$capture->{mainFree} = $mainFree = { };
1717	}
1718	while (@xrefOffsets) {
1719	my $offset = shift @xrefOffsets;
1720	my $type = shift @xrefOffsets;
1721	next if $loaded{$offset}; # avoid infinite recursion
1722	unless ($raf->Seek($offset, 0)) {
1723	%loaded or return -5;
1724	$exifTool->Warn('Bad offset for secondary xref table');
1725	next;
1726	}
1727	# Note: care must be taken because ReadLine may read more than we want if
1728	# the newline sequence for this table is different than the rest of the file
1729	unless ($raf->ReadLine($buff)) {
1730	%loaded or return -6;
1731	$exifTool->Warn('Bad offset for secondary xref table');
1732	next;
1733	}
1734	my $loadXRefStream;
1735	if ($buff =~ s/^xref\s+//s) {
1736	# load xref table
1737	for (;;) {
1738	# read another line if necessary (skipping blank lines)
1739	$raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
1740	last if $buff =~ s/^\s*trailer\s+//s;
1741	$buff =~ s/\s*(\d+)\s+(\d+)\s+//s or return -4;
1742	my ($start, $num) = ($1, $2);
1743	$raf->Seek(-length($buff), 1) or return -4;
1744	my $i;
1745	for ($i=0; $i<$num; ++$i) {
1746	$raf->Read($buff, 20) == 20 or return -6;
1747	$buff =~ /^\s*(\d{10}) (\d{5}) (f\|n)/s or return -4;
1748	my $num = $start + $i;
1749	# save offset for newest copy of all objects
1750	# (or next object number for free objects)
1751	unless (defined $xref{$num}) {
1752	my ($offset, $gen) = (int($1), int($2));
1753	$xref{$num} = $offset;
1754	if ($3 eq 'f') {
1755	# save free objects in last xref table for rewriting
1756	$$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
1757	next;
1758	}
1759	# also save offset keyed by object reference string
1760	$xref{"$num $gen R"} = $offset;
1761	}
1762	}
1763	# (I have a sample from Adobe which has an empty xref table)
1764	# %xref or return -4; # xref table may not be empty
1765	$buff = '';
1766	}
1767	undef $mainFree; # only do this for the last xref table
1768	} elsif ($buff =~ s/^(\d+)\s+(\d+)\s+obj//s) {
1769	# this is a PDF-1.5 cross-reference stream dictionary
1770	$loadXRefStream = 1;
1771	} else {
1772	%loaded or return -4;
1773	$exifTool->Warn('Invalid secondary xref table');
1774	next;
1775	}
1776	my $mainDict = ExtractObject($exifTool, \$buff, $raf, \%xref);
1777	unless (ref $mainDict eq 'HASH') {
1778	%loaded or return -8;
1779	$exifTool->Warn('Error loading secondary dictionary');
1780	next;
1781	}
1782	if ($loadXRefStream) {
1783	# decode and save our XRef stream from PDF-1.5 file
1784	# (but parse it later as required to save time)
1785	# Note: this technique can potentially result in an old object
1786	# being used if the file was incrementally updated and an older
1787	# object from an xref table was replaced by a newer object in an
1788	# xref stream. But doing so isn't a good idea (if allowed at all)
1789	# because a PDF 1.4 consumer would also make this same mistake.
1790	if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
1791	@{$$mainDict{W}} > 2 and $$mainDict{Size} and
1792	DecodeStream($exifTool, $mainDict))
1793	{
1794	# create Index entry if it doesn't exist
1795	$$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
1796	# create '_entry_size' entry for internal use
1797	my $w = $$mainDict{W};
1798	my $size = 0;
1799	foreach (@$w) { $size += $_; }
1800	$$mainDict{_entry_size} = $size;
1801	# save this stream dictionary to use later if required
1802	$xref{dicts} = [] unless $xref{dicts};
1803	push @{$xref{dicts}}, $mainDict;
1804	} else {
1805	%loaded or return -9;
1806	$exifTool->Warn('Invalid xref stream in secondary dictionary');
1807	}
1808	}
1809	$loaded{$offset} = 1;
1810	# load XRef stream in hybrid file if it exists
1811	push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
1812	$encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
1813	if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
1814	$id = ReadPDFValue($mainDict->{ID}->[0]);
1815	}
1816	push @mainDicts, $mainDict, $type;
1817	# load previous xref table if it exists
1818	push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
1819	}
1820	#
1821	# extract encryption information if necessary
1822	#
1823	if ($encrypt) {
1824	if (ref $encrypt eq 'SCALAR') {
1825	$encrypt = FetchObject($exifTool, $$encrypt, \%xref, 'Encrypt');
1826	}
1827	# generate Encryption tag information
1828	my $err = DecryptInit($exifTool, $encrypt, $id);
1829	if ($err) {
1830	$exifTool->Warn($err);
1831	$$capture{Error} = $err if $capture;
1832	return -1;
1833	}
1834	}
1835	#
1836	# extract the information beginning with each of the main dictionaries
1837	#
1838	while (@mainDicts) {
1839	my $dict = shift @mainDicts;
1840	my $type = shift @mainDicts;
1841	if ($verbose) {
1842	my $n = scalar(@{$$dict{_tags}});
1843	$exifTool->VPrint(0, "PDF dictionary with $n entries:\n");
1844	}
1845	ProcessDict($exifTool, $tagTablePtr, $dict, \%xref, 0, $type);
1846	}
1847	# handle any decryption errors
1848	if ($encrypt) {
1849	my $err = $$encrypt{_error};
1850	if ($err) {
1851	$exifTool->Warn($err);
1852	$$capture{Error} = $err if $capture;
1853	return -1;
1854	}
1855	}
1856	return 1;
1857	}
1858
1859	#------------------------------------------------------------------------------
1860	# ReadPDF() warning strings for each error return value
1861	my %pdfWarning = (
1862	# -1 is reserved as error return value with no associated warning
1863	-2 => 'Error seeking in file',
1864	-3 => 'Error reading file',
1865	-4 => 'Invalid xref table',
1866	-5 => 'Invalid xref offset',
1867	-6 => 'Error reading xref table',
1868	-7 => 'Error reading trailer',
1869	-8 => 'Error reading main dictionary',
1870	-9 => 'Invalid xref stream in main dictionary',
1871	);
1872
1873	#------------------------------------------------------------------------------
1874	# Extract information from PDF file
1875	# Inputs: 0) ExifTool object reference, 1) dirInfo reference
1876	# Returns: 1 if this was a valid PDF file
1877	sub ProcessPDF($$)
1878	{
1879	my ($exifTool, $dirInfo) = @_;
1880
1881	undef $cryptInfo; # (must not delete after returning so writer can use it)
1882	undef $cryptStream;
1883	undef $cryptString;
1884	my $result = ReadPDF($exifTool, $dirInfo);
1885	if ($result < 0) {
1886	$exifTool->Warn($pdfWarning{$result}) if $pdfWarning{$result};
1887	$result = 1;
1888	}
1889	# clean up and return
1890	undef %streamObjs;
1891	undef %fetched;
1892	return $result;
1893	}
1894
1895	1; # end
1896
1897
1898	__END__
1899
1900	=head1 NAME
1901
1902	Image::ExifTool::PDF - Read PDF meta information
1903
1904	=head1 SYNOPSIS
1905
1906	This module is loaded automatically by Image::ExifTool when required.
1907
1908	=head1 DESCRIPTION
1909
1910	This code reads meta information from PDF (Adobe Portable Document Format)
1911	files. It supports object streams introduced in PDF-1.5 but only with a
1912	limited set of Filter and Predictor algorithms, however all standard
1913	encryption methods through PDF-1.7 extension level 3 are supported,
1914	including AESV2 (AES-128) and AESV3 (AES-256).
1915
1916	=head1 AUTHOR
1917
1918	Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
1919
1920	This library is free software; you can redistribute it and/or modify it
1921	under the same terms as Perl itself.
1922
1923	=head1 REFERENCES
1924
1925	=over 4
1926
1927	=item L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
1928
1929	=item L<Crypt::RC4\|Crypt::RC4>
1930
1931	=item L<http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf>
1932
1933	=item L<http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf>
1934
1935	=item L<http://tools.ietf.org/search/rfc3454>
1936
1937	=item L<http://www.armware.dk/RFC/rfc/rfc4013.html>
1938
1939	=back
1940
1941	=head1 SEE ALSO
1942
1943	L<Image::ExifTool::TagNames/PDF Tags>,
1944	L<Image::ExifTool(3pm)\|Image::ExifTool>
1945
1946	=cut

Note: See TracBrowser for help on using the repository browser.

Download in other formats: