1 | #------------------------------------------------------------------------------
|
---|
2 | # File: PDF.pm
|
---|
3 | #
|
---|
4 | # Description: Read PDF meta information
|
---|
5 | #
|
---|
6 | # Revisions: 07/11/2005 - P. Harvey Created
|
---|
7 | # 07/25/2005 - P. Harvey Add support for encrypted documents
|
---|
8 | #
|
---|
9 | # References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html
|
---|
10 | # 2) http://search.cpan.org/dist/Crypt-RC4/
|
---|
11 | # 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
|
---|
12 | # 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf
|
---|
13 | # 5) http://tools.ietf.org/search/rfc3454
|
---|
14 | # 6) http://www.armware.dk/RFC/rfc/rfc4013.html
|
---|
15 | #------------------------------------------------------------------------------
|
---|
16 |
|
---|
17 | package Image::ExifTool::PDF;
|
---|
18 |
|
---|
19 | use strict;
|
---|
20 | use vars qw($VERSION $AUTOLOAD $lastFetched);
|
---|
21 | use Image::ExifTool qw(:DataAccess :Utils);
|
---|
22 | require Exporter;
|
---|
23 |
|
---|
24 | $VERSION = '1.51';
|
---|
25 |
|
---|
26 | sub FetchObject($$$$);
|
---|
27 | sub ExtractObject($$;$$);
|
---|
28 | sub ReadToNested($;$);
|
---|
29 | sub ProcessDict($$$$;$$);
|
---|
30 | sub ProcessAcroForm($$$$;$$);
|
---|
31 | sub ExpandArray($);
|
---|
32 | sub ReadPDFValue($);
|
---|
33 | sub CheckPDF($$$);
|
---|
34 |
|
---|
35 | # $lastFetched - last fetched object reference (used for decryption)
|
---|
36 | # (undefined if fetched object was already decrypted, eg. object from stream)
|
---|
37 |
|
---|
38 | my $cryptInfo; # encryption object reference (plus additional information)
|
---|
39 | my $cryptString; # flag that strings are encrypted
|
---|
40 | my $cryptStream; # flag that streams are encrypted
|
---|
41 | my $lastOffset; # last fetched object offset
|
---|
42 | my %streamObjs; # hash of stream objects
|
---|
43 | my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion)
|
---|
44 | my $pdfVer; # version of PDF file being processed
|
---|
45 |
|
---|
46 | # filters supported in DecodeStream()
|
---|
47 | my %supportedFilter = (
|
---|
48 | '/FlateDecode' => 1,
|
---|
49 | '/Crypt' => 1,
|
---|
50 | '/Identity' => 1, # (not filtered)
|
---|
51 | '/DCTDecode' => 1, # (JPEG image - not filtered)
|
---|
52 | '/JPXDecode' => 1, # (Jpeg2000 image - not filtered)
|
---|
53 | '/LZWDecode' => 1, # (usually a bitmapped image)
|
---|
54 | '/ASCIIHexDecode' => 1,
|
---|
55 | '/ASCII85Decode' => 1,
|
---|
56 | # other standard filters that we currently don't support
|
---|
57 | #'/JBIG2Decode' => 0, # (JBIG2 image format not supported)
|
---|
58 | #'/CCITTFaxDecode' => 0,
|
---|
59 | #'/RunLengthDecode' => 0,
|
---|
60 | );
|
---|
61 |
|
---|
62 | # tags in main PDF directories
|
---|
63 | %Image::ExifTool::PDF::Main = (
|
---|
64 | GROUPS => { 2 => 'Document' },
|
---|
65 | VARS => { CAPTURE => ['Main','Prev'] },
|
---|
66 | Info => {
|
---|
67 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' },
|
---|
68 | # Adobe Acrobat 10.1.5 will create a duplicate Info dictionary with
|
---|
69 | # a different object number when metadata is edited. This flag
|
---|
70 | # is part of a patch to ignore this duplicate information (unless
|
---|
71 | # the IgnoreMinorErrors option is used)
|
---|
72 | IgnoreDuplicates => 1,
|
---|
73 | },
|
---|
74 | Root => {
|
---|
75 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' },
|
---|
76 | },
|
---|
77 | Encrypt => {
|
---|
78 | NoProcess => 1, # don't process normally (processed in advance)
|
---|
79 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' },
|
---|
80 | },
|
---|
81 | _linearized => {
|
---|
82 | Name => 'Linearized',
|
---|
83 | Notes => 'flag set if document is linearized for fast web display; not a real Tag ID',
|
---|
84 | PrintConv => { 'true' => 'Yes', 'false' => 'No' },
|
---|
85 | },
|
---|
86 | );
|
---|
87 |
|
---|
88 | # tags in PDF Info dictionary
|
---|
89 | %Image::ExifTool::PDF::Info = (
|
---|
90 | GROUPS => { 2 => 'Document' },
|
---|
91 | VARS => { CAPTURE => ['Info'] },
|
---|
92 | EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory
|
---|
93 | WRITE_PROC => \&Image::ExifTool::DummyWriteProc,
|
---|
94 | CHECK_PROC => \&CheckPDF,
|
---|
95 | WRITABLE => 'string',
|
---|
96 | # set PRIORITY to 0 so most recent Info dictionary takes precedence
|
---|
97 | # (Acrobat Pro bug? doesn't use same object/generation number for
|
---|
98 | # new Info dictionary when doing incremental update)
|
---|
99 | PRIORITY => 0,
|
---|
100 | NOTES => q{
|
---|
101 | As well as the tags listed below, the PDF specification allows for
|
---|
102 | user-defined tags to exist in the Info dictionary. These tags, which should
|
---|
103 | have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata
|
---|
104 | object, are also extracted by ExifTool.
|
---|
105 |
|
---|
106 | B<Writable> specifies the value format, and may be C<string>, C<date>,
|
---|
107 | C<integer>, C<real>, C<boolean> or C<name> for PDF tags.
|
---|
108 | },
|
---|
109 | Title => { },
|
---|
110 | Author => { Groups => { 2 => 'Author' } },
|
---|
111 | Subject => { },
|
---|
112 | Keywords => { List => 'string' }, # this is a string list
|
---|
113 | Creator => { },
|
---|
114 | Producer => { },
|
---|
115 | CreationDate => {
|
---|
116 | Name => 'CreateDate',
|
---|
117 | Writable => 'date',
|
---|
118 | Groups => { 2 => 'Time' },
|
---|
119 | Shift => 'Time',
|
---|
120 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
121 | PrintConvInv => '$self->InverseDateTime($val)',
|
---|
122 | },
|
---|
123 | ModDate => {
|
---|
124 | Name => 'ModifyDate',
|
---|
125 | Writable => 'date',
|
---|
126 | Groups => { 2 => 'Time' },
|
---|
127 | Shift => 'Time',
|
---|
128 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
129 | PrintConvInv => '$self->InverseDateTime($val)',
|
---|
130 | },
|
---|
131 | Trapped => {
|
---|
132 | Protected => 1,
|
---|
133 | # remove leading '/' from '/True' or '/False'
|
---|
134 | ValueConv => '$val=~s{^/}{}; $val',
|
---|
135 | ValueConvInv => '"/$val"',
|
---|
136 | },
|
---|
137 | 'AAPL:Keywords' => { #PH
|
---|
138 | Name => 'AppleKeywords',
|
---|
139 | List => 'array', # this is an array of values
|
---|
140 | Notes => q{
|
---|
141 | keywords written by Apple utilities, although they seem to use PDF:Keywords
|
---|
142 | when reading
|
---|
143 | },
|
---|
144 | },
|
---|
145 | );
|
---|
146 |
|
---|
147 | # tags in the PDF Root document catalog
|
---|
148 | %Image::ExifTool::PDF::Root = (
|
---|
149 | GROUPS => { 2 => 'Document' },
|
---|
150 | # note: can't capture previous versions of Root since they are not parsed
|
---|
151 | VARS => { CAPTURE => ['Root'] },
|
---|
152 | NOTES => 'This is the PDF document catalog.',
|
---|
153 | MarkInfo => {
|
---|
154 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' },
|
---|
155 | },
|
---|
156 | Metadata => {
|
---|
157 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
|
---|
158 | },
|
---|
159 | Pages => {
|
---|
160 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' },
|
---|
161 | },
|
---|
162 | Perms => {
|
---|
163 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' },
|
---|
164 | },
|
---|
165 | AcroForm => {
|
---|
166 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::AcroForm' },
|
---|
167 | },
|
---|
168 | Lang => 'Language',
|
---|
169 | PageLayout => { },
|
---|
170 | PageMode => { },
|
---|
171 | Version => 'PDFVersion',
|
---|
172 | );
|
---|
173 |
|
---|
174 | # tags extracted from the PDF Encrypt dictionary
|
---|
175 | %Image::ExifTool::PDF::Encrypt = (
|
---|
176 | GROUPS => { 2 => 'Document' },
|
---|
177 | NOTES => 'Tags extracted from the document Encrypt dictionary.',
|
---|
178 | Filter => {
|
---|
179 | Name => 'Encryption',
|
---|
180 | Notes => q{
|
---|
181 | extracted value is actually a combination of the Filter, SubFilter, V, R and
|
---|
182 | Length information from the Encrypt dictionary
|
---|
183 | },
|
---|
184 | },
|
---|
185 | P => {
|
---|
186 | Name => 'UserAccess',
|
---|
187 | ValueConv => '$val & 0x0f3c', # ignore reserved bits
|
---|
188 | PrintConvColumns => 2,
|
---|
189 | PrintConv => { BITMASK => {
|
---|
190 | 2 => 'Print',
|
---|
191 | 3 => 'Modify',
|
---|
192 | 4 => 'Copy',
|
---|
193 | 5 => 'Annotate',
|
---|
194 | 8 => 'Fill forms',
|
---|
195 | 9 => 'Extract',
|
---|
196 | 10 => 'Assemble',
|
---|
197 | 11 => 'Print high-res',
|
---|
198 | }},
|
---|
199 | },
|
---|
200 | );
|
---|
201 |
|
---|
202 | # tags in PDF Pages dictionary
|
---|
203 | %Image::ExifTool::PDF::Pages = (
|
---|
204 | GROUPS => { 2 => 'Document' },
|
---|
205 | Count => 'PageCount',
|
---|
206 | Kids => {
|
---|
207 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
|
---|
208 | },
|
---|
209 | );
|
---|
210 |
|
---|
211 | # tags in PDF Perms dictionary
|
---|
212 | %Image::ExifTool::PDF::Perms = (
|
---|
213 | NOTES => 'Additional document permissions imposed by digital signatures.',
|
---|
214 | DocMDP => {
|
---|
215 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
|
---|
216 | },
|
---|
217 | FieldMDP => {
|
---|
218 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
|
---|
219 | },
|
---|
220 | UR3 => {
|
---|
221 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' },
|
---|
222 | },
|
---|
223 | );
|
---|
224 |
|
---|
225 | # tags in PDF Perms dictionary
|
---|
226 | %Image::ExifTool::PDF::AcroForm = (
|
---|
227 | PROCESS_PROC => \&ProcessAcroForm,
|
---|
228 | _has_xfa => {
|
---|
229 | Name => 'HasXFA',
|
---|
230 | Notes => q{
|
---|
231 | this tag is defined if a document contains form fields, and is true if it
|
---|
232 | uses XML Forms Architecture; not a real Tag ID
|
---|
233 | },
|
---|
234 | PrintConv => { 'true' => 'Yes', 'false' => 'No' },
|
---|
235 | },
|
---|
236 | );
|
---|
237 |
|
---|
238 | # tags in PDF Kids dictionary
|
---|
239 | %Image::ExifTool::PDF::Kids = (
|
---|
240 | Metadata => {
|
---|
241 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
|
---|
242 | },
|
---|
243 | PieceInfo => {
|
---|
244 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' },
|
---|
245 | },
|
---|
246 | Resources => {
|
---|
247 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' },
|
---|
248 | },
|
---|
249 | Kids => {
|
---|
250 | Condition => '$self->Options("ExtractEmbedded")',
|
---|
251 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' },
|
---|
252 | },
|
---|
253 | );
|
---|
254 |
|
---|
255 | # tags in PDF Resources dictionary
|
---|
256 | %Image::ExifTool::PDF::Resources = (
|
---|
257 | ColorSpace => {
|
---|
258 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' },
|
---|
259 | },
|
---|
260 | XObject => {
|
---|
261 | Condition => '$self->Options("ExtractEmbedded")',
|
---|
262 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::XObject' },
|
---|
263 | },
|
---|
264 | Properties => {
|
---|
265 | Condition => '$self->Options("ExtractEmbedded")',
|
---|
266 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Properties' },
|
---|
267 | },
|
---|
268 | );
|
---|
269 |
|
---|
270 | # tags in PDF ColorSpace dictionary
|
---|
271 | %Image::ExifTool::PDF::ColorSpace = (
|
---|
272 | DefaultRGB => {
|
---|
273 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
|
---|
274 | ConvertToDict => 1, # (not seen yet, but just in case)
|
---|
275 | },
|
---|
276 | DefaultCMYK => {
|
---|
277 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' },
|
---|
278 | # hack: this is stored as an array instead of a dictionary in my
|
---|
279 | # sample, so convert to a dictionary to extract the ICCBased element
|
---|
280 | ConvertToDict => 1,
|
---|
281 | },
|
---|
282 | Cs1 => {
|
---|
283 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Cs1' },
|
---|
284 | },
|
---|
285 | );
|
---|
286 |
|
---|
287 | # tags in PDF DefaultRGB dictionary
|
---|
288 | %Image::ExifTool::PDF::DefaultRGB = (
|
---|
289 | ICCBased => {
|
---|
290 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' },
|
---|
291 | },
|
---|
292 | );
|
---|
293 |
|
---|
294 | # tags in PDF Cs1 dictionary
|
---|
295 | %Image::ExifTool::PDF::Cs1 = (
|
---|
296 | _stream => {
|
---|
297 | SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
|
---|
298 | },
|
---|
299 | );
|
---|
300 |
|
---|
301 | # tags in PDF ICCBased dictionary
|
---|
302 | %Image::ExifTool::PDF::ICCBased = (
|
---|
303 | _stream => {
|
---|
304 | SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' },
|
---|
305 | },
|
---|
306 | );
|
---|
307 |
|
---|
308 | # tags in PDF XObject dictionary (parsed only if ExtractEmbedded is enabled)
|
---|
309 | %Image::ExifTool::PDF::XObject = (
|
---|
310 | EXTRACT_UNKNOWN => 0, # extract known but numbered tags (Im1, Im2, etc)
|
---|
311 | Im => {
|
---|
312 | Notes => q{
|
---|
313 | the L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option enables information to be extracted from these
|
---|
314 | embedded images
|
---|
315 | },
|
---|
316 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Im' },
|
---|
317 | },
|
---|
318 | );
|
---|
319 |
|
---|
320 | # tags in PDF Im# dictionary
|
---|
321 | %Image::ExifTool::PDF::Im = (
|
---|
322 | NOTES => q{
|
---|
323 | Information extracted from embedded images with the L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option.
|
---|
324 | The EmbeddedImage and its metadata are extracted only for JPEG and Jpeg2000
|
---|
325 | image formats.
|
---|
326 | },
|
---|
327 | Width => 'EmbeddedImageWidth',
|
---|
328 | Height => 'EmbeddedImageHeight',
|
---|
329 | Filter => { Name => 'EmbeddedImageFilter', List => 1 },
|
---|
330 | ColorSpace => {
|
---|
331 | Name => 'EmbeddedImageColorSpace',
|
---|
332 | List => 1,
|
---|
333 | RawConv => 'ref $val ? undef : $val', # (ignore color space data)
|
---|
334 | },
|
---|
335 | Image_stream => {
|
---|
336 | Name => 'EmbeddedImage',
|
---|
337 | Groups => { 2 => 'Preview' },
|
---|
338 | Binary => 1,
|
---|
339 | },
|
---|
340 | );
|
---|
341 |
|
---|
342 | # tags in PDF Properties dictionary
|
---|
343 | %Image::ExifTool::PDF::Properties = (
|
---|
344 | EXTRACT_UNKNOWN => 0, # extract known but numbered tags (MC0, MC1, etc)
|
---|
345 | MC => {
|
---|
346 | Notes => q{
|
---|
347 | the L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option enables information to be extracted from these
|
---|
348 | embedded metadata dictionaries
|
---|
349 | },
|
---|
350 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::MC' },
|
---|
351 | }
|
---|
352 | );
|
---|
353 |
|
---|
354 | # tags in PDF MC# dictionary
|
---|
355 | %Image::ExifTool::PDF::MC = (
|
---|
356 | Metadata => {
|
---|
357 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' },
|
---|
358 | }
|
---|
359 | );
|
---|
360 |
|
---|
361 | # tags in PDF PieceInfo dictionary
|
---|
362 | %Image::ExifTool::PDF::PieceInfo = (
|
---|
363 | AdobePhotoshop => {
|
---|
364 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' },
|
---|
365 | },
|
---|
366 | Illustrator => {
|
---|
367 | # assume this is an illustrator file if it contains this directory
|
---|
368 | # and doesn't have a ".PDF" extension
|
---|
369 | Condition => q{
|
---|
370 | $self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF';
|
---|
371 | return 1;
|
---|
372 | },
|
---|
373 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' },
|
---|
374 | },
|
---|
375 | );
|
---|
376 |
|
---|
377 | # tags in PDF AdobePhotoshop dictionary
|
---|
378 | %Image::ExifTool::PDF::AdobePhotoshop = (
|
---|
379 | Private => {
|
---|
380 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' },
|
---|
381 | },
|
---|
382 | );
|
---|
383 |
|
---|
384 | # tags in PDF Illustrator dictionary
|
---|
385 | %Image::ExifTool::PDF::Illustrator = (
|
---|
386 | Private => {
|
---|
387 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' },
|
---|
388 | },
|
---|
389 | );
|
---|
390 |
|
---|
391 | # tags in PDF Private dictionary
|
---|
392 | %Image::ExifTool::PDF::Private = (
|
---|
393 | ImageResources => {
|
---|
394 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' },
|
---|
395 | },
|
---|
396 | );
|
---|
397 |
|
---|
398 | # tags in PDF AI Private dictionary
|
---|
399 | %Image::ExifTool::PDF::AIPrivate = (
|
---|
400 | GROUPS => { 2 => 'Document' },
|
---|
401 | EXTRACT_UNKNOWN => 0, # extract known but numbered tags
|
---|
402 | AIMetaData => {
|
---|
403 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' },
|
---|
404 | },
|
---|
405 | AIPrivateData => {
|
---|
406 | Notes => q{
|
---|
407 | the L<ExtractEmbedded|../ExifTool.html#ExtractEmbedded> option enables information to be extracted from embedded
|
---|
408 | PostScript documents in the AIPrivateData# and AIPDFPrivateData# streams
|
---|
409 | },
|
---|
410 | JoinStreams => 1, # join streams from numbered tags and process as one
|
---|
411 | SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
|
---|
412 | },
|
---|
413 | AIPDFPrivateData => {
|
---|
414 | JoinStreams => 1, # join streams from numbered tags and process as one
|
---|
415 | SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
|
---|
416 | },
|
---|
417 | RoundTripVersion => { },
|
---|
418 | ContainerVersion => { },
|
---|
419 | CreatorVersion => { },
|
---|
420 | );
|
---|
421 |
|
---|
422 | # tags in PDF AIMetaData dictionary
|
---|
423 | %Image::ExifTool::PDF::AIMetaData = (
|
---|
424 | _stream => {
|
---|
425 | SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' },
|
---|
426 | },
|
---|
427 | );
|
---|
428 |
|
---|
429 | # tags in PDF ImageResources dictionary
|
---|
430 | %Image::ExifTool::PDF::ImageResources = (
|
---|
431 | _stream => {
|
---|
432 | SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' },
|
---|
433 | },
|
---|
434 | );
|
---|
435 |
|
---|
436 | # tags in PDF MarkInfo dictionary
|
---|
437 | %Image::ExifTool::PDF::MarkInfo = (
|
---|
438 | GROUPS => { 2 => 'Document' },
|
---|
439 | Marked => {
|
---|
440 | Name => 'TaggedPDF',
|
---|
441 | Notes => "not a Tagged PDF if this tag is missing",
|
---|
442 | PrintConv => { 'true' => 'Yes', 'false' => 'No' },
|
---|
443 | },
|
---|
444 | );
|
---|
445 |
|
---|
446 | # tags in PDF Metadata dictionary
|
---|
447 | %Image::ExifTool::PDF::Metadata = (
|
---|
448 | GROUPS => { 2 => 'Document' },
|
---|
449 | XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag)
|
---|
450 | Name => 'XMP',
|
---|
451 | SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
|
---|
452 | },
|
---|
453 | );
|
---|
454 |
|
---|
455 | # tags in PDF signature directories (DocMDP, FieldMDP or UR3)
|
---|
456 | %Image::ExifTool::PDF::Signature = (
|
---|
457 | GROUPS => { 2 => 'Document' },
|
---|
458 | ContactInfo => 'SignerContactInfo',
|
---|
459 | Location => 'SigningLocation',
|
---|
460 | M => {
|
---|
461 | Name => 'SigningDate',
|
---|
462 | Format => 'date',
|
---|
463 | Groups => { 2 => 'Time' },
|
---|
464 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
465 | },
|
---|
466 | Name => 'SigningAuthority',
|
---|
467 | Reason => 'SigningReason',
|
---|
468 | Reference => {
|
---|
469 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' },
|
---|
470 | },
|
---|
471 | Prop_AuthTime => {
|
---|
472 | Name => 'AuthenticationTime',
|
---|
473 | PrintConv => 'ConvertTimeSpan($val) . " ago"',
|
---|
474 | },
|
---|
475 | Prop_AuthType => 'AuthenticationType',
|
---|
476 | );
|
---|
477 |
|
---|
478 | # tags in PDF Reference dictionary
|
---|
479 | %Image::ExifTool::PDF::Reference = (
|
---|
480 | TransformParams => {
|
---|
481 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' },
|
---|
482 | },
|
---|
483 | );
|
---|
484 |
|
---|
485 | # tags in PDF TransformParams dictionary
|
---|
486 | %Image::ExifTool::PDF::TransformParams = (
|
---|
487 | GROUPS => { 2 => 'Document' },
|
---|
488 | Annots => {
|
---|
489 | Name => 'AnnotationUsageRights',
|
---|
490 | Notes => q{
|
---|
491 | possible values are Create, Delete, Modify, Copy, Import and Export;
|
---|
492 | additional values for UR3 signatures are Online and SummaryView
|
---|
493 | },
|
---|
494 | List => 1,
|
---|
495 | },
|
---|
496 | Document => {
|
---|
497 | Name => 'DocumentUsageRights',
|
---|
498 | Notes => 'only possible value is FullSave',
|
---|
499 | List => 1,
|
---|
500 | },
|
---|
501 | Form => {
|
---|
502 | Name => 'FormUsageRights',
|
---|
503 | Notes => q{
|
---|
504 | possible values are FillIn, Import, Export, SubmitStandalone and
|
---|
505 | SpawnTemplate; additional values for UR3 signatures are BarcodePlaintext and
|
---|
506 | Online
|
---|
507 | },
|
---|
508 | List => 1,
|
---|
509 | },
|
---|
510 | FormEX => {
|
---|
511 | Name => 'FormExtraUsageRights',
|
---|
512 | Notes => 'UR signatures only; only possible value is BarcodePlaintext',
|
---|
513 | List => 1,
|
---|
514 | },
|
---|
515 | Signature => {
|
---|
516 | Name => 'SignatureUsageRights',
|
---|
517 | Notes => 'only possible value is Modify',
|
---|
518 | List => 1,
|
---|
519 | },
|
---|
520 | EF => {
|
---|
521 | Name => 'EmbeddedFileUsageRights',
|
---|
522 | Notes => 'possible values are Create, Delete, Modify and Import',
|
---|
523 | List => 1,
|
---|
524 | },
|
---|
525 | Msg => 'UsageRightsMessage',
|
---|
526 | P => {
|
---|
527 | Name => 'ModificationPermissions',
|
---|
528 | Notes => q{
|
---|
529 | 1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default
|
---|
530 | false
|
---|
531 | },
|
---|
532 | PrintConv => {
|
---|
533 | 1 => 'No changes permitted',
|
---|
534 | 2 => 'Fill forms, Create page templates, Sign',
|
---|
535 | 3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations',
|
---|
536 | 'true' => 'Restrict all applications to reader permissions',
|
---|
537 | 'false' => 'Do not restrict applications to reader permissions',
|
---|
538 | },
|
---|
539 | },
|
---|
540 | Action => {
|
---|
541 | Name => 'FieldPermissions',
|
---|
542 | Notes => 'FieldMDP signatures only',
|
---|
543 | PrintConv => {
|
---|
544 | 'All' => 'Disallow changes to all form fields',
|
---|
545 | 'Include' => 'Disallow changes to specified form fields',
|
---|
546 | 'Exclude' => 'Allow changes to specified form fields',
|
---|
547 | },
|
---|
548 | },
|
---|
549 | Fields => {
|
---|
550 | Notes => 'FieldMDP signatures only',
|
---|
551 | Name => 'FormFields',
|
---|
552 | List => 1,
|
---|
553 | },
|
---|
554 | );
|
---|
555 |
|
---|
556 | # unknown tags for use in verbose option
|
---|
557 | %Image::ExifTool::PDF::Unknown = (
|
---|
558 | GROUPS => { 2 => 'Unknown' },
|
---|
559 | );
|
---|
560 |
|
---|
561 | #------------------------------------------------------------------------------
|
---|
562 | # AutoLoad our writer routines when necessary
|
---|
563 | #
|
---|
564 | sub AUTOLOAD
|
---|
565 | {
|
---|
566 | return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_);
|
---|
567 | }
|
---|
568 |
|
---|
569 | #------------------------------------------------------------------------------
|
---|
570 | # Convert from PDF to EXIF-style date/time
|
---|
571 | # Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM')
|
---|
572 | # Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM)
|
---|
573 | sub ConvertPDFDate($)
|
---|
574 | {
|
---|
575 | my $date = shift;
|
---|
576 | # remove optional 'D:' prefix
|
---|
577 | $date =~ s/^D://;
|
---|
578 | # fill in default values if necessary
|
---|
579 | # YYYYmmddHHMMSS
|
---|
580 | my $default = '00000101000000';
|
---|
581 | if (length $date < length $default) {
|
---|
582 | $date .= substr($default, length $date);
|
---|
583 | }
|
---|
584 | $date =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ or return $date;
|
---|
585 | $date = "$1:$2:$3 $4:$5:$6";
|
---|
586 | if ($7) {
|
---|
587 | my $tz = $7;
|
---|
588 | if ($tz =~ /^\s*Z/i) {
|
---|
589 | # ignore any "HH'mm'" after the Z (OS X 10.6 does this)
|
---|
590 | $date .= 'Z';
|
---|
591 | # tolerate some improper formatting in timezone specification
|
---|
592 | } elsif ($tz =~ /^\s*([-+])\s*(\d+)[': ]+(\d*)/) {
|
---|
593 | $date .= $1 . $2 . ':' . ($3 || '00');
|
---|
594 | }
|
---|
595 | }
|
---|
596 | return $date;
|
---|
597 | }
|
---|
598 |
|
---|
599 | #------------------------------------------------------------------------------
|
---|
600 | # Locate any object in the XRef tables (including compressed objects)
|
---|
601 | # Inputs: 0) XRef reference, 1) object reference string (or free object number)
|
---|
602 | # Returns: offset to object in file or compressed object reference string,
|
---|
603 | # 0 if object is free, or undefined on error
|
---|
604 | sub LocateAnyObject($$)
|
---|
605 | {
|
---|
606 | my ($xref, $ref) = @_;
|
---|
607 | return undef unless $xref;
|
---|
608 | return $$xref{$ref} if exists $$xref{$ref};
|
---|
609 | # get the object number
|
---|
610 | return undef unless $ref =~ /^(\d+)/;
|
---|
611 | my $objNum = $1;
|
---|
612 | # return 0 if the object number has been reused (old object is free)
|
---|
613 | return 0 if defined $$xref{$objNum};
|
---|
614 | #
|
---|
615 | # scan our XRef stream dictionaries for this object
|
---|
616 | #
|
---|
617 | return undef unless $$xref{dicts};
|
---|
618 | my $dict;
|
---|
619 | foreach $dict (@{$$xref{dicts}}) {
|
---|
620 | # quick check to see if the object is in the range for this xref stream
|
---|
621 | next if $objNum >= $$dict{Size};
|
---|
622 | my $index = $$dict{Index};
|
---|
623 | next if $objNum < $$index[0];
|
---|
624 | # scan the tables for the specified object
|
---|
625 | my $size = $$dict{_entry_size};
|
---|
626 | my $num = scalar(@$index) / 2;
|
---|
627 | my $tot = 0;
|
---|
628 | my $i;
|
---|
629 | for ($i=0; $i<$num; ++$i) {
|
---|
630 | my $start = $$index[$i*2];
|
---|
631 | my $count = $$index[$i*2+1];
|
---|
632 | # table is in ascending order, so quit if we have passed the object
|
---|
633 | last if $objNum < $start;
|
---|
634 | if ($objNum < $start + $count) {
|
---|
635 | my $offset = $size * ($objNum - $start + $tot);
|
---|
636 | last if $offset + $size > length $$dict{_stream};
|
---|
637 | my @c = unpack("x$offset C$size", $$dict{_stream});
|
---|
638 | # extract values from this table entry
|
---|
639 | # (can be 1, 2, 3, 4, etc.. bytes per value)
|
---|
640 | my (@t, $j, $k);
|
---|
641 | my $w = $$dict{W};
|
---|
642 | for ($j=0; $j<3; ++$j) {
|
---|
643 | # use default value if W entry is 0 (as per spec)
|
---|
644 | # - 0th element defaults to 1, others default to 0
|
---|
645 | $$w[$j] or $t[$j] = ($j ? 0 : 1), next;
|
---|
646 | $t[$j] = shift(@c);
|
---|
647 | for ($k=1; $k < $$w[$j]; ++$k) {
|
---|
648 | $t[$j] = 256 * $t[$j] + shift(@c);
|
---|
649 | }
|
---|
650 | }
|
---|
651 | # by default, use "o g R" as the xref key
|
---|
652 | # (o = object number, g = generation number)
|
---|
653 | my $ref2 = "$objNum $t[2] R";
|
---|
654 | if ($t[0] == 1) {
|
---|
655 | # normal object reference:
|
---|
656 | # $t[1]=offset of object from start, $t[2]=generation number
|
---|
657 | $$xref{$ref2} = $t[1];
|
---|
658 | } elsif ($t[0] == 2) {
|
---|
659 | # compressed object reference:
|
---|
660 | # $t[1]=stream object number, $t[2]=index of object in stream
|
---|
661 | $ref2 = "$objNum 0 R";
|
---|
662 | $$xref{$ref2} = "I$t[2] $t[1] 0 R";
|
---|
663 | } elsif ($t[0] == 0) {
|
---|
664 | # free object:
|
---|
665 | # $t[1]=next free object in linked list, $t[2]=generation number
|
---|
666 | $$xref{$ref2} = 0;
|
---|
667 | } else {
|
---|
668 | # treat as a null object
|
---|
669 | $$xref{$ref2} = undef;
|
---|
670 | }
|
---|
671 | $$xref{$objNum} = $t[1]; # remember offsets by object number too
|
---|
672 | return $$xref{$ref} if $ref eq $ref2;
|
---|
673 | return 0; # object is free or was reused
|
---|
674 | }
|
---|
675 | $tot += $count;
|
---|
676 | }
|
---|
677 | }
|
---|
678 | return undef;
|
---|
679 | }
|
---|
680 |
|
---|
681 | #------------------------------------------------------------------------------
|
---|
682 | # Locate a regular object in the XRef tables (does not include compressed objects)
|
---|
683 | # Inputs: 0) XRef reference, 1) object reference string (or free object number)
|
---|
684 | # Returns: offset to object in file, 0 if object is free,
|
---|
685 | # or undef on error or if object was compressed
|
---|
686 | sub LocateObject($$)
|
---|
687 | {
|
---|
688 | my ($xref, $ref) = @_;
|
---|
689 | my $offset = LocateAnyObject($xref, $ref);
|
---|
690 | return undef if $offset and $offset =~ /^I/;
|
---|
691 | return $offset;
|
---|
692 | }
|
---|
693 |
|
---|
694 | #------------------------------------------------------------------------------
|
---|
695 | # Check that the correct object is located at the specified file offset
|
---|
696 | # Inputs: 0) ExifTool ref, 1) object name, 2) object reference string, 3) file offset
|
---|
697 | # Returns: first non-blank line at start of object, or undef on error
|
---|
698 | sub CheckObject($$$$)
|
---|
699 | {
|
---|
700 | my ($et, $tag, $ref, $offset) = @_;
|
---|
701 | my ($data, $obj, $dat, $pat);
|
---|
702 |
|
---|
703 | my $raf = $$et{RAF};
|
---|
704 | $raf->Seek($offset+$$et{PDFBase}, 0) or $et->Warn("Bad $tag offset"), return undef;
|
---|
705 | # verify that we are reading the expected object
|
---|
706 | ($obj = $ref) =~ s/R/obj/;
|
---|
707 | for (;;) {
|
---|
708 | $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
|
---|
709 | last if $data =~ s/^$obj//;
|
---|
710 | next if $data =~ /^\s+$/; # keep reading if this was a blank line
|
---|
711 | # handle cases where other whitespace characters are used in the object ID string
|
---|
712 | while ($data =~ /^\d+(\s+\d+)?\s*$/) {
|
---|
713 | $raf->ReadLine($dat);
|
---|
714 | $data .= $dat;
|
---|
715 | }
|
---|
716 | ($pat = $obj) =~ s/ /\\s+/g;
|
---|
717 | unless ($data =~ s/$pat//) {
|
---|
718 | $tag = ucfirst $tag;
|
---|
719 | $et->Warn("$tag object ($obj) not found at offset $offset");
|
---|
720 | return undef;
|
---|
721 | }
|
---|
722 | last;
|
---|
723 | }
|
---|
724 | # read the first line of data from the object (ignoring blank lines and comments)
|
---|
725 | for (;;) {
|
---|
726 | last if $data =~ /\S/ and $data !~ /^\s*%/;
|
---|
727 | $raf->ReadLine($data) or $et->Warn("Error reading $tag data"), return undef;
|
---|
728 | }
|
---|
729 | return $data;
|
---|
730 | }
|
---|
731 |
|
---|
732 | #------------------------------------------------------------------------------
|
---|
733 | # Fetch indirect object from file (from inside a stream if required)
|
---|
734 | # Inputs: 0) ExifTool object reference, 1) object reference string,
|
---|
735 | # 2) xref lookup, 3) object name (for warning messages)
|
---|
736 | # Returns: object data or undefined on error
|
---|
737 | # Notes: sets $lastFetched to the object reference, or undef if the object
|
---|
738 | # was extracted from an encrypted stream
|
---|
739 | sub FetchObject($$$$)
|
---|
740 | {
|
---|
741 | my ($et, $ref, $xref, $tag) = @_;
|
---|
742 | $lastFetched = $ref; # save this for decoding if necessary
|
---|
743 | my $offset = LocateAnyObject($xref, $ref);
|
---|
744 | $lastOffset = $offset;
|
---|
745 | unless ($offset) {
|
---|
746 | $et->Warn("Bad $tag reference") unless defined $offset;
|
---|
747 | return undef;
|
---|
748 | }
|
---|
749 | my ($data, $obj);
|
---|
750 | if ($offset =~ s/^I(\d+) //) {
|
---|
751 | my $index = $1; # object index in stream
|
---|
752 | my ($objNum) = split ' ', $ref; # save original object number
|
---|
753 | $ref = $offset; # now a reference to the containing stream object
|
---|
754 | $obj = $streamObjs{$ref};
|
---|
755 | unless ($obj) {
|
---|
756 | # don't try to load the same object stream twice
|
---|
757 | return undef if defined $obj;
|
---|
758 | $streamObjs{$ref} = '';
|
---|
759 | # load the parent object stream
|
---|
760 | $obj = FetchObject($et, $ref, $xref, $tag);
|
---|
761 | # make sure it contains everything we need
|
---|
762 | return undef unless defined $obj and ref($obj) eq 'HASH';
|
---|
763 | return undef unless $$obj{First} and $$obj{N};
|
---|
764 | return undef unless DecodeStream($et, $obj);
|
---|
765 | # add a special '_table' entry to this dictionary which contains
|
---|
766 | # the list of object number/offset pairs from the stream header
|
---|
767 | my $num = $$obj{N} * 2;
|
---|
768 | my @table = split ' ', $$obj{_stream}, $num;
|
---|
769 | return undef unless @table == $num;
|
---|
770 | # remove everything before first object in stream
|
---|
771 | $$obj{_stream} = substr($$obj{_stream}, $$obj{First});
|
---|
772 | $table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number
|
---|
773 | $$obj{_table} = \@table;
|
---|
774 | # save the object stream so we don't have to re-load it later
|
---|
775 | $streamObjs{$ref} = $obj;
|
---|
776 | }
|
---|
777 | # verify that we have the specified object
|
---|
778 | my $i = 2 * $index;
|
---|
779 | my $table = $$obj{_table};
|
---|
780 | unless ($index < $$obj{N} and $$table[$i] == $objNum) {
|
---|
781 | $et->Warn("Bad index for stream object $tag");
|
---|
782 | return undef;
|
---|
783 | }
|
---|
784 | # extract the object at the specified index in the stream
|
---|
785 | # (offsets in table are in sequential order, so we can subtract from
|
---|
786 | # the next offset to get the object length)
|
---|
787 | $offset = $$table[$i + 1];
|
---|
788 | my $len = ($$table[$i + 3] || length($$obj{_stream})) - $offset;
|
---|
789 | $data = substr($$obj{_stream}, $offset, $len);
|
---|
790 | # avoid re-decrypting data in already decrypted streams
|
---|
791 | undef $lastFetched if $cryptStream;
|
---|
792 | return ExtractObject($et, \$data);
|
---|
793 | }
|
---|
794 | # load the start of the object
|
---|
795 | $data = CheckObject($et, $tag, $ref, $offset);
|
---|
796 | return undef unless defined $data;
|
---|
797 |
|
---|
798 | return ExtractObject($et, \$data, $$et{RAF}, $xref);
|
---|
799 | }
|
---|
800 |
|
---|
801 | #------------------------------------------------------------------------------
|
---|
802 | # Convert PDF value to something readable
|
---|
803 | # Inputs: 0) PDF object data
|
---|
804 | # Returns: converted object
|
---|
805 | sub ReadPDFValue($)
|
---|
806 | {
|
---|
807 | my $str = shift;
|
---|
808 | # decode all strings in an array
|
---|
809 | if (ref $str eq 'ARRAY') {
|
---|
810 | # create new list to not alter the original data when rewriting
|
---|
811 | my ($val, @vals);
|
---|
812 | foreach $val (@$str) {
|
---|
813 | push @vals, ReadPDFValue($val);
|
---|
814 | }
|
---|
815 | return \@vals;
|
---|
816 | }
|
---|
817 | length $str or return $str;
|
---|
818 | my $delim = substr($str, 0, 1);
|
---|
819 | if ($delim eq '(') { # literal string
|
---|
820 | $str = $1 if $str =~ /^.*?\((.*)\)/s; # remove brackets
|
---|
821 | # decode escape sequences in literal strings
|
---|
822 | while ($str =~ /\\(.)/sg) {
|
---|
823 | my $n = pos($str) - 2;
|
---|
824 | my $c = $1;
|
---|
825 | my $r;
|
---|
826 | if ($c =~ /[0-7]/) {
|
---|
827 | # get up to 2 more octal digits
|
---|
828 | $c .= $1 if $str =~ /\G([0-7]{1,2})/g;
|
---|
829 | # convert octal escape code
|
---|
830 | $r = chr(oct($c) & 0xff);
|
---|
831 | } elsif ($c eq "\x0d") {
|
---|
832 | # the string is continued if the line ends with '\'
|
---|
833 | # (also remove "\x0d\x0a")
|
---|
834 | $c .= $1 if $str =~ /\G(\x0a)/g;
|
---|
835 | $r = '';
|
---|
836 | } elsif ($c eq "\x0a") {
|
---|
837 | $r = '';
|
---|
838 | } else {
|
---|
839 | # convert escaped characters
|
---|
840 | ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/;
|
---|
841 | }
|
---|
842 | substr($str, $n, length($c)+1) = $r;
|
---|
843 | # continue search after this character
|
---|
844 | pos($str) = $n + length($r);
|
---|
845 | }
|
---|
846 | Crypt(\$str, $lastFetched) if $cryptString;
|
---|
847 | } elsif ($delim eq '<') { # hex string
|
---|
848 | # decode hex data
|
---|
849 | $str =~ tr/0-9A-Fa-f//dc;
|
---|
850 | $str .= '0' if length($str) & 0x01; # (by the spec)
|
---|
851 | $str = pack('H*', $str);
|
---|
852 | Crypt(\$str, $lastFetched) if $cryptString;
|
---|
853 | } elsif ($delim eq '/') { # name
|
---|
854 | $str = substr($str, 1);
|
---|
855 | # convert escape codes (PDF 1.2 or later)
|
---|
856 | $str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2;
|
---|
857 | }
|
---|
858 | return $str;
|
---|
859 | }
|
---|
860 |
|
---|
861 | #------------------------------------------------------------------------------
|
---|
862 | # Extract PDF object from combination of buffered data and file
|
---|
863 | # Inputs: 0) ExifTool object reference, 1) data reference,
|
---|
864 | # 2) optional raf reference, 3) optional xref table
|
---|
865 | # Returns: converted PDF object or undef on error
|
---|
866 | # a) dictionary object --> hash reference
|
---|
867 | # b) array object --> array reference
|
---|
868 | # c) indirect reference --> scalar reference
|
---|
869 | # d) string, name, integer, boolean, null --> scalar value
|
---|
870 | # - updates $$dataPt on return to contain unused data
|
---|
871 | # - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent
|
---|
872 | # the stream data and a list of the tags (not including '_stream' and '_tags')
|
---|
873 | # in their original order
|
---|
874 | sub ExtractObject($$;$$)
|
---|
875 | {
|
---|
876 | my ($et, $dataPt, $raf, $xref) = @_;
|
---|
877 | my (@tags, $data, $objData);
|
---|
878 | my $dict = { };
|
---|
879 | my $delim;
|
---|
880 |
|
---|
881 | for (;;) {
|
---|
882 | if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/s) {
|
---|
883 | $delim = $1;
|
---|
884 | $$dataPt =~ s/^\s+//; # remove leading white space
|
---|
885 | $objData = ReadToNested($dataPt, $raf);
|
---|
886 | return undef unless defined $objData;
|
---|
887 | last;
|
---|
888 | } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}s) {
|
---|
889 | #
|
---|
890 | # extract boolean, numerical, string, name, null object or indirect reference
|
---|
891 | #
|
---|
892 | $objData = $1;
|
---|
893 | # look for an indirect reference
|
---|
894 | if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) {
|
---|
895 | $objData .= "$1 R";
|
---|
896 | $objData = \$objData; # return scalar reference
|
---|
897 | }
|
---|
898 | return $objData; # return simple scalar or scalar reference
|
---|
899 | }
|
---|
900 | $raf and $raf->ReadLine($data) or return undef;
|
---|
901 | $$dataPt .= $data;
|
---|
902 | }
|
---|
903 | #
|
---|
904 | # return literal string or hex string without parsing
|
---|
905 | #
|
---|
906 | if ($delim eq '(' or $delim eq '<') {
|
---|
907 | return $objData;
|
---|
908 | #
|
---|
909 | # extract array
|
---|
910 | #
|
---|
911 | } elsif ($delim eq '[') {
|
---|
912 | $objData =~ /^.*?\[(.*)\]/s or return undef;
|
---|
913 | my $data = $1; # brackets removed
|
---|
914 | my @list;
|
---|
915 | for (;;) {
|
---|
916 | last unless $data =~ m{\s*(\S[^[(/<>\s]*)}sg;
|
---|
917 | my $val = $1;
|
---|
918 | if ($val =~ /^(<{1,2}|\[|\()/) {
|
---|
919 | my $pos = pos($data) - length($val);
|
---|
920 | # nested dict, array, literal string or hex string
|
---|
921 | my $buff = substr($data, $pos);
|
---|
922 | $val = ReadToNested(\$buff);
|
---|
923 | last unless defined $val;
|
---|
924 | pos($data) = $pos + length($val);
|
---|
925 | $val = ExtractObject($et, \$val);
|
---|
926 | } elsif ($val =~ /^\d/) {
|
---|
927 | my $pos = pos($data);
|
---|
928 | if ($data =~ /\G\s+(\d+)\s+R/g) {
|
---|
929 | $val = \ "$val $1 R"; # make a reference
|
---|
930 | } else {
|
---|
931 | pos($data) = $pos;
|
---|
932 | }
|
---|
933 | }
|
---|
934 | push @list, $val;
|
---|
935 | }
|
---|
936 | return \@list;
|
---|
937 | }
|
---|
938 | #
|
---|
939 | # extract dictionary
|
---|
940 | #
|
---|
941 | # Note: entries are not necessarily separated by whitespace (doh!)
|
---|
942 | # eg) "/Tag/Name", "/Tag(string)", "/Tag[array]", etc are legal!
|
---|
943 | # Also, they may be separated by a comment (eg. "/Tag%comment\nValue"),
|
---|
944 | # but comments have already been removed
|
---|
945 | while ($objData =~ m{(\s*)/([^/[\]()<>{}\s]+)\s*(\S[^[(/<>\s]*)}sg) {
|
---|
946 | my $tag = $2;
|
---|
947 | my $val = $3;
|
---|
948 | if ($val =~ /^(<{1,2}|\[|\()/) {
|
---|
949 | # nested dict, array, literal string or hex string
|
---|
950 | $objData = substr($objData, pos($objData)-length($val));
|
---|
951 | $val = ReadToNested(\$objData, $raf);
|
---|
952 | last unless defined $val;
|
---|
953 | $val = ExtractObject($et, \$val);
|
---|
954 | pos($objData) = 0;
|
---|
955 | } elsif ($val =~ /^\d/) {
|
---|
956 | my $pos = pos($objData);
|
---|
957 | if ($objData =~ /\G\s+(\d+)\s+R/sg) {
|
---|
958 | $val = \ "$val $1 R"; # make a reference
|
---|
959 | } else {
|
---|
960 | pos($objData) = $pos;
|
---|
961 | }
|
---|
962 | }
|
---|
963 | if ($$dict{$tag}) {
|
---|
964 | # duplicate dictionary entries are not allowed
|
---|
965 | $et->Warn("Duplicate '${tag}' entry in dictionary (ignored)");
|
---|
966 | } else {
|
---|
967 | # save the entry
|
---|
968 | push @tags, $tag;
|
---|
969 | $$dict{$tag} = $val;
|
---|
970 | }
|
---|
971 | }
|
---|
972 | return undef unless @tags;
|
---|
973 | $$dict{_tags} = \@tags;
|
---|
974 | return $dict unless $raf; # direct objects can not have streams
|
---|
975 | #
|
---|
976 | # extract the stream object
|
---|
977 | #
|
---|
978 | # dictionary must specify stream Length
|
---|
979 | my $length = $$dict{Length} or return $dict;
|
---|
980 | if (ref $length) {
|
---|
981 | $length = $$length;
|
---|
982 | my $oldpos = $raf->Tell();
|
---|
983 | # get the location of the object specifying the length
|
---|
984 | # (compressed objects are not allowed)
|
---|
985 | my $offset = LocateObject($xref, $length) or return $dict;
|
---|
986 | $offset or $et->Warn('Bad stream Length object'), return $dict;
|
---|
987 | $data = CheckObject($et, 'stream Length', $length, $offset);
|
---|
988 | defined $data or return $dict;
|
---|
989 | $data =~ /^\s*(\d+)/ or $et->Warn('Stream Length not found'), return $dict;
|
---|
990 | $length = $1;
|
---|
991 | $raf->Seek($oldpos, 0); # restore position to start of stream
|
---|
992 | }
|
---|
993 | # extract the trailing stream data
|
---|
994 | for (;;) {
|
---|
995 | # find the stream token
|
---|
996 | if ($$dataPt =~ /(\S+)/) {
|
---|
997 | last unless $1 eq 'stream';
|
---|
998 | # read an extra line because it may contain our \x0a
|
---|
999 | $$dataPt .= $data if $raf->ReadLine($data);
|
---|
1000 | # remove our stream header
|
---|
1001 | $$dataPt =~ s/^\s*stream(\x0a|\x0d\x0a)//s;
|
---|
1002 | my $more = $length - length($$dataPt);
|
---|
1003 | if ($more > 0) {
|
---|
1004 | unless ($raf->Read($data, $more) == $more) {
|
---|
1005 | $et->Warn('Error reading stream data');
|
---|
1006 | $$dataPt = '';
|
---|
1007 | return $dict;
|
---|
1008 | }
|
---|
1009 | $$dict{_stream} = $$dataPt . $data;
|
---|
1010 | $$dataPt = '';
|
---|
1011 | } elsif ($more < 0) {
|
---|
1012 | $$dict{_stream} = substr($$dataPt, 0, $length);
|
---|
1013 | $$dataPt = substr($$dataPt, $length);
|
---|
1014 | } else {
|
---|
1015 | $$dict{_stream} = $$dataPt;
|
---|
1016 | $$dataPt = '';
|
---|
1017 | }
|
---|
1018 | last;
|
---|
1019 | }
|
---|
1020 | $raf->ReadLine($data) or last;
|
---|
1021 | $$dataPt .= $data;
|
---|
1022 | }
|
---|
1023 | return $dict;
|
---|
1024 | }
|
---|
1025 |
|
---|
1026 | #------------------------------------------------------------------------------
|
---|
1027 | # Read to nested delimiter
|
---|
1028 | # Inputs: 0) data reference, 1) optional raf reference
|
---|
1029 | # Returns: data up to and including matching delimiter (or undef on error)
|
---|
1030 | # - updates data reference with trailing data
|
---|
1031 | # - unescapes characters in literal strings
|
---|
1032 | my %closingDelim = ( # lookup for matching delimiter
|
---|
1033 | '(' => ')',
|
---|
1034 | '[' => ']',
|
---|
1035 | '<' => '>',
|
---|
1036 | '<<' => '>>',
|
---|
1037 | );
|
---|
1038 | sub ReadToNested($;$)
|
---|
1039 | {
|
---|
1040 | my ($dataPt, $raf) = @_;
|
---|
1041 | my @delim = (''); # closing delimiter list, most deeply nested first
|
---|
1042 | pos($$dataPt) = 0; # begin at start of data
|
---|
1043 | for (;;) {
|
---|
1044 | unless ($$dataPt =~ /(\\*)(\(|\)|<{1,2}|>{1,2}|\[|\]|%)/g) {
|
---|
1045 | # must read some more data
|
---|
1046 | my $buff;
|
---|
1047 | last unless $raf and $raf->ReadLine($buff);
|
---|
1048 | $$dataPt .= $buff;
|
---|
1049 | pos($$dataPt) = length($$dataPt) - length($buff);
|
---|
1050 | next;
|
---|
1051 | }
|
---|
1052 | # are we in a literal string?
|
---|
1053 | if ($delim[0] eq ')') {
|
---|
1054 | # ignore escaped delimiters (preceded by odd number of \'s)
|
---|
1055 | next if length($1) & 0x01;
|
---|
1056 | # ignore all delimiters but unescaped braces
|
---|
1057 | next unless $2 eq '(' or $2 eq ')';
|
---|
1058 | } elsif ($2 eq '%') {
|
---|
1059 | # ignore the comment
|
---|
1060 | my $pos = pos($$dataPt) - 1;
|
---|
1061 | # remove everything from '%' up to but not including newline
|
---|
1062 | $$dataPt =~ /.*/g;
|
---|
1063 | my $end = pos($$dataPt);
|
---|
1064 | $$dataPt = substr($$dataPt, 0, $pos) . substr($$dataPt, $end);
|
---|
1065 | pos($$dataPt) = $pos;
|
---|
1066 | next;
|
---|
1067 | }
|
---|
1068 | if ($closingDelim{$2}) {
|
---|
1069 | # push the corresponding closing delimiter
|
---|
1070 | unshift @delim, $closingDelim{$2};
|
---|
1071 | next;
|
---|
1072 | }
|
---|
1073 | unless ($2 eq $delim[0]) {
|
---|
1074 | # handle the case where we find a ">>>" and interpret it
|
---|
1075 | # as ">> >" instead of "> >>"
|
---|
1076 | next unless $2 eq '>>' and $delim[0] eq '>';
|
---|
1077 | pos($$dataPt) = pos($$dataPt) - 1;
|
---|
1078 | }
|
---|
1079 | shift @delim; # remove from nesting list
|
---|
1080 | next if $delim[0]; # keep going if we have more nested delimiters
|
---|
1081 | my $pos = pos($$dataPt);
|
---|
1082 | my $buff = substr($$dataPt, 0, $pos);
|
---|
1083 | $$dataPt = substr($$dataPt, $pos);
|
---|
1084 | return $buff; # success!
|
---|
1085 | }
|
---|
1086 | return undef; # didn't find matching delimiter
|
---|
1087 | }
|
---|
1088 |
|
---|
1089 | #------------------------------------------------------------------------------
|
---|
1090 | # Decode LZW-encoded data (ref 1)
|
---|
1091 | # Inputs: 0) data reference
|
---|
1092 | # Returns: true on success and data is decoded, or false and data is untouched
|
---|
1093 | sub DecodeLZW($)
|
---|
1094 | {
|
---|
1095 | my $dataPt = shift;
|
---|
1096 | return 0 if length $$dataPt < 4;
|
---|
1097 | my @lzw = (map(chr, 0..255), undef, undef); # LZW code table
|
---|
1098 | my $mask = 0x01ff; # mask for least-significant 9 bits
|
---|
1099 | my @dat = unpack 'n*', $$dataPt . "\0";
|
---|
1100 | my $word = ($dat[0] << 16) | $dat[1];
|
---|
1101 | my ($bit, $pos, $bits, $out) = (0, 2, 9, '');
|
---|
1102 | my $lastVal;
|
---|
1103 | for (;;) {
|
---|
1104 | # bits are packed MSB first in PDF LZW (the PDF spec doesn't mention this)
|
---|
1105 | my $shift = 32 - ($bit + $bits);
|
---|
1106 | if ($shift < 0) {
|
---|
1107 | return 0 if $pos >= @dat; # missing EOD marker
|
---|
1108 | $word = (($word & 0xffff) << 16) | $dat[$pos++]; # read next word
|
---|
1109 | $bit -= 16;
|
---|
1110 | $shift += 16;
|
---|
1111 | };
|
---|
1112 | my $code = ($word >> $shift) & $mask;
|
---|
1113 | $bit += $bits;
|
---|
1114 | my $val = $lzw[$code];
|
---|
1115 | if (defined $val) {
|
---|
1116 | # store new code as previous sequence plus 1st char of new sequence
|
---|
1117 | push @lzw, $lastVal . substr($val, 0, 1) if defined $lastVal;
|
---|
1118 | } elsif ($code == @lzw) { # new code
|
---|
1119 | return 0 unless defined $lastVal;
|
---|
1120 | # we are using the code that we are about to generate, so the last
|
---|
1121 | # character in the new sequence must be the same as the first
|
---|
1122 | # character in the previous sequence (makes sense if you think about it)
|
---|
1123 | $val = $lastVal . substr($lastVal, 0, 1);
|
---|
1124 | push @lzw, $val;
|
---|
1125 | } elsif ($code == 256) { # clear table
|
---|
1126 | splice @lzw, 258;
|
---|
1127 | $bits = 9;
|
---|
1128 | $mask = 0x1ff;
|
---|
1129 | undef $lastVal;
|
---|
1130 | next;
|
---|
1131 | } elsif ($code == 257) { # EOD marker
|
---|
1132 | last; # all done!
|
---|
1133 | } else {
|
---|
1134 | return 0;
|
---|
1135 | }
|
---|
1136 | $out .= $val; # add this byte sequence to the output
|
---|
1137 | # we added a new entry to the LZW table, so we must increase
|
---|
1138 | # the bit width if necessary, up to a maximum of 12
|
---|
1139 | @lzw >= $mask and $bits < 12 and ++$bits, $mask |= $mask << 1;
|
---|
1140 | $lastVal = $val;
|
---|
1141 | }
|
---|
1142 | $$dataPt = $out; # return decompressed data
|
---|
1143 | return 1;
|
---|
1144 | }
|
---|
1145 |
|
---|
1146 | #------------------------------------------------------------------------------
|
---|
1147 | # Decode filtered stream
|
---|
1148 | # Inputs: 0) ExifTool object reference, 1) dictionary reference
|
---|
1149 | # Returns: true if stream has been decoded OK
|
---|
1150 | sub DecodeStream($$)
|
---|
1151 | {
|
---|
1152 | local $_;
|
---|
1153 | my ($et, $dict) = @_;
|
---|
1154 |
|
---|
1155 | return 0 unless $$dict{_stream}; # no stream to decode
|
---|
1156 |
|
---|
1157 | # get list of filters
|
---|
1158 | my (@filters, @decodeParms, $filter);
|
---|
1159 | if (ref $$dict{Filter} eq 'ARRAY') {
|
---|
1160 | @filters = @{$$dict{Filter}};
|
---|
1161 | } elsif (defined $$dict{Filter}) {
|
---|
1162 | @filters = ($$dict{Filter});
|
---|
1163 | }
|
---|
1164 | # be sure we can process all the filters before we take the time to do the decryption
|
---|
1165 | foreach $filter (@filters) {
|
---|
1166 | next if $supportedFilter{$filter};
|
---|
1167 | $et->WarnOnce("Unsupported Filter $filter");
|
---|
1168 | return 0;
|
---|
1169 | }
|
---|
1170 | # apply decryption first if required (and if the default encryption
|
---|
1171 | # has not been overridden by a Crypt filter. Note: the Crypt filter
|
---|
1172 | # must be first in the Filter array: ref 3, page 38)
|
---|
1173 | unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) {
|
---|
1174 | CryptStream($dict, $lastFetched);
|
---|
1175 | }
|
---|
1176 | return 1 unless $$dict{Filter}; # Filter entry is mandatory
|
---|
1177 | return 0 if defined $$dict{_filtered}; # avoid double-filtering
|
---|
1178 | $$dict{_filtered} = 1; # set flag to prevent double-filtering
|
---|
1179 |
|
---|
1180 | # get array of DecodeParms dictionaries
|
---|
1181 | if (ref $$dict{DecodeParms} eq 'ARRAY') {
|
---|
1182 | @decodeParms = @{$$dict{DecodeParms}};
|
---|
1183 | } else {
|
---|
1184 | @decodeParms = ($$dict{DecodeParms});
|
---|
1185 | }
|
---|
1186 |
|
---|
1187 | foreach $filter (@filters) {
|
---|
1188 | my $decodeParms = shift @decodeParms;
|
---|
1189 |
|
---|
1190 | if ($filter eq '/FlateDecode') {
|
---|
1191 | # make sure we support the predictor (if used) before decoding
|
---|
1192 | my $pre;
|
---|
1193 | if (ref $decodeParms eq 'HASH') {
|
---|
1194 | $pre = $$decodeParms{Predictor};
|
---|
1195 | if ($pre and $pre ne '1' and $pre ne '12') {
|
---|
1196 | $et->WarnOnce("FlateDecode Predictor $pre currently not supported");
|
---|
1197 | return 0;
|
---|
1198 | }
|
---|
1199 | }
|
---|
1200 | if (eval { require Compress::Zlib }) {
|
---|
1201 | my $inflate = Compress::Zlib::inflateInit();
|
---|
1202 | my ($buff, $stat);
|
---|
1203 | $inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream});
|
---|
1204 | if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) {
|
---|
1205 | $$dict{_stream} = $buff;
|
---|
1206 | } else {
|
---|
1207 | $et->Warn('Error inflating stream');
|
---|
1208 | return 0;
|
---|
1209 | }
|
---|
1210 | } else {
|
---|
1211 | $et->WarnOnce('Install Compress::Zlib to process filtered streams');
|
---|
1212 | return 0;
|
---|
1213 | }
|
---|
1214 | next unless $pre and $pre eq '12'; # 12 = 'up' prediction
|
---|
1215 |
|
---|
1216 | # apply anti-predictor
|
---|
1217 | my $cols = $$decodeParms{Columns};
|
---|
1218 | unless ($cols) {
|
---|
1219 | # currently only support 'up' prediction
|
---|
1220 | $et->WarnOnce('No Columns for decoding stream');
|
---|
1221 | return 0;
|
---|
1222 | }
|
---|
1223 | my @bytes = unpack('C*', $$dict{_stream});
|
---|
1224 | my @pre = (0) x $cols; # initialize predictor array
|
---|
1225 | my $buff = '';
|
---|
1226 | while (@bytes > $cols) {
|
---|
1227 | unless (($_ = shift @bytes) == 2) {
|
---|
1228 | $et->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG)
|
---|
1229 | return 0;
|
---|
1230 | }
|
---|
1231 | foreach (@pre) {
|
---|
1232 | $_ = ($_ + shift(@bytes)) & 0xff;
|
---|
1233 | }
|
---|
1234 | $buff .= pack('C*', @pre);
|
---|
1235 | }
|
---|
1236 | $$dict{_stream} = $buff;
|
---|
1237 |
|
---|
1238 | } elsif ($filter eq '/Crypt') {
|
---|
1239 |
|
---|
1240 | # (we shouldn't have to check the _decrypted flag since we
|
---|
1241 | # already checked the _filtered flag, but what the heck...)
|
---|
1242 | next if defined $$dict{_decrypted};
|
---|
1243 | # assume Identity filter (the default) if DecodeParms are missing
|
---|
1244 | next unless ref $decodeParms eq 'HASH';
|
---|
1245 | my $name = $$decodeParms{Name};
|
---|
1246 | next unless defined $name or $name eq 'Identity';
|
---|
1247 | if ($name ne 'StdCF') {
|
---|
1248 | $et->WarnOnce("Unsupported Crypt Filter $name");
|
---|
1249 | return 0;
|
---|
1250 | }
|
---|
1251 | unless ($cryptInfo) {
|
---|
1252 | $et->WarnOnce('Missing Encrypt StdCF entry');
|
---|
1253 | return 0;
|
---|
1254 | }
|
---|
1255 | # decrypt the stream manually because we want to:
|
---|
1256 | # 1) ignore $cryptStream (StmF) setting
|
---|
1257 | # 2) ignore EncryptMetadata setting (I can't find mention of how to
|
---|
1258 | # reconcile this in the spec., but this would make sense)
|
---|
1259 | # 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b)
|
---|
1260 | # 4) set _decrypted flag so we will recrypt according to StmF when
|
---|
1261 | # writing (since we don't yet write Filter'd streams)
|
---|
1262 | Crypt(\$$dict{_stream}, 'none');
|
---|
1263 | $$dict{_decrypted} = ($cryptStream ? 1 : 0);
|
---|
1264 |
|
---|
1265 | } elsif ($filter eq '/LZWDecode') {
|
---|
1266 |
|
---|
1267 | # make sure we don't have any unsupported decoding parameters
|
---|
1268 | if (ref $decodeParms eq 'HASH') {
|
---|
1269 | if ($$decodeParms{Predictor}) {
|
---|
1270 | $et->WarnOnce("LZWDecode Predictor $$decodeParms{Predictor} currently not supported");
|
---|
1271 | return 0;
|
---|
1272 | } elsif ($$decodeParms{EarlyChange}) {
|
---|
1273 | $et->WarnOnce("LZWDecode EarlyChange currently not supported");
|
---|
1274 | return 0;
|
---|
1275 | }
|
---|
1276 | }
|
---|
1277 | unless (DecodeLZW(\$$dict{_stream})) {
|
---|
1278 | $et->WarnOnce('LZW decompress error');
|
---|
1279 | return 0;
|
---|
1280 | }
|
---|
1281 |
|
---|
1282 | } elsif ($filter eq '/ASCIIHexDecode') {
|
---|
1283 |
|
---|
1284 | $$dict{_stream} =~ s/>.*//; # truncate at '>' (end of data mark)
|
---|
1285 | $$dict{_stream} =~ tr/0-9a-zA-Z//d; # remove illegal characters
|
---|
1286 | $$dict{_stream} = pack 'H*', $$dict{_stream};
|
---|
1287 |
|
---|
1288 | } elsif ($filter eq '/ASCII85Decode') {
|
---|
1289 |
|
---|
1290 | my ($err, @out, $i);
|
---|
1291 | my ($n, $val) = (0, 0);
|
---|
1292 | foreach (split //, $$dict{_stream}) {
|
---|
1293 | if ($_ ge '!' and $_ le 'u') {;
|
---|
1294 | $val = 85 * $val + ord($_) - 33;
|
---|
1295 | next unless ++$n == 5;
|
---|
1296 | } elsif ($_ eq '~') {
|
---|
1297 | $n == 1 and $err = 1; # error to have a single char in the last group of 5
|
---|
1298 | for ($i=$n; $i<5; ++$i) { $val *= 85; }
|
---|
1299 | } elsif ($_ eq 'z') {
|
---|
1300 | $n and $err = 2, last; # error if 'z' isn't the first char
|
---|
1301 | $n = 5;
|
---|
1302 | } else {
|
---|
1303 | next if /^\s$/; # ignore white space
|
---|
1304 | $err = 3, last; # any other character is an error
|
---|
1305 | }
|
---|
1306 | $val = unpack('V', pack('N', $val)); # reverse byte order
|
---|
1307 | while (--$n > 0) {
|
---|
1308 | push @out, $val & 0xff;
|
---|
1309 | $val >>= 8;
|
---|
1310 | }
|
---|
1311 | last if $_ eq '~';
|
---|
1312 | # (both $n and $val are zero again now)
|
---|
1313 | }
|
---|
1314 | $err and $et->WarnOnce("ASCII85Decode error $err");
|
---|
1315 | $$dict{_stream} = pack('C*', @out);
|
---|
1316 | }
|
---|
1317 | }
|
---|
1318 | return 1;
|
---|
1319 | }
|
---|
1320 |
|
---|
1321 | #------------------------------------------------------------------------------
|
---|
1322 | # Initialize state for RC4 en/decryption (ref 2)
|
---|
1323 | # Inputs: 0) RC4 key string
|
---|
1324 | # Returns: RC4 key hash reference
|
---|
1325 | sub RC4Init($)
|
---|
1326 | {
|
---|
1327 | my @key = unpack('C*', shift);
|
---|
1328 | my @state = (0 .. 255);
|
---|
1329 | my ($i, $j) = (0, 0);
|
---|
1330 | while ($i < 256) {
|
---|
1331 | my $st = $state[$i];
|
---|
1332 | $j = ($j + $st + $key[$i % scalar(@key)]) & 0xff;
|
---|
1333 | $state[$i++] = $state[$j];
|
---|
1334 | $state[$j] = $st;
|
---|
1335 | }
|
---|
1336 | return { State => \@state, XY => [ 0, 0 ] };
|
---|
1337 | }
|
---|
1338 |
|
---|
1339 | #------------------------------------------------------------------------------
|
---|
1340 | # Apply RC4 en/decryption (ref 2)
|
---|
1341 | # Inputs: 0) data reference, 1) RC4 key hash reference or RC4 key string
|
---|
1342 | # - can call this method directly with a key string, or with with the key
|
---|
1343 | # reference returned by RC4Init
|
---|
1344 | # - RC4 is a symmetric algorithm, so encryption is the same as decryption
|
---|
1345 | sub RC4Crypt($$)
|
---|
1346 | {
|
---|
1347 | my ($dataPt, $key) = @_;
|
---|
1348 | $key = RC4Init($key) unless ref $key eq 'HASH';
|
---|
1349 | my $state = $$key{State};
|
---|
1350 | my ($x, $y) = @{$$key{XY}};
|
---|
1351 |
|
---|
1352 | my @data = unpack('C*', $$dataPt);
|
---|
1353 | foreach (@data) {
|
---|
1354 | $x = ($x + 1) & 0xff;
|
---|
1355 | my $stx = $$state[$x];
|
---|
1356 | $y = ($stx + $y) & 0xff;
|
---|
1357 | my $sty = $$state[$x] = $$state[$y];
|
---|
1358 | $$state[$y] = $stx;
|
---|
1359 | $_ ^= $$state[($stx + $sty) & 0xff];
|
---|
1360 | }
|
---|
1361 | $$key{XY} = [ $x, $y ];
|
---|
1362 | $$dataPt = pack('C*', @data);
|
---|
1363 | }
|
---|
1364 |
|
---|
1365 | #------------------------------------------------------------------------------
|
---|
1366 | # Update AES cipher with a bit of data
|
---|
1367 | # Inputs: 0) data
|
---|
1368 | # Returns: encrypted data
|
---|
1369 | my $cipherMore;
|
---|
1370 | sub CipherUpdate($)
|
---|
1371 | {
|
---|
1372 | my $dat = shift;
|
---|
1373 | my $pos = 0;
|
---|
1374 | $dat = $cipherMore . $dat if length $dat;
|
---|
1375 | while ($pos + 16 <= length($dat)) {
|
---|
1376 | substr($dat,$pos,16) = Image::ExifTool::AES::Cipher(substr($dat,$pos,16));
|
---|
1377 | $pos += 16;
|
---|
1378 | }
|
---|
1379 | if ($pos < length $dat) {
|
---|
1380 | $cipherMore = substr($dat,$pos);
|
---|
1381 | $dat = substr($dat,0,$pos);
|
---|
1382 | } else {
|
---|
1383 | $cipherMore = '';
|
---|
1384 | }
|
---|
1385 | return $dat;
|
---|
1386 | }
|
---|
1387 |
|
---|
1388 | #------------------------------------------------------------------------------
|
---|
1389 | # Get encrypted hash
|
---|
1390 | # Inputs: 0) Password, 1) salt, 2) vector, 3) encryption revision
|
---|
1391 | # Returns: hash
|
---|
1392 | sub GetHash($$$$)
|
---|
1393 | {
|
---|
1394 | my ($password, $salt, $vector, $rev) = @_;
|
---|
1395 |
|
---|
1396 | # return Rev 5 hash
|
---|
1397 | return Digest::SHA::sha256($password, $salt, $vector) if $rev == 5;
|
---|
1398 |
|
---|
1399 | # compute Rev 6 hardened hash
|
---|
1400 | # (ref http://code.google.com/p/origami-pdf/source/browse/lib/origami/encryption.rb)
|
---|
1401 | my $blockSize = 32;
|
---|
1402 | my $input = Digest::SHA::sha256($password, $salt, $vector) . ("\0" x 32);
|
---|
1403 | my $key = substr($input, 0, 16);
|
---|
1404 | my $iv = substr($input, 16, 16);
|
---|
1405 | my $h;
|
---|
1406 | my $x = '';
|
---|
1407 | my $i = 0;
|
---|
1408 | while ($i < 64 or $i < ord(substr($x,-1,1))+32) {
|
---|
1409 |
|
---|
1410 | my $block = substr($input, 0, $blockSize);
|
---|
1411 | $x = '';
|
---|
1412 | Image::ExifTool::AES::Crypt(\$x, $key, $iv, 1);
|
---|
1413 | $cipherMore = '';
|
---|
1414 |
|
---|
1415 | my ($j, $digest);
|
---|
1416 | for ($j=0; $j<64; ++$j) {
|
---|
1417 | $x = '';
|
---|
1418 | $x .= CipherUpdate($password) if length $password;
|
---|
1419 | $x .= CipherUpdate($block);
|
---|
1420 | $x .= CipherUpdate($vector) if length $vector;
|
---|
1421 | if ($j == 0) {
|
---|
1422 | my @a = unpack('C16', $x);
|
---|
1423 | my $sum = 0;
|
---|
1424 | $sum += $_ foreach @a;
|
---|
1425 | # set SHA block size (32, 48 or 64 bytes = SHA-256, 384 or 512)
|
---|
1426 | $blockSize = 32 + ($sum % 3) * 16;
|
---|
1427 | $digest = Digest::SHA->new($blockSize * 8);
|
---|
1428 | }
|
---|
1429 | $digest->add($x);
|
---|
1430 | }
|
---|
1431 |
|
---|
1432 | $h = $digest->digest();
|
---|
1433 | $key = substr($h, 0, 16);
|
---|
1434 | substr($input,0,16) = $h;
|
---|
1435 | $iv = substr($h, 16, 16);
|
---|
1436 | ++$i;
|
---|
1437 | }
|
---|
1438 | return substr($h, 0, 32);
|
---|
1439 | }
|
---|
1440 |
|
---|
1441 | #------------------------------------------------------------------------------
|
---|
1442 | # Initialize decryption
|
---|
1443 | # Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference,
|
---|
1444 | # 2) ID from file trailer dictionary
|
---|
1445 | # Returns: error string or undef on success (and sets $cryptInfo)
|
---|
1446 | sub DecryptInit($$$)
|
---|
1447 | {
|
---|
1448 | local $_;
|
---|
1449 | my ($et, $encrypt, $id) = @_;
|
---|
1450 |
|
---|
1451 | undef $cryptInfo;
|
---|
1452 | unless ($encrypt and ref $encrypt eq 'HASH') {
|
---|
1453 | return 'Error loading Encrypt object';
|
---|
1454 | }
|
---|
1455 | my $filt = $$encrypt{Filter};
|
---|
1456 | unless ($filt and $filt =~ s/^\///) {
|
---|
1457 | return 'Encrypt dictionary has no Filter!';
|
---|
1458 | }
|
---|
1459 | # extract some interesting tags
|
---|
1460 | my $ver = $$encrypt{V} || 0;
|
---|
1461 | my $rev = $$encrypt{R} || 0;
|
---|
1462 | my $enc = "$filt V$ver";
|
---|
1463 | $enc .= ".$rev" if $filt eq 'Standard';
|
---|
1464 | $enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/;
|
---|
1465 | $enc .= ' (' . ($$encrypt{Length} || 40) . '-bit)' if $filt eq 'Standard';
|
---|
1466 | my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt');
|
---|
1467 | $et->HandleTag($tagTablePtr, 'Filter', $enc);
|
---|
1468 | if ($filt ne 'Standard') {
|
---|
1469 | return "Encryption filter $filt currently not supported";
|
---|
1470 | } elsif (not defined $$encrypt{R}) {
|
---|
1471 | return 'Standard security handler missing revision';
|
---|
1472 | }
|
---|
1473 | unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) {
|
---|
1474 | return 'Incomplete Encrypt specification';
|
---|
1475 | }
|
---|
1476 | if ("$ver.$rev" >= 5.6) {
|
---|
1477 | # apologize for poor performance (AES is a pure Perl implementation)
|
---|
1478 | $et->Warn('Decryption is very slow for encryption V5.6 or higher', 3);
|
---|
1479 | }
|
---|
1480 | $et->HandleTag($tagTablePtr, 'P', $$encrypt{P});
|
---|
1481 |
|
---|
1482 | my %parm; # optional parameters extracted from Encrypt dictionary
|
---|
1483 |
|
---|
1484 | if ($ver == 1 or $ver == 2) {
|
---|
1485 | $cryptString = $cryptStream = 1;
|
---|
1486 | } elsif ($ver == 4 or $ver == 5) {
|
---|
1487 | # initialize our $cryptString and $cryptStream flags
|
---|
1488 | foreach ('StrF', 'StmF') {
|
---|
1489 | my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream;
|
---|
1490 | $$flagPt = $$encrypt{$_};
|
---|
1491 | undef $$flagPt if $$flagPt and $$flagPt eq '/Identity';
|
---|
1492 | return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF';
|
---|
1493 | }
|
---|
1494 | if ($cryptString or $cryptStream) {
|
---|
1495 | return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and
|
---|
1496 | ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM};
|
---|
1497 | my $cryptMeth = $$encrypt{CF}{StdCF}{CFM};
|
---|
1498 | unless ($cryptMeth =~ /^\/(V2|AESV2|AESV3)$/) {
|
---|
1499 | return "Unsupported encryption method $cryptMeth";
|
---|
1500 | }
|
---|
1501 | # set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used
|
---|
1502 | $$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2|AESV3)$/;
|
---|
1503 | }
|
---|
1504 | if ($ver == 5) {
|
---|
1505 | # validate OE and UE entries
|
---|
1506 | foreach ('OE', 'UE') {
|
---|
1507 | return "Missing Encrypt $_ entry" unless $$encrypt{$_};
|
---|
1508 | $parm{$_} = ReadPDFValue($$encrypt{$_});
|
---|
1509 | return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32;
|
---|
1510 | }
|
---|
1511 | require Image::ExifTool::AES; # will need this later
|
---|
1512 | }
|
---|
1513 | } else {
|
---|
1514 | return "Encryption version $ver currently not supported";
|
---|
1515 | }
|
---|
1516 | $id or return "Can't decrypt (no document ID)";
|
---|
1517 |
|
---|
1518 | # make sure we have the necessary libraries available
|
---|
1519 | if ($ver < 5) {
|
---|
1520 | unless (eval { require Digest::MD5 }) {
|
---|
1521 | return "Install Digest::MD5 to process encrypted PDF";
|
---|
1522 | }
|
---|
1523 | } else {
|
---|
1524 | unless (eval { require Digest::SHA }) {
|
---|
1525 | return "Install Digest::SHA to process AES-256 encrypted PDF";
|
---|
1526 | }
|
---|
1527 | }
|
---|
1528 |
|
---|
1529 | # calculate file-level en/decryption key
|
---|
1530 | my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08".
|
---|
1531 | "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
|
---|
1532 | my $o = ReadPDFValue($$encrypt{O});
|
---|
1533 | my $u = ReadPDFValue($$encrypt{U});
|
---|
1534 |
|
---|
1535 | # set flag indicating whether metadata is encrypted
|
---|
1536 | # (in version 4 and higher, metadata streams may not be encrypted)
|
---|
1537 | if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) {
|
---|
1538 | $$encrypt{_meta} = 1;
|
---|
1539 | }
|
---|
1540 | # try no password first, then try provided password if available
|
---|
1541 | my ($try, $key);
|
---|
1542 | for ($try=0; ; ++$try) {
|
---|
1543 | my $password;
|
---|
1544 | if ($try == 0) {
|
---|
1545 | $password = '';
|
---|
1546 | } elsif ($try == 1) {
|
---|
1547 | $password = $et->Options('Password');
|
---|
1548 | return 'Document is password protected (use Password option)' unless defined $password;
|
---|
1549 | # make sure there is no UTF-8 flag on the password
|
---|
1550 | if ($] >= 5.006 and (eval { require Encode; Encode::is_utf8($password) } or $@)) {
|
---|
1551 | # repack by hand if Encode isn't available
|
---|
1552 | $password = $@ ? pack('C*',unpack($] < 5.010000 ? 'U0C*' : 'C0C*',$password)) : Encode::encode('utf8',$password);
|
---|
1553 | }
|
---|
1554 | } else {
|
---|
1555 | return 'Incorrect password';
|
---|
1556 | }
|
---|
1557 | if ($ver < 5) {
|
---|
1558 | if (length $password) {
|
---|
1559 | # password must be encoding in PDFDocEncoding (ref iso32000)
|
---|
1560 | $password = $et->Encode($password, 'PDFDoc');
|
---|
1561 | # truncate or pad the password to exactly 32 bytes
|
---|
1562 | if (length($password) > 32) {
|
---|
1563 | $password = substr($password, 0, 32);
|
---|
1564 | } elsif (length($password) < 32) {
|
---|
1565 | $password .= substr($pad, 0, 32-length($password));
|
---|
1566 | }
|
---|
1567 | } else {
|
---|
1568 | $password = $pad;
|
---|
1569 | }
|
---|
1570 | $key = $password . $o . pack('V', $$encrypt{P}) . $id;
|
---|
1571 | my $rep = 1;
|
---|
1572 | if ($rev == 3 or $rev == 4) {
|
---|
1573 | # must add this if metadata not encrypted
|
---|
1574 | $key .= "\xff\xff\xff\xff" unless $$encrypt{_meta};
|
---|
1575 | $rep += 50; # repeat MD5 50 more times if revision is 3 or greater
|
---|
1576 | }
|
---|
1577 | my ($len, $i, $dat);
|
---|
1578 | if ($ver == 1) {
|
---|
1579 | $len = 5;
|
---|
1580 | } else {
|
---|
1581 | $len = $$encrypt{Length} || 40;
|
---|
1582 | $len >= 40 or return 'Bad Encrypt Length';
|
---|
1583 | $len = int($len / 8);
|
---|
1584 | }
|
---|
1585 | for ($i=0; $i<$rep; ++$i) {
|
---|
1586 | $key = substr(Digest::MD5::md5($key), 0, $len);
|
---|
1587 | }
|
---|
1588 | # decrypt U to see if a user password is required
|
---|
1589 | if ($rev >= 3) {
|
---|
1590 | $dat = Digest::MD5::md5($pad . $id);
|
---|
1591 | RC4Crypt(\$dat, $key);
|
---|
1592 | for ($i=1; $i<=19; ++$i) {
|
---|
1593 | my @key = unpack('C*', $key);
|
---|
1594 | foreach (@key) { $_ ^= $i; }
|
---|
1595 | RC4Crypt(\$dat, pack('C*', @key));
|
---|
1596 | }
|
---|
1597 | $dat .= substr($u, 16);
|
---|
1598 | } else {
|
---|
1599 | $dat = $pad;
|
---|
1600 | RC4Crypt(\$dat, $key);
|
---|
1601 | }
|
---|
1602 | last if $dat eq $u; # all done if this was the correct key
|
---|
1603 | } else {
|
---|
1604 | return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48;
|
---|
1605 | if (length $password) {
|
---|
1606 | # Note: this should be good for passwords containing reasonable characters,
|
---|
1607 | # but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile
|
---|
1608 | # of stringprep (IETF RFC 3454) to the password before encoding in UTF-8
|
---|
1609 | $password = $et->Encode($password, 'UTF8');
|
---|
1610 | $password = substr($password, 0, 127) if length($password) > 127;
|
---|
1611 | }
|
---|
1612 | # test for the owner password
|
---|
1613 | my $sha = GetHash($password, substr($o,32,8), substr($u,0,48), $rev);
|
---|
1614 | if ($sha eq substr($o, 0, 32)) {
|
---|
1615 | $key = GetHash($password, substr($o,40,8), substr($u,0,48), $rev);
|
---|
1616 | my $dat = ("\0" x 16) . $parm{OE};
|
---|
1617 | # decrypt with no padding
|
---|
1618 | my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
|
---|
1619 | return $err if $err;
|
---|
1620 | $key = $dat; # use this as the file decryption key
|
---|
1621 | last;
|
---|
1622 | }
|
---|
1623 | # test for the user password
|
---|
1624 | $sha = GetHash($password, substr($u,32,8), '', $rev);
|
---|
1625 | if ($sha eq substr($u, 0, 32)) {
|
---|
1626 | $key = GetHash($password, substr($u,40,8), '', $rev);
|
---|
1627 | my $dat = ("\0" x 16) . $parm{UE};
|
---|
1628 | my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1);
|
---|
1629 | return $err if $err;
|
---|
1630 | $key = $dat; # use this as the file decryption key
|
---|
1631 | last;
|
---|
1632 | }
|
---|
1633 | }
|
---|
1634 | }
|
---|
1635 | $$encrypt{_key} = $key; # save the file-level encryption key
|
---|
1636 | $cryptInfo = $encrypt; # save reference to the file-level Encrypt object
|
---|
1637 | return undef; # success!
|
---|
1638 | }
|
---|
1639 |
|
---|
1640 | #------------------------------------------------------------------------------
|
---|
1641 | # Decrypt/Encrypt data
|
---|
1642 | # Inputs: 0) data ref
|
---|
1643 | # 1) PDF object reference to use as crypt key extension (may be 'none' to
|
---|
1644 | # avoid extending the encryption key, as for streams with Crypt Filter)
|
---|
1645 | # 2) encrypt flag (false for decryption)
|
---|
1646 | sub Crypt($$;$)
|
---|
1647 | {
|
---|
1648 | return unless $cryptInfo;
|
---|
1649 | my ($dataPt, $keyExt, $encrypt) = @_;
|
---|
1650 | # do not decrypt if the key extension object is undefined
|
---|
1651 | # (this doubles as a flag to disable decryption/encryption)
|
---|
1652 | return unless defined $keyExt;
|
---|
1653 | my $key = $$cryptInfo{_key};
|
---|
1654 | # apply the necessary crypt key extension
|
---|
1655 | unless ($$cryptInfo{_aesv3}) {
|
---|
1656 | unless ($keyExt eq 'none') {
|
---|
1657 | # extend crypt key using object and generation number
|
---|
1658 | unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) {
|
---|
1659 | $$cryptInfo{_error} = 'Invalid object reference for encryption';
|
---|
1660 | return;
|
---|
1661 | }
|
---|
1662 | $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2);
|
---|
1663 | }
|
---|
1664 | # add AES-128 salt if necessary (this little gem is conveniently
|
---|
1665 | # omitted from the Adobe PDF 1.6 documentation, causing me to
|
---|
1666 | # waste 12 hours trying to figure out why this wasn't working --
|
---|
1667 | # it appears in ISO32000 though, so I should have been using that)
|
---|
1668 | $key .= 'sAlT' if $$cryptInfo{_aesv2};
|
---|
1669 | my $len = length($key);
|
---|
1670 | $key = Digest::MD5::md5($key); # get 16-byte MD5 digest
|
---|
1671 | $key = substr($key, 0, $len) if $len < 16; # trim if necessary
|
---|
1672 | }
|
---|
1673 | # perform the decryption/encryption
|
---|
1674 | if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) {
|
---|
1675 | require Image::ExifTool::AES;
|
---|
1676 | my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt);
|
---|
1677 | $err and $$cryptInfo{_error} = $err;
|
---|
1678 | } else {
|
---|
1679 | RC4Crypt($dataPt, $key);
|
---|
1680 | }
|
---|
1681 | }
|
---|
1682 |
|
---|
1683 | #------------------------------------------------------------------------------
|
---|
1684 | # Decrypt/Encrypt stream data
|
---|
1685 | # Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension
|
---|
1686 | sub CryptStream($$)
|
---|
1687 | {
|
---|
1688 | return unless $cryptStream;
|
---|
1689 | my ($dict, $keyExt) = @_;
|
---|
1690 | my $type = $$dict{Type} || '';
|
---|
1691 | # XRef streams are not encrypted (ref 3, page 50),
|
---|
1692 | # and Metadata may or may not be encrypted
|
---|
1693 | if ($cryptInfo and $type ne '/XRef' and
|
---|
1694 | ($$cryptInfo{_meta} or $type ne '/Metadata'))
|
---|
1695 | {
|
---|
1696 | Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted});
|
---|
1697 | # toggle _decrypted flag
|
---|
1698 | $$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1);
|
---|
1699 | } else {
|
---|
1700 | $$dict{_decrypted} = 0; # stream should never be encrypted
|
---|
1701 | }
|
---|
1702 | }
|
---|
1703 |
|
---|
1704 | #------------------------------------------------------------------------------
|
---|
1705 | # Generate a new PDF tag (based on its ID) and add it to a tag table
|
---|
1706 | # Inputs: 0) tag table ref, 1) tag ID
|
---|
1707 | # Returns: tag info ref
|
---|
1708 | sub NewPDFTag($$)
|
---|
1709 | {
|
---|
1710 | my ($tagTablePtr, $tag) = @_;
|
---|
1711 | my $name = $tag;
|
---|
1712 | # translate URL-like escape sequences
|
---|
1713 | $name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige;
|
---|
1714 | $name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline
|
---|
1715 | $name =~ s/(^|_)([a-z])/\U$2/g; # start words with upper case
|
---|
1716 | my $tagInfo = { Name => $name };
|
---|
1717 | AddTagToTable($tagTablePtr, $tag, $tagInfo);
|
---|
1718 | return $tagInfo;
|
---|
1719 | }
|
---|
1720 |
|
---|
1721 | #------------------------------------------------------------------------------
|
---|
1722 | # Process AcroForm dictionary to set HasXMLFormsArchitecture flag
|
---|
1723 | # Inputs: Same as ProcessDict
|
---|
1724 | sub ProcessAcroForm($$$$;$$)
|
---|
1725 | {
|
---|
1726 | my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
|
---|
1727 | $et->HandleTag($tagTablePtr, '_has_xfa', $$dict{XFA} ? 'true' : 'false');
|
---|
1728 | return ProcessDict($et, $tagTablePtr, $dict, $xref, $nesting, $type);
|
---|
1729 | }
|
---|
1730 |
|
---|
1731 | #------------------------------------------------------------------------------
|
---|
1732 | # Expand array into a string
|
---|
1733 | # Inputs: 0) array ref
|
---|
1734 | # Return: string
|
---|
1735 | sub ExpandArray($)
|
---|
1736 | {
|
---|
1737 | my $val = shift;
|
---|
1738 | my @list = @$val;
|
---|
1739 | foreach (@list) {
|
---|
1740 | ref $_ eq 'SCALAR' and $_ = "ref($$_)", next;
|
---|
1741 | ref $_ eq 'ARRAY' and $_ = ExpandArray($_), next;
|
---|
1742 | defined $_ or $_ = '<undef>', next;
|
---|
1743 | }
|
---|
1744 | return '[' . join(',',@list) . ']';
|
---|
1745 | }
|
---|
1746 |
|
---|
1747 | #------------------------------------------------------------------------------
|
---|
1748 | # Process PDF dictionary extract tag values
|
---|
1749 | # Inputs: 0) ExifTool object reference, 1) tag table reference
|
---|
1750 | # 2) dictionary reference, 3) cross-reference table reference,
|
---|
1751 | # 4) nesting depth, 5) dictionary capture type
|
---|
1752 | sub ProcessDict($$$$;$$)
|
---|
1753 | {
|
---|
1754 | my ($et, $tagTablePtr, $dict, $xref, $nesting, $type) = @_;
|
---|
1755 | my $verbose = $et->Options('Verbose');
|
---|
1756 | my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN};
|
---|
1757 | my $embedded = (defined $unknown and not $unknown and $et->Options('ExtractEmbedded'));
|
---|
1758 | my @tags = @{$$dict{_tags}};
|
---|
1759 | my ($next, %join);
|
---|
1760 | my $index = 0;
|
---|
1761 |
|
---|
1762 | $nesting = ($nesting || 0) + 1;
|
---|
1763 | if ($nesting > 50) {
|
---|
1764 | $et->WarnOnce('Nesting too deep (directory ignored)');
|
---|
1765 | return;
|
---|
1766 | }
|
---|
1767 | # save entire dictionary for rewriting if specified
|
---|
1768 | if ($$et{PDF_CAPTURE} and $$tagTablePtr{VARS} and
|
---|
1769 | $tagTablePtr->{VARS}->{CAPTURE})
|
---|
1770 | {
|
---|
1771 | my $name;
|
---|
1772 | foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) {
|
---|
1773 | next if $$et{PDF_CAPTURE}{$name};
|
---|
1774 | # make sure we load the right type if indicated
|
---|
1775 | next if $type and $type ne $name;
|
---|
1776 | $$et{PDF_CAPTURE}{$name} = $dict;
|
---|
1777 | last;
|
---|
1778 | }
|
---|
1779 | }
|
---|
1780 | #
|
---|
1781 | # extract information from all tags in the dictionary
|
---|
1782 | #
|
---|
1783 | for (;;) {
|
---|
1784 | my ($tag, $isSubDoc);
|
---|
1785 | if (@tags) {
|
---|
1786 | $tag = shift @tags;
|
---|
1787 | } elsif (defined $next and not $next) {
|
---|
1788 | $tag = 'Next';
|
---|
1789 | $next = 1;
|
---|
1790 | } else {
|
---|
1791 | last;
|
---|
1792 | }
|
---|
1793 | my $val = $$dict{$tag};
|
---|
1794 | my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag);
|
---|
1795 | if ($tagInfo) {
|
---|
1796 | undef $tagInfo if $$tagInfo{NoProcess};
|
---|
1797 | } elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and
|
---|
1798 | $$tagTablePtr{$1} and (ref $val ne 'SCALAR' or not $fetched{$$val}))
|
---|
1799 | {
|
---|
1800 | my ($name, $num) = ($1, $2);
|
---|
1801 | $tagInfo = $et->GetTagInfo($tagTablePtr, $name);
|
---|
1802 | if (ref $tagInfo eq 'HASH' and $$tagInfo{JoinStreams}) {
|
---|
1803 | $fetched{$$val} = 1;
|
---|
1804 | my $obj = FetchObject($et, $$val, $xref, $tag);
|
---|
1805 | $join{$name} = [] unless $join{$name};
|
---|
1806 | next unless ref $obj eq 'HASH' and $$obj{_stream};
|
---|
1807 | # save all the stream data to join later
|
---|
1808 | DecodeStream($et, $obj);
|
---|
1809 | $join{$name}->[$num] = $$obj{_stream};
|
---|
1810 | undef $tagInfo; # don't process
|
---|
1811 | } else {
|
---|
1812 | $isSubDoc = 1; # treat as a sub-document
|
---|
1813 | }
|
---|
1814 | }
|
---|
1815 | if ($verbose) {
|
---|
1816 | my ($val2, $extra);
|
---|
1817 | if (ref $val eq 'SCALAR') {
|
---|
1818 | $extra = ", indirect object ($$val)";
|
---|
1819 | if ($fetched{$$val}) {
|
---|
1820 | $val2 = "ref($$val)";
|
---|
1821 | } elsif ($tag eq 'Next' and not $next) {
|
---|
1822 | # handle 'Next' links after all others
|
---|
1823 | $next = 0;
|
---|
1824 | next;
|
---|
1825 | } else {
|
---|
1826 | $fetched{$$val} = 1;
|
---|
1827 | $val = FetchObject($et, $$val, $xref, $tag);
|
---|
1828 | unless (defined $val) {
|
---|
1829 | my $str;
|
---|
1830 | if (defined $lastOffset) {
|
---|
1831 | $val2 = '<free>';
|
---|
1832 | $str = 'Object was freed';
|
---|
1833 | } else {
|
---|
1834 | $val2 = '<err>';
|
---|
1835 | $str = 'Error reading object';
|
---|
1836 | }
|
---|
1837 | $et->VPrint(0, "$$et{INDENT}${str}:\n");
|
---|
1838 | }
|
---|
1839 | }
|
---|
1840 | } elsif (ref $val eq 'HASH') {
|
---|
1841 | $extra = ', direct dictionary';
|
---|
1842 | } elsif (ref $val eq 'ARRAY') {
|
---|
1843 | $extra = ', direct array of ' . scalar(@$val) . ' objects';
|
---|
1844 | } else {
|
---|
1845 | $extra = ', direct object';
|
---|
1846 | }
|
---|
1847 | my $isSubdir;
|
---|
1848 | if (ref $val eq 'HASH') {
|
---|
1849 | $isSubdir = 1;
|
---|
1850 | } elsif (ref $val eq 'ARRAY') {
|
---|
1851 | # recurse into objects in arrays only if they are lists of
|
---|
1852 | # dictionaries or indirect objects which could be dictionaries
|
---|
1853 | $isSubdir = 1 if @$val;
|
---|
1854 | foreach (@$val) {
|
---|
1855 | next if ref $_ eq 'HASH' or ref $_ eq 'SCALAR';
|
---|
1856 | undef $isSubdir;
|
---|
1857 | last;
|
---|
1858 | }
|
---|
1859 | }
|
---|
1860 | if ($isSubdir) {
|
---|
1861 | # create bogus subdirectory to recurse into this dict
|
---|
1862 | $tagInfo or $tagInfo = {
|
---|
1863 | Name => $tag,
|
---|
1864 | SubDirectory => { TagTable => 'Image::ExifTool::PDF::Unknown' },
|
---|
1865 | };
|
---|
1866 | } else {
|
---|
1867 | $val2 = ExpandArray($val) if ref $val eq 'ARRAY';
|
---|
1868 | # generate tag info if we will use it later
|
---|
1869 | if (not $tagInfo and defined $val and $unknown) {
|
---|
1870 | $tagInfo = NewPDFTag($tagTablePtr, $tag);
|
---|
1871 | }
|
---|
1872 | }
|
---|
1873 | $et->VerboseInfo($tag, $tagInfo,
|
---|
1874 | Value => $val2 || $val,
|
---|
1875 | Extra => $extra,
|
---|
1876 | Index => $index++,
|
---|
1877 | );
|
---|
1878 | next unless defined $val;
|
---|
1879 | }
|
---|
1880 | unless ($tagInfo) {
|
---|
1881 | # add any tag found in Info dictionary to table
|
---|
1882 | next unless $unknown;
|
---|
1883 | $tagInfo = NewPDFTag($tagTablePtr, $tag);
|
---|
1884 | }
|
---|
1885 | # increment document number if necessary
|
---|
1886 | my ($oldDocNum, $oldNumTags);
|
---|
1887 | if ($isSubDoc) {
|
---|
1888 | $oldDocNum = $$et{DOC_NUM};
|
---|
1889 | $oldNumTags = $$et{NUM_FOUND};
|
---|
1890 | $$et{DOC_NUM} = ++$$et{DOC_COUNT};
|
---|
1891 | }
|
---|
1892 | if ($$tagInfo{SubDirectory}) {
|
---|
1893 | # process the subdirectory
|
---|
1894 | my @subDicts;
|
---|
1895 | if (ref $val eq 'ARRAY') {
|
---|
1896 | # hack to convert array to dictionary if necessary
|
---|
1897 | if ($$tagInfo{ConvertToDict} and @$val == 2 and not ref $$val[0]) {
|
---|
1898 | my $tg = $$val[0];
|
---|
1899 | $tg =~ s(^/)(); # remove name
|
---|
1900 | my %dict = ( _tags => [ $tg ], $tg => $$val[1] );
|
---|
1901 | @subDicts = ( \%dict );
|
---|
1902 | } else {
|
---|
1903 | @subDicts = @{$val};
|
---|
1904 | }
|
---|
1905 | } else {
|
---|
1906 | @subDicts = ( $val );
|
---|
1907 | }
|
---|
1908 | # loop through all values of this tag
|
---|
1909 | for (;;) {
|
---|
1910 | my $subDict = shift @subDicts or last;
|
---|
1911 | # save last fetched object in case we fetch another one here
|
---|
1912 | my $prevFetched = $lastFetched;
|
---|
1913 | if (ref $subDict eq 'SCALAR') {
|
---|
1914 | # only fetch once (other copies are obsolete)
|
---|
1915 | next if $fetched{$$subDict};
|
---|
1916 | if ($$tagInfo{IgnoreDuplicates}) {
|
---|
1917 | my $flag = "ProcessedPDF_$tag";
|
---|
1918 | if ($$et{$flag}) {
|
---|
1919 | next if $et->WarnOnce("Ignored duplicate $tag dictionary", 2);
|
---|
1920 | } else {
|
---|
1921 | $$et{$flag} = 1;
|
---|
1922 | }
|
---|
1923 | }
|
---|
1924 | # load dictionary via an indirect reference
|
---|
1925 | $fetched{$$subDict} = 1;
|
---|
1926 | my $obj = FetchObject($et, $$subDict, $xref, $tag);
|
---|
1927 | unless (defined $obj) {
|
---|
1928 | unless (defined $lastOffset) {
|
---|
1929 | $et->Warn("Error reading $tag object ($$subDict)");
|
---|
1930 | }
|
---|
1931 | next;
|
---|
1932 | }
|
---|
1933 | $subDict = $obj;
|
---|
1934 | }
|
---|
1935 | if (ref $subDict eq 'ARRAY') {
|
---|
1936 | # convert array of key/value pairs to a hash
|
---|
1937 | next if @$subDict < 2;
|
---|
1938 | my %hash = ( _tags => [] );
|
---|
1939 | while (@$subDict >= 2) {
|
---|
1940 | my $key = shift @$subDict;
|
---|
1941 | $key =~ s/^\///;
|
---|
1942 | push @{$hash{_tags}}, $key;
|
---|
1943 | $hash{$key} = shift @$subDict;
|
---|
1944 | }
|
---|
1945 | $subDict = \%hash;
|
---|
1946 | } else {
|
---|
1947 | next unless ref $subDict eq 'HASH';
|
---|
1948 | }
|
---|
1949 | # set flag to re-crypt all strings when rewriting if the dictionary
|
---|
1950 | # came from an encrypted stream
|
---|
1951 | $$subDict{_needCrypt}{'*'} = 1 unless $lastFetched;
|
---|
1952 | my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
1953 | if (not $verbose) {
|
---|
1954 | my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict;
|
---|
1955 | &$proc($et, $subTablePtr, $subDict, $xref, $nesting);
|
---|
1956 | } elsif ($next) {
|
---|
1957 | # handle 'Next' links at this level to avoid deep recursion
|
---|
1958 | undef $next;
|
---|
1959 | $index = 0;
|
---|
1960 | $tagTablePtr = $subTablePtr;
|
---|
1961 | $dict = $subDict;
|
---|
1962 | @tags = @{$$subDict{_tags}};
|
---|
1963 | $et->VerboseDir($tag, scalar(@tags));
|
---|
1964 | } else {
|
---|
1965 | my $oldIndent = $$et{INDENT};
|
---|
1966 | my $oldDir = $$et{DIR_NAME};
|
---|
1967 | $$et{INDENT} .= '| ';
|
---|
1968 | $$et{DIR_NAME} = $tag;
|
---|
1969 | $et->VerboseDir($tag, scalar(@{$$subDict{_tags}}));
|
---|
1970 | ProcessDict($et, $subTablePtr, $subDict, $xref, $nesting);
|
---|
1971 | $$et{INDENT} = $oldIndent;
|
---|
1972 | $$et{DIR_NAME} = $oldDir;
|
---|
1973 | }
|
---|
1974 | $lastFetched = $prevFetched;
|
---|
1975 | }
|
---|
1976 | } else {
|
---|
1977 | # fetch object if necessary
|
---|
1978 | # (OS X 10.6 writes indirect objects in the Info dictionary!)
|
---|
1979 | if (ref $val eq 'SCALAR') {
|
---|
1980 | my $prevFetched = $lastFetched;
|
---|
1981 | # (note: fetching the same object multiple times is OK here)
|
---|
1982 | $val = FetchObject($et, $$val, $xref, $tag);
|
---|
1983 | if (defined $val) {
|
---|
1984 | $val = ReadPDFValue($val);
|
---|
1985 | # set flag to re-encrypt if necessary if rewritten
|
---|
1986 | $$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString;
|
---|
1987 | $lastFetched = $prevFetched; # restore last fetched object reference
|
---|
1988 | }
|
---|
1989 | } else {
|
---|
1990 | $val = ReadPDFValue($val);
|
---|
1991 | }
|
---|
1992 | # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary
|
---|
1993 | # unless this is binary data (hex-encoded strings would not have been converted)
|
---|
1994 | if (ref $val) {
|
---|
1995 | if (ref $val eq 'ARRAY') {
|
---|
1996 | my $v;
|
---|
1997 | foreach $v (@$val) {
|
---|
1998 | $et->FoundTag($tagInfo, $v);
|
---|
1999 | }
|
---|
2000 | }
|
---|
2001 | } elsif (defined $val) {
|
---|
2002 | my $format = $$tagInfo{Format} || $$tagInfo{Writable} || 'string';
|
---|
2003 | $val = ConvertPDFDate($val) if $format eq 'date';
|
---|
2004 | if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) {
|
---|
2005 | # text string is already in Unicode if it starts with "\xfe\xff",
|
---|
2006 | # otherwise we must first convert from PDFDocEncoding
|
---|
2007 | $val = $et->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM');
|
---|
2008 | }
|
---|
2009 | if ($$tagInfo{List} and not $$et{OPTIONS}{NoPDFList}) {
|
---|
2010 | # separate tokens in comma or whitespace delimited lists
|
---|
2011 | my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val;
|
---|
2012 | foreach $val (@values) {
|
---|
2013 | $et->FoundTag($tagInfo, $val);
|
---|
2014 | }
|
---|
2015 | } else {
|
---|
2016 | # a simple tag value
|
---|
2017 | $et->FoundTag($tagInfo, $val);
|
---|
2018 | }
|
---|
2019 | }
|
---|
2020 | }
|
---|
2021 | if ($isSubDoc) {
|
---|
2022 | # restore original document number
|
---|
2023 | $$et{DOC_NUM} = $oldDocNum;
|
---|
2024 | --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
|
---|
2025 | }
|
---|
2026 | }
|
---|
2027 | #
|
---|
2028 | # extract information from joined streams if necessary
|
---|
2029 | #
|
---|
2030 |
|
---|
2031 | if (%join) {
|
---|
2032 | my ($tag, $i);
|
---|
2033 | foreach $tag (sort keys %join) {
|
---|
2034 | my $list = $join{$tag};
|
---|
2035 | last unless defined $$list[1] and $$list[1] =~ /^%.*?([\x0d\x0a]*)/;
|
---|
2036 | my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break
|
---|
2037 | for ($i=1; defined $$list[$i]; ++$i) {
|
---|
2038 | $buff .= $$list[$i];
|
---|
2039 | undef $$list[$i]; # free memory
|
---|
2040 | }
|
---|
2041 | # increment document number for tags extracted from embedded EPS
|
---|
2042 | my $oldDocNum = $$et{DOC_NUM};
|
---|
2043 | my $oldNumTags = $$et{NUM_FOUND};
|
---|
2044 | $$et{DOC_NUM} = ++$$et{DOC_COUNT};
|
---|
2045 | # extract PostScript information
|
---|
2046 | $et->HandleTag($tagTablePtr, $tag, $buff);
|
---|
2047 | $$et{DOC_NUM} = $oldDocNum;
|
---|
2048 | # revert document counter if we didn't add any new tags
|
---|
2049 | --$$et{DOC_COUNT} if $oldNumTags == $$et{NUM_FOUND};
|
---|
2050 | delete $$et{DOC_NUM};
|
---|
2051 | }
|
---|
2052 | }
|
---|
2053 | #
|
---|
2054 | # extract information from stream object if it exists (eg. Metadata stream)
|
---|
2055 | #
|
---|
2056 | for (;;) { # (cheap goto)
|
---|
2057 | last unless $$dict{_stream};
|
---|
2058 | my $tag = '_stream';
|
---|
2059 | # add Subtype (if it exists) to stream name and remove leading '/'
|
---|
2060 | ($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype};
|
---|
2061 | last unless $$tagTablePtr{$tag};
|
---|
2062 | my $tagInfo = $et->GetTagInfo($tagTablePtr, $tag) or last;
|
---|
2063 | unless ($$tagInfo{SubDirectory}) {
|
---|
2064 | # don't build filter lists across different images
|
---|
2065 | delete $$et{LIST_TAGS}{$$tagTablePtr{Filter}};
|
---|
2066 | # we arrive here only when extracting embedded images
|
---|
2067 | # - only extract known image types and ignore others
|
---|
2068 | my $filter = $$dict{Filter} || '';
|
---|
2069 | $filter = @$filter[-1] if ref $filter eq 'ARRAY'; # (get last Filter type)
|
---|
2070 | my $result;
|
---|
2071 | if ($filter eq '/DCTDecode' or $filter eq '/JPXDecode') {
|
---|
2072 | DecodeStream($et, $dict) or last;
|
---|
2073 | # save the image itself
|
---|
2074 | $et->FoundTag($tagInfo, \$$dict{_stream});
|
---|
2075 | # extract information from embedded image
|
---|
2076 | $result = $et->ExtractInfo(\$$dict{_stream}, { ReEntry => 1 });
|
---|
2077 | }
|
---|
2078 | unless ($result) {
|
---|
2079 | $et->FoundTag('FileType', defined $result ? '(unknown)' : '(unsupported)');
|
---|
2080 | }
|
---|
2081 | last;
|
---|
2082 | }
|
---|
2083 | # decode stream if necessary
|
---|
2084 | DecodeStream($et, $dict) or last;
|
---|
2085 | if ($verbose > 2) {
|
---|
2086 | $et->VPrint(2,"$$et{INDENT}$$et{DIR_NAME} stream data\n");
|
---|
2087 | $et->VerboseDump(\$$dict{_stream});
|
---|
2088 | }
|
---|
2089 | # extract information from stream
|
---|
2090 | my %dirInfo = (
|
---|
2091 | DataPt => \$$dict{_stream},
|
---|
2092 | DataLen => length $$dict{_stream},
|
---|
2093 | DirStart => 0,
|
---|
2094 | DirLen => length $$dict{_stream},
|
---|
2095 | Parent => 'PDF',
|
---|
2096 | );
|
---|
2097 | my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable});
|
---|
2098 | unless ($et->ProcessDirectory(\%dirInfo, $subTablePtr)) {
|
---|
2099 | $et->Warn("Error processing $$tagInfo{Name} information");
|
---|
2100 | }
|
---|
2101 | last;
|
---|
2102 | }
|
---|
2103 | }
|
---|
2104 |
|
---|
2105 | #------------------------------------------------------------------------------
|
---|
2106 | # Extract information from PDF file
|
---|
2107 | # Inputs: 0) ExifTool object reference, 1) dirInfo reference
|
---|
2108 | # Returns: 0 if not a PDF file, 1 on success, otherwise a negative error number
|
---|
2109 | sub ReadPDF($$)
|
---|
2110 | {
|
---|
2111 | my ($et, $dirInfo) = @_;
|
---|
2112 | my $raf = $$dirInfo{RAF};
|
---|
2113 | my $verbose = $et->Options('Verbose');
|
---|
2114 | my ($buff, $encrypt, $id);
|
---|
2115 | #
|
---|
2116 | # validate PDF file
|
---|
2117 | #
|
---|
2118 | # (linearization dictionary must be in the first 1024 bytes of the file)
|
---|
2119 | $raf->Read($buff, 1024) >= 8 or return 0;
|
---|
2120 | $buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
|
---|
2121 | $$et{PDFBase} = length $1 and $et->Warn('PDF header is not at start of file',1);
|
---|
2122 | $pdfVer = $2;
|
---|
2123 | $et->SetFileType(); # set the FileType tag
|
---|
2124 | $et->Warn("The PDF $pdfVer specification is held hostage by the ISO") if $pdfVer >= 2.0;
|
---|
2125 | # store PDFVersion tag
|
---|
2126 | my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root');
|
---|
2127 | $et->HandleTag($tagTablePtr, 'Version', $pdfVer);
|
---|
2128 | $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main');
|
---|
2129 | #
|
---|
2130 | # check for a linearized PDF (only if reading)
|
---|
2131 | #
|
---|
2132 | my $capture = $$et{PDF_CAPTURE};
|
---|
2133 | unless ($capture) {
|
---|
2134 | my $lin = 'false';
|
---|
2135 | if ($buff =~ /<</g) {
|
---|
2136 | $buff = substr($buff, pos($buff) - 2);
|
---|
2137 | my $dict = ExtractObject($et, \$buff);
|
---|
2138 | if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) {
|
---|
2139 | if (not $$et{VALUE}{FileSize}) {
|
---|
2140 | undef $lin; # can't determine if it is linearized
|
---|
2141 | } elsif ($$dict{L} == $$et{VALUE}{FileSize} - $$et{PDFBase}) {
|
---|
2142 | $lin = 'true';
|
---|
2143 | }
|
---|
2144 | }
|
---|
2145 | }
|
---|
2146 | $et->HandleTag($tagTablePtr, '_linearized', $lin) if $lin;
|
---|
2147 | }
|
---|
2148 | #
|
---|
2149 | # read the xref tables referenced from startxref at the end of the file
|
---|
2150 | #
|
---|
2151 | my @xrefOffsets;
|
---|
2152 | $raf->Seek(0, 2) or return -2;
|
---|
2153 | # the %%EOF must occur within the last 1024 bytes of the file (PDF spec, appendix H)
|
---|
2154 | my $len = $raf->Tell();
|
---|
2155 | $len = 1024 if $len > 1024;
|
---|
2156 | $raf->Seek(-$len, 2) or return -2;
|
---|
2157 | $raf->Read($buff, $len) == $len or return -3;
|
---|
2158 | # find the LAST xref table in the file (may be multiple %%EOF marks,
|
---|
2159 | # and comments between "startxref" and "%%EOF")
|
---|
2160 | $buff =~ /^.*startxref(\s+)(\d+)(\s+)(%[^\x0d\x0a]*\s+)*%%EOF/s or return -4;
|
---|
2161 | my $ws = $1 . $3;
|
---|
2162 | my $xr = $2;
|
---|
2163 | push @xrefOffsets, $xr, 'Main';
|
---|
2164 | # set input record separator
|
---|
2165 | local $/ = $ws =~ /(\x0d\x0a|\x0d|\x0a)/ ? $1 : "\x0a";
|
---|
2166 | my (%xref, @mainDicts, %loaded, $mainFree);
|
---|
2167 | my ($xrefSize, $mainDictSize) = (0, 0);
|
---|
2168 | # initialize variables to capture when rewriting
|
---|
2169 | if ($capture) {
|
---|
2170 | $capture->{startxref} = $xr;
|
---|
2171 | $capture->{xref} = \%xref;
|
---|
2172 | $capture->{newline} = $/;
|
---|
2173 | $capture->{mainFree} = $mainFree = { };
|
---|
2174 | }
|
---|
2175 | XRef:
|
---|
2176 | while (@xrefOffsets) {
|
---|
2177 | my $offset = shift @xrefOffsets;
|
---|
2178 | my $type = shift @xrefOffsets;
|
---|
2179 | next if $loaded{$offset}; # avoid infinite recursion
|
---|
2180 | unless ($raf->Seek($offset+$$et{PDFBase}, 0)) {
|
---|
2181 | %loaded or return -5;
|
---|
2182 | $et->Warn('Bad offset for secondary xref table');
|
---|
2183 | next;
|
---|
2184 | }
|
---|
2185 | # Note: care must be taken because ReadLine may read more than we want if
|
---|
2186 | # the newline sequence for this table is different than the rest of the file
|
---|
2187 | for (;;) {
|
---|
2188 | unless ($raf->ReadLine($buff)) {
|
---|
2189 | %loaded or return -6;
|
---|
2190 | $et->Warn('Bad offset for secondary xref table');
|
---|
2191 | next XRef;
|
---|
2192 | }
|
---|
2193 | last if $buff =~/\S/; # skip blank lines
|
---|
2194 | }
|
---|
2195 | my $loadXRefStream;
|
---|
2196 | if ($buff =~ s/^\s*xref\s+//s) {
|
---|
2197 | # load xref table
|
---|
2198 | for (;;) {
|
---|
2199 | # read another line if necessary (skipping blank lines)
|
---|
2200 | $raf->ReadLine($buff) or return -6 until $buff =~ /\S/;
|
---|
2201 | last if $buff =~ s/^\s*trailer([\s<[(])/$1/s;
|
---|
2202 | $buff =~ s/^\s*(\d+)\s+(\d+)\s+//s or return -4;
|
---|
2203 | my ($start, $num) = ($1, $2);
|
---|
2204 | $raf->Seek(-length($buff), 1) or return -4;
|
---|
2205 | my $i;
|
---|
2206 | for ($i=0; $i<$num; ++$i) {
|
---|
2207 | $raf->Read($buff, 20) == 20 or return -6;
|
---|
2208 | $buff =~ /^\s*(\d{10}) (\d{5}) (f|n)/s or return -4;
|
---|
2209 | my $num = $start + $i;
|
---|
2210 | $xrefSize = $num if $num > $xrefSize;
|
---|
2211 | # locate object to generate entry from stream if necessary
|
---|
2212 | # (must do this before we test $xref{$num})
|
---|
2213 | LocateAnyObject(\%xref, $num) if $xref{dicts};
|
---|
2214 | # save offset for newest copy of all objects
|
---|
2215 | # (or next object number for free objects)
|
---|
2216 | unless (defined $xref{$num}) {
|
---|
2217 | my ($offset, $gen) = (int($1), int($2));
|
---|
2218 | $xref{$num} = $offset;
|
---|
2219 | if ($3 eq 'f') {
|
---|
2220 | # save free objects in last xref table for rewriting
|
---|
2221 | $$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree;
|
---|
2222 | next;
|
---|
2223 | }
|
---|
2224 | # also save offset keyed by object reference string
|
---|
2225 | $xref{"$num $gen R"} = $offset;
|
---|
2226 | }
|
---|
2227 | }
|
---|
2228 | # (I have a sample from Adobe which has an empty xref table)
|
---|
2229 | # %xref or return -4; # xref table may not be empty
|
---|
2230 | $buff = '';
|
---|
2231 | }
|
---|
2232 | undef $mainFree; # only do this for the last xref table
|
---|
2233 | } elsif ($buff =~ s/^\s*(\d+)\s+(\d+)\s+obj//s) {
|
---|
2234 | # this is a PDF-1.5 cross-reference stream dictionary
|
---|
2235 | $loadXRefStream = 1;
|
---|
2236 | } else {
|
---|
2237 | %loaded or return -4;
|
---|
2238 | $et->Warn('Invalid secondary xref table');
|
---|
2239 | next;
|
---|
2240 | }
|
---|
2241 | my $mainDict = ExtractObject($et, \$buff, $raf, \%xref);
|
---|
2242 | unless (ref $mainDict eq 'HASH') {
|
---|
2243 | %loaded or return -8;
|
---|
2244 | $et->Warn('Error loading secondary dictionary');
|
---|
2245 | next;
|
---|
2246 | }
|
---|
2247 | # keep track of total trailer dictionary Size
|
---|
2248 | $mainDictSize = $$mainDict{Size} if $$mainDict{Size} and $$mainDict{Size} > $mainDictSize;
|
---|
2249 | if ($loadXRefStream) {
|
---|
2250 | # decode and save our XRef stream from PDF-1.5 file
|
---|
2251 | # (but parse it later as required to save time)
|
---|
2252 | # Note: this technique can potentially result in an old object
|
---|
2253 | # being used if the file was incrementally updated and an older
|
---|
2254 | # object from an xref table was replaced by a newer object in an
|
---|
2255 | # xref stream. But doing so isn't a good idea (if allowed at all)
|
---|
2256 | # because a PDF 1.4 consumer would also make this same mistake.
|
---|
2257 | if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and
|
---|
2258 | @{$$mainDict{W}} > 2 and $$mainDict{Size} and
|
---|
2259 | DecodeStream($et, $mainDict))
|
---|
2260 | {
|
---|
2261 | # create Index entry if it doesn't exist
|
---|
2262 | $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ];
|
---|
2263 | # create '_entry_size' entry for internal use
|
---|
2264 | my $w = $$mainDict{W};
|
---|
2265 | my $size = 0;
|
---|
2266 | foreach (@$w) { $size += $_; }
|
---|
2267 | $$mainDict{_entry_size} = $size;
|
---|
2268 | # save this stream dictionary to use later if required
|
---|
2269 | $xref{dicts} = [] unless $xref{dicts};
|
---|
2270 | push @{$xref{dicts}}, $mainDict;
|
---|
2271 | } else {
|
---|
2272 | %loaded or return -9;
|
---|
2273 | $et->Warn('Invalid xref stream in secondary dictionary');
|
---|
2274 | }
|
---|
2275 | }
|
---|
2276 | $loaded{$offset} = 1;
|
---|
2277 | # load XRef stream in hybrid file if it exists
|
---|
2278 | push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm};
|
---|
2279 | $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt};
|
---|
2280 | if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') {
|
---|
2281 | $id = ReadPDFValue($mainDict->{ID}->[0]);
|
---|
2282 | }
|
---|
2283 | push @mainDicts, $mainDict, $type;
|
---|
2284 | # load previous xref table if it exists
|
---|
2285 | push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev};
|
---|
2286 | }
|
---|
2287 | if ($xrefSize > $mainDictSize) {
|
---|
2288 | my $str = "Objects in xref table ($xrefSize) exceed trailer dictionary Size ($mainDictSize)";
|
---|
2289 | $capture ? $et->Error($str) : $et->Warn($str);
|
---|
2290 | }
|
---|
2291 | #
|
---|
2292 | # extract encryption information if necessary
|
---|
2293 | #
|
---|
2294 | if ($encrypt) {
|
---|
2295 | if (ref $encrypt eq 'SCALAR') {
|
---|
2296 | $encrypt = FetchObject($et, $$encrypt, \%xref, 'Encrypt');
|
---|
2297 | }
|
---|
2298 | # generate Encryption tag information
|
---|
2299 | my $err = DecryptInit($et, $encrypt, $id);
|
---|
2300 | if ($err) {
|
---|
2301 | $et->Warn($err);
|
---|
2302 | $$capture{Error} = $err if $capture;
|
---|
2303 | return -1;
|
---|
2304 | }
|
---|
2305 | }
|
---|
2306 | #
|
---|
2307 | # extract the information beginning with each of the main dictionaries
|
---|
2308 | #
|
---|
2309 | my $i = 0;
|
---|
2310 | my $num = (scalar @mainDicts) / 2;
|
---|
2311 | while (@mainDicts) {
|
---|
2312 | my $dict = shift @mainDicts;
|
---|
2313 | my $type = shift @mainDicts;
|
---|
2314 | if ($verbose) {
|
---|
2315 | ++$i;
|
---|
2316 | my $n = scalar(@{$$dict{_tags}});
|
---|
2317 | $et->VPrint(0, "PDF dictionary ($i of $num) with $n entries:\n");
|
---|
2318 | }
|
---|
2319 | ProcessDict($et, $tagTablePtr, $dict, \%xref, 0, $type);
|
---|
2320 | }
|
---|
2321 | # handle any decryption errors
|
---|
2322 | if ($encrypt) {
|
---|
2323 | my $err = $$encrypt{_error};
|
---|
2324 | if ($err) {
|
---|
2325 | $et->Warn($err);
|
---|
2326 | $$capture{Error} = $err if $capture;
|
---|
2327 | return -1;
|
---|
2328 | }
|
---|
2329 | }
|
---|
2330 | return 1;
|
---|
2331 | }
|
---|
2332 |
|
---|
2333 | #------------------------------------------------------------------------------
|
---|
2334 | # ReadPDF() warning strings for each error return value
|
---|
2335 | my %pdfWarning = (
|
---|
2336 | # -1 is reserved as error return value with no associated warning
|
---|
2337 | -2 => 'Error seeking in file',
|
---|
2338 | -3 => 'Error reading file',
|
---|
2339 | -4 => 'Invalid xref table',
|
---|
2340 | -5 => 'Invalid xref offset',
|
---|
2341 | -6 => 'Error reading xref table',
|
---|
2342 | -7 => 'Error reading trailer',
|
---|
2343 | -8 => 'Error reading main dictionary',
|
---|
2344 | -9 => 'Invalid xref stream in main dictionary',
|
---|
2345 | );
|
---|
2346 |
|
---|
2347 | #------------------------------------------------------------------------------
|
---|
2348 | # Extract information from PDF file
|
---|
2349 | # Inputs: 0) ExifTool object reference, 1) dirInfo reference
|
---|
2350 | # Returns: 1 if this was a valid PDF file
|
---|
2351 | sub ProcessPDF($$)
|
---|
2352 | {
|
---|
2353 | my ($et, $dirInfo) = @_;
|
---|
2354 |
|
---|
2355 | undef $cryptInfo; # (must not delete after returning so writer can use it)
|
---|
2356 | undef $cryptStream;
|
---|
2357 | undef $cryptString;
|
---|
2358 | my $result = ReadPDF($et, $dirInfo);
|
---|
2359 | if ($result < 0) {
|
---|
2360 | $et->Warn($pdfWarning{$result}) if $pdfWarning{$result};
|
---|
2361 | $result = 1;
|
---|
2362 | }
|
---|
2363 | # clean up and return
|
---|
2364 | undef %streamObjs;
|
---|
2365 | undef %fetched;
|
---|
2366 | return $result;
|
---|
2367 | }
|
---|
2368 |
|
---|
2369 | 1; # end
|
---|
2370 |
|
---|
2371 |
|
---|
2372 | __END__
|
---|
2373 |
|
---|
2374 | =head1 NAME
|
---|
2375 |
|
---|
2376 | Image::ExifTool::PDF - Read PDF meta information
|
---|
2377 |
|
---|
2378 | =head1 SYNOPSIS
|
---|
2379 |
|
---|
2380 | This module is loaded automatically by Image::ExifTool when required.
|
---|
2381 |
|
---|
2382 | =head1 DESCRIPTION
|
---|
2383 |
|
---|
2384 | This code reads meta information from PDF (Adobe Portable Document Format)
|
---|
2385 | files. It supports object streams introduced in PDF-1.5 but only with a
|
---|
2386 | limited set of Filter and Predictor algorithms, however all standard
|
---|
2387 | encryption methods through PDF-1.7 extension level 3 are supported,
|
---|
2388 | including AESV2 (AES-128) and AESV3 (AES-256).
|
---|
2389 |
|
---|
2390 | =head1 AUTHOR
|
---|
2391 |
|
---|
2392 | Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
|
---|
2393 |
|
---|
2394 | This library is free software; you can redistribute it and/or modify it
|
---|
2395 | under the same terms as Perl itself.
|
---|
2396 |
|
---|
2397 | =head1 REFERENCES
|
---|
2398 |
|
---|
2399 | =over 4
|
---|
2400 |
|
---|
2401 | =item L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
|
---|
2402 |
|
---|
2403 | =item L<Crypt::RC4|Crypt::RC4>
|
---|
2404 |
|
---|
2405 | =item L<http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf>
|
---|
2406 |
|
---|
2407 | =item L<http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf>
|
---|
2408 |
|
---|
2409 | =item L<http://tools.ietf.org/search/rfc3454>
|
---|
2410 |
|
---|
2411 | =item L<http://www.armware.dk/RFC/rfc/rfc4013.html>
|
---|
2412 |
|
---|
2413 | =back
|
---|
2414 |
|
---|
2415 | =head1 SEE ALSO
|
---|
2416 |
|
---|
2417 | L<Image::ExifTool::TagNames/PDF Tags>,
|
---|
2418 | L<Image::ExifTool(3pm)|Image::ExifTool>
|
---|
2419 |
|
---|
2420 | =cut
|
---|