- Timestamp:
- 2011-06-01T12:33:42+12:00 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/cpan/Image/ExifTool/PDF.pm
r16842 r24107 4 4 # Description: Read PDF meta information 5 5 # 6 # Revisions: 07/11/05 - P. Harvey Created 7 # 07/25/05 - P. Harvey Add support for encrypted documents 8 # 9 # References: 1) http://partners.adobe.com/public/developer/pdf/index_reference.html 10 # 2) http://www.cr0.net:8040/code/crypto/rc4/ 6 # Revisions: 07/11/2005 - P. Harvey Created 7 # 07/25/2005 - P. Harvey Add support for encrypted documents 8 # 9 # References: 1) http://www.adobe.com/devnet/pdf/pdf_reference.html 10 # 2) http://search.cpan.org/dist/Crypt-RC4/ 11 # 3) http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf 12 # 4) http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf 13 # 5) http://tools.ietf.org/search/rfc3454 14 # 6) http://www.armware.dk/RFC/rfc/rfc4013.html 11 15 #------------------------------------------------------------------------------ 12 16 … … 14 18 15 19 use strict; 16 use vars qw($VERSION );20 use vars qw($VERSION $AUTOLOAD $lastFetched); 17 21 use Image::ExifTool qw(:DataAccess :Utils); 18 22 require Exporter; 19 23 20 $VERSION = '1.10'; 21 22 sub LocateObject($$); 24 $VERSION = '1.29'; 25 23 26 sub FetchObject($$$$); 24 27 sub ExtractObject($$;$$); 25 28 sub ReadToNested($;$); 26 sub ProcessDict($$$$;$); 27 28 my %warnedOnce; # hash of warnings we issued 29 sub ProcessDict($$$$;$$); 30 sub ReadPDFValue($); 31 sub CheckPDF($$$); 32 33 # $lastFetched - last fetched object reference (used for decryption) 34 # (undefined if fetched object was already decrypted, ie. object from stream) 35 36 my $cryptInfo; # encryption object reference (plus additional information) 37 my $cryptString; # flag that strings are encrypted 38 my $cryptStream; # flag that streams are encrypted 39 my $lastOffset; # last fetched object offset 29 40 my %streamObjs; # hash of stream objects 30 41 my %fetched; # dicts fetched in verbose mode (to avoid cyclical recursion) 31 my $lastFetched; # last fetched object reference (used for decryption) 32 my $cryptInfo; # encryption object reference (plus additional information) 42 my $pdfVer; # version of PDF file being processed 33 43 34 44 # tags in main PDF directories 35 45 %Image::ExifTool::PDF::Main = ( 46 GROUPS => { 2 => 'Document' }, 47 VARS => { CAPTURE => ['Main','Prev'] }, 36 48 Info => { 37 SubDirectory => { 38 TagTable => 'Image::ExifTool::PDF::Info', 39 }, 49 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Info' }, 40 50 }, 41 51 Root => { 42 SubDirectory => { 43 TagTable => 'Image::ExifTool::PDF::Root', 44 }, 52 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Root' }, 53 }, 54 Encrypt => { 55 NoProcess => 1, # don't process normally (processed in advance) 56 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Encrypt' }, 57 }, 58 _linearized => { 59 Name => 'Linearized', 60 Notes => 'flag set if document is linearized for fast web display; not a real Tag ID', 61 PrintConv => { 'true' => 'Yes', 'false' => 'No' }, 45 62 }, 46 63 ); … … 48 65 # tags in PDF Info directory 49 66 %Image::ExifTool::PDF::Info = ( 50 GROUPS => { 2 => 'Image' }, 67 GROUPS => { 2 => 'Document' }, 68 VARS => { CAPTURE => ['Info'] }, 51 69 EXTRACT_UNKNOWN => 1, # extract all unknown tags in this directory 70 WRITE_PROC => \&Image::ExifTool::DummyWriteProc, 71 CHECK_PROC => \&CheckPDF, 72 WRITABLE => 'string', 52 73 NOTES => q{ 53 74 As well as the tags listed below, the PDF specification allows for 54 75 user-defined tags to exist in the Info dictionary. These tags, which should 55 have corresponding XMP-pdfx entries in the PDF Metadata, are also extracted 56 by ExifTool. 76 have corresponding XMP-pdfx entries in the XMP of the PDF XML Metadata 77 object, are also extracted by ExifTool. 78 79 B<Writable> specifies the value format, and may be C<string>, C<date>, 80 C<integer>, C<real>, C<boolean> or C<name> for PDF tags. 57 81 }, 58 82 Title => { }, 59 83 Author => { Groups => { 2 => 'Author' } }, 60 84 Subject => { }, 61 Keywords => { List => 1 }, # this is a list of tokens85 Keywords => { List => 'string' }, # this is a string list 62 86 Creator => { }, 63 87 Producer => { }, 64 88 CreationDate => { 65 89 Name => 'CreateDate', 90 Writable => 'date', 66 91 Groups => { 2 => 'Time' }, 67 ValueConv => 'Image::ExifTool::PDF::ConvertPDFDate($self, $val)', 92 Shift => 'Time', 93 PrintConv => '$self->ConvertDateTime($val)', 94 PrintConvInv => '$self->InverseDateTime($val)', 68 95 }, 69 96 ModDate => { 70 97 Name => 'ModifyDate', 98 Writable => 'date', 71 99 Groups => { 2 => 'Time' }, 72 ValueConv => 'Image::ExifTool::PDF::ConvertPDFDate($self, $val)', 100 Shift => 'Time', 101 PrintConv => '$self->ConvertDateTime($val)', 102 PrintConvInv => '$self->InverseDateTime($val)', 73 103 }, 74 104 Trapped => { 105 Protected => 1, 75 106 # remove leading '/' from '/True' or '/False' 76 107 ValueConv => '$val=~s{^/}{}; $val', 108 ValueConvInv => '"/$val"', 109 }, 110 'AAPL:Keywords' => { #PH 111 Name => 'AppleKeywords', 112 List => 'array', # this is an array of values 113 Notes => q{ 114 keywords written by Apple utilities, although they seem to use PDF:Keywords 115 when reading 116 }, 77 117 }, 78 118 ); … … 80 120 # tags in the PDF Root document catalog 81 121 %Image::ExifTool::PDF::Root = ( 122 GROUPS => { 2 => 'Document' }, 123 # note: can't capture previous versions of Root since they are not parsed 124 VARS => { CAPTURE => ['Root'] }, 82 125 NOTES => 'This is the PDF document catalog.', 126 MarkInfo => { 127 SubDirectory => { TagTable => 'Image::ExifTool::PDF::MarkInfo' }, 128 }, 83 129 Metadata => { 84 SubDirectory => { 85 TagTable => 'Image::ExifTool::PDF::Metadata', 130 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' }, 131 }, 132 Pages => { 133 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Pages' }, 134 }, 135 Perms => { 136 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Perms' }, 137 }, 138 Lang => 'Language', 139 PageLayout => { }, 140 PageMode => { }, 141 Version => 'PDFVersion', 142 ); 143 144 # tags extracted from the PDF Encrypt dictionary 145 %Image::ExifTool::PDF::Encrypt = ( 146 GROUPS => { 2 => 'Document' }, 147 NOTES => 'Tags extracted from the document Encrypt dictionary.', 148 Filter => { 149 Name => 'Encryption', 150 Notes => q{ 151 extracted value is actually a combination of the Filter, SubFilter, V, R and 152 Length information from the Encrypt dictionary 86 153 }, 87 154 }, 88 Pages => { 89 SubDirectory => { 90 TagTable => 'Image::ExifTool::PDF::Pages', 91 }, 155 P => { 156 Name => 'UserAccess', 157 ValueConv => '$val & 0x0f3c', # ignore reserved bits 158 PrintConvColumns => 2, 159 PrintConv => { BITMASK => { 160 2 => 'Print', 161 3 => 'Modify', 162 4 => 'Copy', 163 5 => 'Annotate', 164 8 => 'Fill forms', 165 9 => 'Extract', 166 10 => 'Assemble', 167 11 => 'Print high-res', 168 }}, 92 169 }, 93 170 ); … … 95 172 # tags in PDF Pages directory 96 173 %Image::ExifTool::PDF::Pages = ( 97 GROUPS => { 2 => ' Image' },174 GROUPS => { 2 => 'Document' }, 98 175 Count => 'PageCount', 99 176 Kids => { 100 SubDirectory => { 101 TagTable => 'Image::ExifTool::PDF::Kids', 102 }, 177 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Kids' }, 178 }, 179 ); 180 181 # tags in PDF Perms directory 182 %Image::ExifTool::PDF::Perms = ( 183 NOTES => 'Additional document permissions imposed by digital signatures.', 184 DocMDP => { 185 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' }, 186 }, 187 FieldMDP => { 188 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' }, 189 }, 190 UR3 => { 191 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Signature' }, 103 192 }, 104 193 ); … … 107 196 %Image::ExifTool::PDF::Kids = ( 108 197 Metadata => { 109 SubDirectory => { 110 TagTable => 'Image::ExifTool::PDF::Metadata', 111 }, 198 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Metadata' }, 112 199 }, 113 200 PieceInfo => { 114 SubDirectory => { 115 TagTable => 'Image::ExifTool::PDF::PieceInfo', 116 }, 201 SubDirectory => { TagTable => 'Image::ExifTool::PDF::PieceInfo' }, 117 202 }, 118 203 Resources => { 119 SubDirectory => { 120 TagTable => 'Image::ExifTool::PDF::Resources', 121 }, 204 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Resources' }, 122 205 }, 123 206 ); … … 126 209 %Image::ExifTool::PDF::Resources = ( 127 210 ColorSpace => { 128 SubDirectory => { 129 TagTable => 'Image::ExifTool::PDF::ColorSpace', 130 }, 211 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ColorSpace' }, 131 212 }, 132 213 ); … … 135 216 %Image::ExifTool::PDF::ColorSpace = ( 136 217 DefaultRGB => { 137 SubDirectory => { 138 TagTable => 'Image::ExifTool::PDF::DefaultRGB', 139 }, 218 SubDirectory => { TagTable => 'Image::ExifTool::PDF::DefaultRGB' }, 140 219 }, 141 220 ); … … 144 223 %Image::ExifTool::PDF::DefaultRGB = ( 145 224 ICCBased => { 146 SubDirectory => { 147 TagTable => 'Image::ExifTool::PDF::ICCBased', 148 }, 225 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ICCBased' }, 149 226 }, 150 227 ); … … 152 229 # tags in PDF ICCBased directory 153 230 %Image::ExifTool::PDF::ICCBased = ( 154 stream => { 155 SubDirectory => { 156 TagTable => 'Image::ExifTool::ICC_Profile::Main', 157 }, 231 _stream => { 232 SubDirectory => { TagTable => 'Image::ExifTool::ICC_Profile::Main' }, 158 233 }, 159 234 ); … … 162 237 %Image::ExifTool::PDF::PieceInfo = ( 163 238 AdobePhotoshop => { 164 SubDirectory => { 165 TagTable => 'Image::ExifTool::PDF::AdobePhotoshop', 239 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AdobePhotoshop' }, 240 }, 241 Illustrator => { 242 # assume this is an illustrator file if it contains this directory 243 # and doesn't have a ".PDF" extension 244 Condition => q{ 245 $self->OverrideFileType("AI") unless $$self{FILE_EXT} and $$self{FILE_EXT} eq 'PDF'; 246 return 1; 166 247 }, 248 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Illustrator' }, 167 249 }, 168 250 ); … … 171 253 %Image::ExifTool::PDF::AdobePhotoshop = ( 172 254 Private => { 173 SubDirectory => { 174 TagTable => 'Image::ExifTool::PDF::Private', 175 }, 255 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Private' }, 256 }, 257 ); 258 259 # tags in PDF Illustrator directory 260 %Image::ExifTool::PDF::Illustrator = ( 261 Private => { 262 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIPrivate' }, 176 263 }, 177 264 ); … … 180 267 %Image::ExifTool::PDF::Private = ( 181 268 ImageResources => { 182 SubDirectory => { 183 TagTable => 'Image::ExifTool::PDF::ImageResources', 269 SubDirectory => { TagTable => 'Image::ExifTool::PDF::ImageResources' }, 270 }, 271 ); 272 273 # tags in PDF AI Private directory 274 %Image::ExifTool::PDF::AIPrivate = ( 275 GROUPS => { 2 => 'Document' }, 276 EXTRACT_UNKNOWN => 0, # extract known but numbered tags 277 AIMetaData => { 278 SubDirectory => { TagTable => 'Image::ExifTool::PDF::AIMetaData' }, 279 }, 280 AIPrivateData => { 281 Notes => q{ 282 the ExtractEmbedded option enables information to be extracted from embedded 283 PostScript documents in the AIPrivateData stream 184 284 }, 285 SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' }, 286 }, 287 RoundTripVersion => { }, 288 ContainerVersion => { }, 289 CreatorVersion => { }, 290 ); 291 292 # tags in PDF AIMetaData directory 293 %Image::ExifTool::PDF::AIMetaData = ( 294 _stream => { 295 SubDirectory => { TagTable => 'Image::ExifTool::PostScript::Main' }, 185 296 }, 186 297 ); … … 188 299 # tags in PDF ImageResources directory 189 300 %Image::ExifTool::PDF::ImageResources = ( 190 stream => { 191 SubDirectory => { 192 TagTable => 'Image::ExifTool::Photoshop::Main', 193 }, 301 _stream => { 302 SubDirectory => { TagTable => 'Image::ExifTool::Photoshop::Main' }, 303 }, 304 ); 305 306 # tags in PDF MarkInfo directory 307 %Image::ExifTool::PDF::MarkInfo = ( 308 GROUPS => { 2 => 'Document' }, 309 Marked => { 310 Name => 'TaggedPDF', 311 Notes => "not a Tagged PDF if this tag is missing", 312 PrintConv => { 'true' => 'Yes', 'false' => 'No' }, 194 313 }, 195 314 ); … … 197 316 # tags in PDF Metadata directory 198 317 %Image::ExifTool::PDF::Metadata = ( 199 GROUPS => { 2 => ' Image' },318 GROUPS => { 2 => 'Document' }, 200 319 XML_stream => { # this is the stream for a Subtype /XML dictionary (not a real tag) 201 320 Name => 'XMP', 202 SubDirectory => { 203 TagTable => 'Image::ExifTool::XMP::Main', 321 SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' }, 322 }, 323 ); 324 325 # tags in PDF signature directories (DocMDP, FieldMDP or UR3) 326 %Image::ExifTool::PDF::Signature = ( 327 GROUPS => { 2 => 'Document' }, 328 ContactInfo => 'SignerContactInfo', 329 Location => 'SigningLocation', 330 M => { 331 Name => 'SigningDate', 332 Format => 'date', 333 Groups => { 2 => 'Time' }, 334 PrintConv => '$self->ConvertDateTime($val)', 335 }, 336 Name => 'SigningAuthority', 337 Reason => 'SigningReason', 338 Reference => { 339 SubDirectory => { TagTable => 'Image::ExifTool::PDF::Reference' }, 340 }, 341 Prop_AuthTime => { 342 Name => 'AuthenticationTime', 343 PrintConv => 'ConvertTimeSpan($val) . " ago"', 344 }, 345 Prop_AuthType => 'AuthenticationType', 346 ); 347 348 # tags in PDF Reference directory 349 %Image::ExifTool::PDF::Reference = ( 350 TransformParams => { 351 SubDirectory => { TagTable => 'Image::ExifTool::PDF::TransformParams' }, 352 }, 353 ); 354 355 # tags in PDF TransformParams directory 356 %Image::ExifTool::PDF::TransformParams = ( 357 GROUPS => { 2 => 'Document' }, 358 Annots => { 359 Name => 'AnnotationUsageRights', 360 Notes => 'UR3 signatures only', 361 List => 1, 362 }, 363 Document => { 364 Name => 'DocumentUsageRights', 365 Notes => 'UR3 signatures only', 366 List => 1, 367 }, 368 Form => { 369 Name => 'FormUsageRights', 370 Notes => 'UR3 signatures only', 371 List => 1, 372 }, 373 Signature => { 374 Name => 'SignatureUsageRights', 375 Notes => 'UR3 signatures only', 376 List => 1, 377 }, 378 EF => { 379 Name => 'EmbeddedFileUsageRights', 380 Notes => 'UR3 signatures only', 381 List => 1, 382 }, 383 Msg => { 384 Name => 'UsageRightsMessage', 385 Notes => 'UR3 signatures only', 386 }, 387 P => { 388 Name => 'ModificationPermissions', 389 Notes => q{ 390 1-3 for DocMDP signatures, default 2; true/false for UR3 signatures, default 391 false 204 392 }, 393 PrintConv => { 394 1 => 'No changes permitted', 395 2 => 'Fill forms, Create page templates, Sign', 396 3 => 'Fill forms, Create page templates, Sign, Create/Delete/Edit annotations', 397 'true' => 'Restrict al applications to reader permissions', 398 'false' => 'Do not restrict applications to reader permissions', 399 }, 400 }, 401 Action => { 402 Name => 'FieldPermissions', 403 Notes => 'FieldMDP signatures only', 404 PrintConv => { 405 'All' => 'Disallow changes to all form fields', 406 'Include' => 'Disallow changes to specified form fields', 407 'Exclude' => 'Allow changes to specified form fields', 408 }, 409 }, 410 Fields => { 411 Notes => 'FieldMDP signatures only', 412 Name => 'FormFields', 413 List => 1, 205 414 }, 206 415 ); … … 212 421 213 422 #------------------------------------------------------------------------------ 214 # Issue one warning of each type215 # Inputs: 0) ExifTool object reference, 1) warning216 sub WarnOnce($$)423 # AutoLoad our writer routines when necessary 424 # 425 sub AUTOLOAD 217 426 { 218 my ($exifTool, $warn) = @_; 219 unless ($warnedOnce{$warn}) { 220 $warnedOnce{$warn} = 1; 221 $exifTool->Warn($warn); 222 } 427 return Image::ExifTool::DoAutoLoad($AUTOLOAD, @_); 223 428 } 224 429 225 430 #------------------------------------------------------------------------------ 226 # Set PDF format error warning227 # Inputs: 0) ExifTool object reference, 1) error string228 # Returns: 1229 sub PDFErr($$)431 # Convert from PDF to EXIF-style date/time 432 # Inputs: 0) PDF date/time string (D:YYYYmmddHHMMSS+HH'MM') 433 # Returns: EXIF date string (YYYY:mm:dd HH:MM:SS+HH:MM) 434 sub ConvertPDFDate($) 230 435 { 231 my ($exifTool, $str) = @_; 232 $exifTool->Warn("PDF format error ($str)"); 233 return 1; 234 } 235 236 #------------------------------------------------------------------------------ 237 # Convert from PDF to EXIF-style date/time 238 # Inputs: 0) ExifTool object reference, 239 # 1) PDF date/time string (D:yyyymmddhhmmss+hh'mm') 240 # Returns: EXIF date string (yyyy:mm:dd hh:mm:ss+hh:mm) 241 sub ConvertPDFDate($$) 242 { 243 my ($exifTool, $date) = @_; 436 my $date = shift; 244 437 # remove optional 'D:' prefix 245 438 $date =~ s/^D://; 246 439 # fill in default values if necessary 247 # yyyymmddhhmmss440 # YYYYmmddHHMMSS 248 441 my $default = '00000101000000'; 249 442 if (length $date < length $default) { … … 253 446 $date = "$1:$2:$3 $4:$5:$6"; 254 447 if ($7) { 255 my @t = split /'/, $7; 256 $date .= $t[0]; 257 $date .= ':' . ($t[1] || 0) if $t[0] ne 'Z'; 258 } 259 return $exifTool->ConvertDateTime($date); 448 my $tz = $7; 449 if ($tz =~ /^\s*Z/i) { 450 # ignore any "HH'mm'" after the Z (OS X 10.6 does this) 451 $date .= 'Z'; 452 # tolerate some improper formatting in timezone specification 453 } elsif ($tz =~ /^\s*([-+])\s*(\d+)[': ]+(\d*)/) { 454 $date .= $1 . $2 . ':' . ($3 || '00'); 455 } 456 } 457 return $date; 260 458 } 261 459 262 460 #------------------------------------------------------------------------------ 263 # Locate an object in the XRref tables 264 # Inputs: 0) XRef reference, 1) object reference string 265 # Returns: offset to object in file, or undefined on error 266 sub LocateObject($$) 461 # Locate any object in the XRef tables (including compressed objects) 462 # Inputs: 0) XRef reference, 1) object reference string (or free object number) 463 # Returns: offset to object in file or compressed object reference string, 464 # 0 if object is free, or undefined on error 465 sub LocateAnyObject($$) 267 466 { 268 467 my ($xref, $ref) = @_; 269 468 return undef unless $xref; 270 return $$xref{$ref} if $$xref{$ref};469 return $$xref{$ref} if exists $$xref{$ref}; 271 470 # get the object number 272 471 return undef unless $ref =~ /^(\d+)/; 273 472 my $objNum = $1; 473 # return 0 if the object number has been reused (old object is free) 474 return 0 if defined $$xref{$objNum}; 274 475 # 275 476 # scan our XRef stream dictionaries for this object … … 283 484 next if $objNum < $$index[0]; 284 485 # scan the tables for the specified object 285 my $size = $$dict{ entry_size};486 my $size = $$dict{_entry_size}; 286 487 my $num = scalar(@$index) / 2; 287 488 my $tot = 0; … … 291 492 my $count = $$index[$i*2+1]; 292 493 # table is in ascending order, so quit if we have passed the object 293 last if $objNum >= $start + $count;294 if ($objNum >= $start) {494 last if $objNum < $start; 495 if ($objNum < $start + $count) { 295 496 my $offset = $size * ($objNum - $start + $tot); 296 last if $offset + $size > length $$dict{ stream};297 my @c = unpack("x$offset C$size", $$dict{ stream});497 last if $offset + $size > length $$dict{_stream}; 498 my @c = unpack("x$offset C$size", $$dict{_stream}); 298 499 # extract values from this table entry 299 500 # (can be 1, 2, 3, 4, etc.. bytes per value) 300 my (@t, $j, $k , $ref2);501 my (@t, $j, $k); 301 502 my $w = $$dict{W}; 302 503 for ($j=0; $j<3; ++$j) { 303 504 # use default value if W entry is 0 (as per spec) 304 $$w[$j] or $t[$j] = ($j ? 1 : 0), next; 505 # - 0th element defaults to 1, others default to 0 506 $$w[$j] or $t[$j] = ($j ? 0 : 1), next; 305 507 $t[$j] = shift(@c); 306 508 for ($k=1; $k < $$w[$j]; ++$k) { … … 308 510 } 309 511 } 512 # by default, use "o g R" as the xref key 513 # (o = object number, g = generation number) 514 my $ref2 = "$objNum $t[2] R"; 310 515 if ($t[0] == 1) { 311 # normal object reference: use "o g R" as hash ref 312 # (o = object number, g = generation number) 313 $ref2 = "$objNum $t[2] R"; 314 # xref is offset of object from start 516 # normal object reference: 517 # $t[1]=offset of object from start, $t[2]=generation number 315 518 $$xref{$ref2} = $t[1]; 316 519 } elsif ($t[0] == 2) { 317 520 # compressed object reference: 521 # $t[1]=stream object number, $t[2]=index of object in stream 318 522 $ref2 = "$objNum 0 R"; 319 # xref is object index and stream object reference320 523 $$xref{$ref2} = "I$t[2] $t[1] 0 R"; 524 } elsif ($t[0] == 0) { 525 # free object: 526 # $t[1]=next free object in linked list, $t[2]=generation number 527 $$xref{$ref2} = 0; 321 528 } else { 322 last; 529 # treat as a null object 530 $$xref{$ref2} = undef; 323 531 } 532 $$xref{$objNum} = $t[1]; # remember offsets by object number too 324 533 return $$xref{$ref} if $ref eq $ref2; 534 return 0; # object is free or was reused 325 535 } 326 536 $tot += $count; … … 331 541 332 542 #------------------------------------------------------------------------------ 543 # Locate a regular object in the XRef tables (does not include compressed objects) 544 # Inputs: 0) XRef reference, 1) object reference string (or free object number) 545 # Returns: offset to object in file, 0 if object is free, 546 # or undef on error or if object was compressed 547 sub LocateObject($$) 548 { 549 my ($xref, $ref) = @_; 550 my $offset = LocateAnyObject($xref, $ref); 551 return undef if $offset and $offset =~ /^I/; 552 return $offset; 553 } 554 555 #------------------------------------------------------------------------------ 333 556 # Fetch indirect object from file (from inside a stream if required) 334 # Inputs: 0) ExifTool object reference, 1) object reference string, 2) xref lookup, 557 # Inputs: 0) ExifTool object reference, 1) object reference string, 558 # 2) xref lookup, 3) object name (for warning messages) 335 559 # Returns: object data or undefined on error 560 # Notes: sets $lastFetched to the object reference, or undef if the object 561 # was extracted from an encrypted stream 336 562 sub FetchObject($$$$) 337 563 { 338 564 my ($exifTool, $ref, $xref, $tag) = @_; 339 565 $lastFetched = $ref; # save this for decoding if necessary 340 my $offset = LocateObject($xref, $ref); 566 my $offset = LocateAnyObject($xref, $ref); 567 $lastOffset = $offset; 341 568 unless ($offset) { 342 $exifTool->Warn("Bad $tag reference") ;569 $exifTool->Warn("Bad $tag reference") unless defined $offset; 343 570 return undef; 344 571 } … … 348 575 my ($objNum) = split ' ', $ref; # save original object number 349 576 $ref = $offset; # now a reference to the containing stream object 350 my$obj = $streamObjs{$ref};577 $obj = $streamObjs{$ref}; 351 578 unless ($obj) { 352 579 # don't try to load the same object stream twice … … 359 586 return undef unless $$obj{First} and $$obj{N}; 360 587 return undef unless DecodeStream($exifTool, $obj); 361 # add a special ' table' entry to this dictionary which contains588 # add a special '_table' entry to this dictionary which contains 362 589 # the list of object number/offset pairs from the stream header 363 590 my $num = $$obj{N} * 2; 364 my @table = split ' ', $$obj{ stream}, $num;591 my @table = split ' ', $$obj{_stream}, $num; 365 592 return undef unless @table == $num; 366 593 # remove everything before first object in stream 367 $$obj{ stream} = substr($$obj{stream}, $$obj{First});368 $table[$num-1] =~ s/^(\d+).*/$1/ ; # trim excess from last number369 $$obj{ table} = \@table;594 $$obj{_stream} = substr($$obj{_stream}, $$obj{First}); 595 $table[$num-1] =~ s/^(\d+).*/$1/s; # trim excess from last number 596 $$obj{_table} = \@table; 370 597 # save the object stream so we don't have to re-load it later 371 598 $streamObjs{$ref} = $obj; … … 373 600 # verify that we have the specified object 374 601 my $i = 2 * $index; 375 my $table = $$obj{ table};602 my $table = $$obj{_table}; 376 603 unless ($index < $$obj{N} and $$table[$i] == $objNum) { 377 604 $exifTool->Warn("Bad index for stream object $tag"); … … 382 609 # the next offset to get the object length) 383 610 $offset = $$table[$i + 1]; 384 my $len = ($$table[$i + 3] || length($$obj{stream})) - $offset; 385 $data = substr($$obj{stream}, $offset, $len); 611 my $len = ($$table[$i + 3] || length($$obj{_stream})) - $offset; 612 $data = substr($$obj{_stream}, $offset, $len); 613 # avoid re-decrypting data in already decrypted streams 614 undef $lastFetched if $cryptStream; 386 615 return ExtractObject($exifTool, \$data); 387 616 } … … 396 625 } 397 626 return ExtractObject($exifTool, \$data, $raf, $xref); 627 } 628 629 #------------------------------------------------------------------------------ 630 # Convert PDF value to something readable 631 # Inputs: 0) PDF object data 632 # Returns: converted object 633 sub ReadPDFValue($) 634 { 635 my $str = shift; 636 # decode all strings in an array 637 if (ref $str eq 'ARRAY') { 638 # create new list to not alter the original data when rewriting 639 my ($val, @vals); 640 foreach $val (@$str) { 641 push @vals, ReadPDFValue($val); 642 } 643 return \@vals; 644 } 645 length $str or return $str; 646 my $delim = substr($str, 0, 1); 647 if ($delim eq '(') { # literal string 648 $str = $1 if $str =~ /.*?\((.*)\)/s; # remove brackets 649 # decode escape sequences in literal strings 650 while ($str =~ /\\(.)/sg) { 651 my $n = pos($str) - 2; 652 my $c = $1; 653 my $r; 654 if ($c =~ /[0-7]/) { 655 # get up to 2 more octal digits 656 $c .= $1 if $str =~ /\G([0-7]{1,2})/g; 657 # convert octal escape code 658 $r = chr(oct($c) & 0xff); 659 } elsif ($c eq "\x0d") { 660 # the string is continued if the line ends with '\' 661 # (also remove "\x0d\x0a") 662 $c .= $1 if $str =~ /\G(\x0a)/g; 663 $r = ''; 664 } elsif ($c eq "\x0a") { 665 $r = ''; 666 } else { 667 # convert escaped characters 668 ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/; 669 } 670 substr($str, $n, length($c)+1) = $r; 671 # continue search after this character 672 pos($str) = $n + length($r); 673 } 674 Crypt(\$str, $lastFetched) if $cryptString; 675 } elsif ($delim eq '<') { # hex string 676 # decode hex data 677 $str =~ tr/0-9A-Fa-f//dc; 678 $str .= '0' if length($str) & 0x01; # (by the spec) 679 $str = pack('H*', $str); 680 Crypt(\$str, $lastFetched) if $cryptString; 681 } elsif ($delim eq '/') { # name 682 $str = substr($str, 1); 683 # convert escape codes (PDF 1.2 or later) 684 $str =~ s/#([0-9a-f]{2})/chr(hex($1))/sgei if $pdfVer >= 1.2; 685 } 686 return $str; 398 687 } 399 688 … … 408 697 # d) string, name, integer, boolean, null --> scalar value 409 698 # - updates $$dataPt on return to contain unused data 410 # - creates two bogus entries (' stream' and 'tags') in dictionaries to represent411 # the stream data and a list of the tags (not including ' stream' and 'tags')699 # - creates two bogus entries ('_stream' and '_tags') in dictionaries to represent 700 # the stream data and a list of the tags (not including '_stream' and '_tags') 412 701 # in their original order 413 702 sub ExtractObject($$;$$) … … 419 708 420 709 for (;;) { 421 if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/ ) {710 if ($$dataPt =~ /^\s*(<{1,2}|\[|\()/s) { 422 711 $delim = $1; 712 $$dataPt =~ s/^\s+//; # remove leading white space 423 713 $objData = ReadToNested($dataPt, $raf); 424 714 return undef unless defined $objData; 425 715 last; 426 } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{} ) {716 } elsif ($$dataPt =~ s{^\s*(\S[^[(/<>\s]*)\s*}{}s) { 427 717 # 428 718 # extract boolean, numerical, string, name, null object or indirect reference … … 430 720 $objData = $1; 431 721 # look for an indirect reference 432 if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R// ) {722 if ($objData =~ /^\d+$/ and $$dataPt =~ s/^(\d+)\s+R//s) { 433 723 $objData .= "$1 R"; 434 724 $objData = \$objData; # return scalar reference … … 440 730 } 441 731 # 442 # extract literal string 443 # 444 if ($delim eq '(') { 445 $objData = $1 if $objData =~ /.*?\((.*)\)/s; # remove brackets 446 # decode escape sequences in literal strings 447 while ($objData =~ /\\(.)/sg) { 448 my $n = pos($objData) - 2; 449 my $c = $1; 450 my $r; 451 if ($c =~ /[0-7]/) { 452 # get up to 2 more octal digits 453 $c .= $1 if $objData =~ /\G([0-7]{1,2})/g; 454 # convert octal escape code 455 $r = chr(oct($c) & 0xff); 456 } elsif ($c eq "\x0d") { 457 # the string is continued if the line ends with '\' 458 # (also remove "\x0d\x0a") 459 $c .= $1 if $objData =~ /\G(\x0a)/g; 460 $r = ''; 461 } elsif ($c eq "\x0a") { 462 # (also remove "\x0a\x0d") 463 $c .= $1 if $objData =~ /\G(\x0d)/g; 464 $r = ''; 465 } else { 466 # convert escaped characters 467 ($r = $c) =~ tr/nrtbf/\n\r\t\b\f/; 468 } 469 substr($objData, $n, length($c)+1) = $r; 470 # contine search after this character 471 pos($objData) = $n + length($r); 472 } 473 Decrypt(\$objData) if $cryptInfo; 474 # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary 475 if ($objData =~ s/^\xfe\xff//) { 476 $objData = $exifTool->Unicode2Charset($objData, 'MM'); 477 } 732 # return literal string or hex string without parsing 733 # 734 if ($delim eq '(' or $delim eq '<') { 478 735 return $objData; 479 736 # 480 # extract hex string481 #482 } elsif ($delim eq '<') {483 # decode hex data484 $objData =~ tr/0-9A-Fa-f//dc;485 $objData .= '0' if length($objData) & 0x01; # (by the spec)486 $objData = pack('H*', $objData);487 Decrypt(\$objData) if $cryptInfo;488 return $objData;489 #490 737 # extract array 491 738 # 492 739 } elsif ($delim eq '[') { 493 $objData =~ /.*?\[(.*)\]/s or return ; # remove brackets494 my $data = $1; 740 $objData =~ /.*?\[(.*)\]/s or return undef; 741 my $data = $1; # brackets removed 495 742 my @list; 496 743 for (;;) { … … 536 783 } elsif ($val =~ /^\d/) { 537 784 my $pos = pos($objData); 538 if ($objData =~ /\G\s+(\d+)\s+R/ g) {785 if ($objData =~ /\G\s+(\d+)\s+R/sg) { 539 786 $val = \ "$val $1 R"; # make a reference 540 787 } else { … … 544 791 if ($$dict{$tag}) { 545 792 # duplicate dictionary entries are not allowed 546 $exifTool->Warn( "Duplicate $tag entry in dictionary (ignored)");793 $exifTool->Warn('Duplicate $tag entry in dictionary (ignored)'); 547 794 } else { 548 795 # save the entry … … 552 799 } 553 800 return undef unless @tags; 554 $$dict{ tags} = \@tags;801 $$dict{_tags} = \@tags; 555 802 return $dict unless $raf; # direct objects can not have streams 556 803 # … … 563 810 my $oldpos = $raf->Tell(); 564 811 # get the location of the object specifying the length 812 # (compressed objects are not allowed) 565 813 my $offset = LocateObject($xref, $length) or return $dict; 566 $raf->Seek($offset, 0) or $exifTool->Warn("Bad Length offset"), return $dict; 814 $offset or $exifTool->Warn('Bad Length object'), return $dict; 815 $raf->Seek($offset, 0) or $exifTool->Warn('Bad Length offset'), return $dict; 567 816 # verify that we are reading the expected object 568 $raf->ReadLine($data) or $exifTool->Warn( "Error reading Length data"), return $dict;817 $raf->ReadLine($data) or $exifTool->Warn('Error reading Length data'), return $dict; 569 818 $length =~ s/R/obj/; 570 819 unless ($data =~ /^$length/) { … … 572 821 return $dict; 573 822 } 574 $raf->ReadLine($data) or $exifTool->Warn( "Error reading stream Length"), return $dict;575 $data =~ /(\d+)/ or $exifTool->Warn( "Stream length not found"), return $dict;823 $raf->ReadLine($data) or $exifTool->Warn('Error reading stream Length'), return $dict; 824 $data =~ /(\d+)/ or $exifTool->Warn('Stream length not found'), return $dict; 576 825 $length = $1; 577 826 $raf->Seek($oldpos, 0); # restore position to start of stream … … 585 834 $$dataPt .= $data if $raf->ReadLine($data); 586 835 # remove our stream header 587 $$dataPt =~ s/^ .*stream(\x0a|\x0d\x0a)//s;836 $$dataPt =~ s/^\s*stream(\x0a|\x0d\x0a)//s; 588 837 my $more = $length - length($$dataPt); 589 838 if ($more > 0) { 590 839 unless ($raf->Read($data, $more) == $more) { 591 $exifTool->Warn( "Error reading stream data");840 $exifTool->Warn('Error reading stream data'); 592 841 $$dataPt = ''; 593 842 return $dict; 594 843 } 595 $$dict{ stream} = $$dataPt . $data;844 $$dict{_stream} = $$dataPt . $data; 596 845 $$dataPt = ''; 597 846 } elsif ($more < 0) { 598 $$dict{ stream} = substr($$dataPt, 0, $length);847 $$dict{_stream} = substr($$dataPt, 0, $length); 599 848 $$dataPt = substr($$dataPt, $length); 600 849 } else { 601 $$dict{ stream} = $$dataPt;850 $$dict{_stream} = $$dataPt; 602 851 $$dataPt = ''; 603 852 } … … 616 865 # - updates data reference with trailing data 617 866 # - unescapes characters in literal strings 867 my %closingDelim = ( # lookup for matching delimiter 868 '(' => ')', 869 '[' => ']', 870 '<' => '>', 871 '<<' => '>>', 872 ); 618 873 sub ReadToNested($;$) 619 874 { 620 875 my ($dataPt, $raf) = @_; 621 # matching closing delimiters622 my %closingDelim = (623 '<<' => '>>',624 '(' => ')',625 '[' => ']',626 '<' => '>',627 );628 876 my @delim = (''); # closing delimiter list, most deeply nested first 629 877 pos($$dataPt) = 0; # begin at start of data … … 664 912 pos($$dataPt) = pos($$dataPt) - 1; 665 913 } 666 my $delim = shift @delim;# remove from nesting list914 shift @delim; # remove from nesting list 667 915 next if $delim[0]; # keep going if we have more nested delimiters 668 916 my $pos = pos($$dataPt); … … 682 930 my ($exifTool, $dict) = @_; 683 931 684 return 0 unless $$dict{stream}; # no stream to decode 685 # apply decryption first if required 686 if ($cryptInfo and not $$dict{decrypted}) { 687 $$dict{decrypted} = 1; 688 if ($$cryptInfo{meta} or ($$dict{Type} and $$dict{Type} ne '/Metadata')) { 689 Decrypt(\$$dict{stream}); 690 } 691 } 692 return 1 unless $$dict{Filter}; 693 if ($$dict{Filter} eq '/FlateDecode') { 694 if (eval 'require Compress::Zlib') { 695 my $inflate = Compress::Zlib::inflateInit(); 696 my ($buff, $stat); 697 $inflate and ($buff, $stat) = $inflate->inflate($$dict{stream}); 698 if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) { 699 $$dict{stream} = $buff; 700 # move Filter to prevent double decoding 701 $$dict{oldFilter} = $$dict{Filter}; 702 $$dict{Filter} = ''; 932 return 0 unless $$dict{_stream}; # no stream to decode 933 934 # get list of filters 935 my (@filters, @decodeParms, $filter); 936 if (ref $$dict{Filter} eq 'ARRAY') { 937 @filters = @{$$dict{Filter}}; 938 } elsif (defined $$dict{Filter}) { 939 @filters = ($$dict{Filter}); 940 } 941 # apply decryption first if required (and if the default encryption 942 # has not been overridden by a Crypt filter. Note: the Crypt filter 943 # must be first in the Filter array: ref 3, page 38) 944 unless (defined $$dict{_decrypted} or ($filters[0] and $filters[0] eq '/Crypt')) { 945 CryptStream($dict, $lastFetched); 946 } 947 return 1 unless $$dict{Filter}; # Filter entry is mandatory 948 return 0 if defined $$dict{_filtered}; # avoid double-filtering 949 $$dict{_filtered} = 1; # set flag to prevent double-filtering 950 951 # get array of DecodeParms dictionaries 952 if (ref $$dict{DecodeParms} eq 'ARRAY') { 953 @decodeParms = @{$$dict{DecodeParms}}; 954 } else { 955 @decodeParms = ($$dict{DecodeParms}); 956 } 957 foreach $filter (@filters) { 958 my $decodeParms = shift @decodeParms; 959 960 if ($filter eq '/FlateDecode') { 961 if (eval 'require Compress::Zlib') { 962 my $inflate = Compress::Zlib::inflateInit(); 963 my ($buff, $stat); 964 $inflate and ($buff, $stat) = $inflate->inflate($$dict{_stream}); 965 if ($inflate and $stat == Compress::Zlib::Z_STREAM_END()) { 966 $$dict{_stream} = $buff; 967 } else { 968 $exifTool->Warn('Error inflating stream'); 969 return 0; 970 } 703 971 } else { 704 $exifTool->Warn ('Error inflating stream');972 $exifTool->WarnOnce('Install Compress::Zlib to process filtered streams'); 705 973 return 0; 706 974 } 707 } else { 708 WarnOnce($exifTool,'Install Compress::Zlib to decode filtered streams'); 975 # apply anti-predictor if necessary 976 next unless ref $decodeParms eq 'HASH'; 977 my $pre = $$decodeParms{Predictor}; 978 next unless $pre and $pre != 1; 979 if ($pre != 12) { 980 # currently only support 'up' prediction 981 $exifTool->WarnOnce("FlateDecode Predictor $pre not currently supported"); 982 return 0; 983 } 984 my $cols = $$decodeParms{Columns}; 985 unless ($cols) { 986 # currently only support 'up' prediction 987 $exifTool->WarnOnce('No Columns for decoding stream'); 988 return 0; 989 } 990 my @bytes = unpack('C*', $$dict{_stream}); 991 my @pre = (0) x $cols; # initialize predictor array 992 my $buff = ''; 993 while (@bytes > $cols) { 994 unless (($_ = shift @bytes) == 2) { 995 $exifTool->WarnOnce("Unsupported PNG filter $_"); # (yes, PNG) 996 return 0; 997 } 998 foreach (@pre) { 999 $_ = ($_ + shift(@bytes)) & 0xff; 1000 } 1001 $buff .= pack('C*', @pre); 1002 } 1003 $$dict{_stream} = $buff; 1004 1005 } elsif ($filter eq '/Crypt') { 1006 1007 # (we shouldn't have to check the _decrypted flag since we 1008 # already checked the _filtered flag, but what the heck...) 1009 next if defined $$dict{_decrypted}; 1010 # assume Identity filter (the default) if DecodeParms are missing 1011 next unless ref $decodeParms eq 'HASH'; 1012 my $name = $$decodeParms{Name}; 1013 next unless defined $name or $name eq 'Identity'; 1014 if ($name ne 'StdCF') { 1015 $exifTool->WarnOnce("Unsupported Crypt Filter $name"); 1016 return 0; 1017 } 1018 unless ($cryptInfo) { 1019 $exifTool->WarnOnce('Missing Encrypt StdCF entry'); 1020 return 0; 1021 } 1022 # decrypt the stream manually because we want to: 1023 # 1) ignore $cryptStream (StmF) setting 1024 # 2) ignore EncryptMetadata setting (I can't find mention of how to 1025 # reconcile this in the spec., but this would make sense) 1026 # 3) avoid adding the crypt key extension (ref 3, page 58, Algorithm 1b) 1027 # 4) set _decrypted flag so we will recrypt according to StmF when 1028 # writing (since we don't yet write Filter'd streams) 1029 Crypt(\$$dict{_stream}, 'none'); 1030 $$dict{_decrypted} = ($cryptStream ? 1 : 0); 1031 1032 } elsif ($filter ne '/Identity') { 1033 1034 $exifTool->WarnOnce("Unsupported Filter $filter"); 709 1035 return 0; 710 1036 } 711 #712 # apply anti-predictor if necessary713 #714 return 1 unless $$dict{DecodeParms};715 my $pre = $dict->{DecodeParms}->{Predictor};716 return 1 unless $pre and $pre != 1;717 if ($pre != 12) {718 # currently only support 'up' prediction719 WarnOnce($exifTool,"FlateDecode Predictor $pre not currently supported");720 return 0;721 }722 my $cols = $dict->{DecodeParms}->{Columns};723 unless ($cols) {724 # currently only support 'up' prediction725 WarnOnce($exifTool,'No Columns for decoding stream');726 return 0;727 }728 my @bytes = unpack('C*', $$dict{stream});729 my @pre = (0) x $cols; # initialize predictor array730 my $buff = '';731 while (@bytes > $cols) {732 unless (($_ = shift @bytes) == 2) {733 WarnOnce($exifTool, "Unsupported PNG filter $_");734 return 0;735 }736 foreach (@pre) {737 $_ = ($_ + shift(@bytes)) & 0xff;738 }739 $buff .= pack('C*', @pre);740 }741 $$dict{stream} = $buff;742 } else {743 WarnOnce($exifTool, "Unsupported Filter $$dict{Filter}");744 return 0;745 1037 } 746 1038 return 1; … … 795 1087 # Inputs: 0) ExifTool object reference, 1) Encrypt dictionary reference, 796 1088 # 2) ID from file trailer dictionary 797 # Returns: error string or undef on success 1089 # Returns: error string or undef on success (and sets $cryptInfo) 798 1090 sub DecryptInit($$$) 799 1091 { 1092 local $_; 800 1093 my ($exifTool, $encrypt, $id) = @_; 1094 1095 undef $cryptInfo; 801 1096 unless ($encrypt and ref $encrypt eq 'HASH') { 802 1097 return 'Error loading Encrypt object'; … … 806 1101 return 'Encrypt dictionary has no Filter!'; 807 1102 } 1103 # extract some interesting tags 808 1104 my $ver = $$encrypt{V} || 0; 809 1105 my $rev = $$encrypt{R} || 0; 810 $exifTool->FoundTag('Encryption', "$filt v$ver.$rev"); 811 unless ($$encrypt{Filter} eq '/Standard') { 812 $$encrypt{Filter} =~ s/^\///; 813 return "PDF '$$encrypt{Filter}' encryption not currently supported"; 1106 my $enc = "$filt V$ver"; 1107 $enc .= ".$rev" if $filt eq 'Standard'; 1108 $enc .= " ($1)" if $$encrypt{SubFilter} and $$encrypt{SubFilter} =~ /^\/(.*)/; 1109 $enc .= ' (' . ($$encrypt{Length} || 40) . '-bit)' if $filt eq 'Standard'; 1110 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Encrypt'); 1111 $exifTool->HandleTag($tagTablePtr, 'Filter', $enc); 1112 if ($filt ne 'Standard') { 1113 return "Encryption filter $filt not currently supported"; 1114 } elsif (not defined $$encrypt{R}) { 1115 return 'Standard security handler missing revision'; 814 1116 } 815 1117 unless ($$encrypt{O} and $$encrypt{P} and $$encrypt{U}) { 816 1118 return 'Incomplete Encrypt specification'; 817 1119 } 818 unless ($ver == 1 or $ver == 2) { 819 return "Encryption algorithm $ver currently not supported"; 1120 $exifTool->HandleTag($tagTablePtr, 'P', $$encrypt{P}); 1121 1122 my %parm; # optional parameters extracted from Encrypt dictionary 1123 1124 if ($ver == 1 or $ver == 2) { 1125 $cryptString = $cryptStream = 1; 1126 } elsif ($ver == 4 or $ver == 5) { 1127 # initialize our $cryptString and $cryptStream flags 1128 foreach ('StrF', 'StmF') { 1129 my $flagPt = $_ eq 'StrF' ? \$cryptString : \$cryptStream; 1130 $$flagPt = $$encrypt{$_}; 1131 undef $$flagPt if $$flagPt and $$flagPt eq '/Identity'; 1132 return "Unsupported $_ encryption $$flagPt" if $$flagPt and $$flagPt ne '/StdCF'; 1133 } 1134 if ($cryptString or $cryptStream) { 1135 return 'Missing or invalid Encrypt StdCF entry' unless ref $$encrypt{CF} eq 'HASH' and 1136 ref $$encrypt{CF}{StdCF} eq 'HASH' and $$encrypt{CF}{StdCF}{CFM}; 1137 my $cryptMeth = $$encrypt{CF}{StdCF}{CFM}; 1138 unless ($cryptMeth =~ /^\/(V2|AESV2|AESV3)$/) { 1139 return "Unsupported encryption method $cryptMeth"; 1140 } 1141 # set "_aesv2" or "_aesv3" flag in %$encrypt hash if AES encryption was used 1142 $$encrypt{'_' . lc($1)} = 1 if $cryptMeth =~ /^\/(AESV2|AESV3)$/; 1143 } 1144 if ($ver == 5) { 1145 # validate OE and UE entries 1146 foreach ('OE', 'UE') { 1147 return "Missing Encrypt $_ entry" unless $$encrypt{$_}; 1148 $parm{$_} = ReadPDFValue($$encrypt{$_}); 1149 return "Invalid Encrypt $_ entry" unless length $parm{$_} == 32; 1150 } 1151 require Image::ExifTool::AES; # will need this later 1152 } 1153 } else { 1154 return "Encryption version $ver not currently supported"; 820 1155 } 821 1156 $id or return "Can't decrypt (no document ID)"; 822 unless (eval 'require Digest::MD5') { 823 return 'Install Digest::MD5 to extract encrypted information'; 824 } 1157 1158 # make sure we have the necessary libraries available 1159 if ($ver < 5) { 1160 unless (eval 'require Digest::MD5') { 1161 return "Install Digest::MD5 to process encrypted PDF"; 1162 } 1163 } else { 1164 unless (eval 'require Digest::SHA') { 1165 return "Install Digest::SHA to process AES-256 encrypted PDF"; 1166 } 1167 } 1168 825 1169 # calculate file-level en/decryption key 826 1170 my $pad = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08". 827 1171 "\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A"; 828 my $key = $pad . $$encrypt{O} . pack('V', $$encrypt{P}) . $id; 829 my $rep = 1; 830 $$encrypt{meta} = 1; # set flag that Metadata is encrypted 831 if ($rev >= 3) { 832 # in rev 4 (not yet supported), metadata streams may not be encrypted 833 if ($$encrypt{EncryptMetadata} and $$encrypt{EncryptMetadata} =~ /false/i) { 834 delete $$encrypt{meta}; # Meta data isn't encrypted after all 835 $key .= "\xff\xff\xff\xff"; # must add this if metadata not encrypted 836 } 837 $rep += 50; # repeat MD5 50 more times if revision is 3 or greater 838 } 839 my ($len, $i); 840 if ($ver == 1) { 841 $len = 5; 842 } else { 843 $len = $$encrypt{Length} || 40; 844 $len >= 40 or return 'Bad Encrypt Length'; 845 $len = int($len / 8); 846 } 847 for ($i=0; $i<$rep; ++$i) { 848 $key = substr(Digest::MD5::md5($key), 0, $len); 849 } 850 # decrypt U to see if a user password is required 851 my $dat; 852 if ($rev >= 3) { 853 $dat = Digest::MD5::md5($pad . $id); 854 RC4Crypt(\$dat, $key); 855 for ($i=1; $i<=19; ++$i) { 856 my @key = unpack('C*', $key); 857 foreach (@key) { $_ ^= $i; } 858 RC4Crypt(\$dat, pack('C*', @key)); 859 } 860 $dat .= substr($$encrypt{U}, 16); 861 } else { 862 $dat = $pad; 863 RC4Crypt(\$dat, $key); 864 } 865 $dat eq $$encrypt{U} or return 'Document is password encrypted'; 866 $$encrypt{key} = $key; # save the file-level encryption key 867 $cryptInfo = $encrypt; # save a reference to the Encrypt object 1172 my $o = ReadPDFValue($$encrypt{O}); 1173 my $u = ReadPDFValue($$encrypt{U}); 1174 1175 # set flag indicating whether metadata is encrypted 1176 # (in version 4 and higher, metadata streams may not be encrypted) 1177 if ($ver < 4 or not $$encrypt{EncryptMetadata} or $$encrypt{EncryptMetadata} !~ /false/i) { 1178 $$encrypt{_meta} = 1; 1179 } 1180 # try no password first, then try provided password if available 1181 my ($try, $key); 1182 for ($try=0; ; ++$try) { 1183 my $password; 1184 if ($try == 0) { 1185 $password = ''; 1186 } elsif ($try == 1) { 1187 $password = $exifTool->Options('Password'); 1188 return 'Document is password protected (use Password option)' unless defined $password; 1189 # make sure there is no UTF-8 flag on the password 1190 if ($] >= 5.006 and (eval 'require Encode; Encode::is_utf8($password)' or $@)) { 1191 # repack by hand if Encode isn't available 1192 $password = $@ ? pack('C*',unpack('U0C*',$password)) : Encode::encode('utf8',$password); 1193 } 1194 } else { 1195 return 'Incorrect password'; 1196 } 1197 if ($ver < 5) { 1198 if (length $password) { 1199 # password must be encoding in PDFDocEncoding (ref iso32000) 1200 $password = $exifTool->Encode($password, 'PDFDoc'); 1201 # truncate or pad the password to exactly 32 bytes 1202 if (length($password) > 32) { 1203 $password = substr($password, 0, 32); 1204 } elsif (length($password) < 32) { 1205 $password .= substr($pad, 0, 32-length($password)); 1206 } 1207 } else { 1208 $password = $pad; 1209 } 1210 $key = $password . $o . pack('V', $$encrypt{P}) . $id; 1211 my $rep = 1; 1212 if ($rev == 3 or $rev == 4) { 1213 # must add this if metadata not encrypted 1214 $key .= "\xff\xff\xff\xff" unless $$encrypt{_meta}; 1215 $rep += 50; # repeat MD5 50 more times if revision is 3 or greater 1216 } 1217 my ($len, $i, $dat); 1218 if ($ver == 1) { 1219 $len = 5; 1220 } else { 1221 $len = $$encrypt{Length} || 40; 1222 $len >= 40 or return 'Bad Encrypt Length'; 1223 $len = int($len / 8); 1224 } 1225 for ($i=0; $i<$rep; ++$i) { 1226 $key = substr(Digest::MD5::md5($key), 0, $len); 1227 } 1228 # decrypt U to see if a user password is required 1229 if ($rev >= 3) { 1230 $dat = Digest::MD5::md5($pad . $id); 1231 RC4Crypt(\$dat, $key); 1232 for ($i=1; $i<=19; ++$i) { 1233 my @key = unpack('C*', $key); 1234 foreach (@key) { $_ ^= $i; } 1235 RC4Crypt(\$dat, pack('C*', @key)); 1236 } 1237 $dat .= substr($u, 16); 1238 } else { 1239 $dat = $pad; 1240 RC4Crypt(\$dat, $key); 1241 } 1242 last if $dat eq $u; # all done if this was the correct key 1243 } else { 1244 return 'Invalid O or U Encrypt entries' if length($o) < 48 or length($u) < 48; 1245 if (length $password) { 1246 # Note: this should be good for passwords containing reasonable characters, 1247 # but to be bullet-proof we need to apply the SASLprep (IETF RFC 4013) profile 1248 # of stringprep (IETF RFC 3454) to the password before encoding in UTF-8 1249 $password = $exifTool->Encode($password, 'UTF8'); 1250 $password = substr($password, 0, 127) if length($password) > 127; 1251 } 1252 # test for the owner password 1253 my $sha = Digest::SHA::sha256($password, substr($o,32,8), substr($u,0,48)); 1254 if ($sha eq substr($o, 0, 32)) { 1255 $key = Digest::SHA::sha256($password, substr($o,40,8), substr($u,0,48)); 1256 my $dat = ("\0" x 16) . $parm{OE}; 1257 # decrypt with no padding 1258 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1); 1259 return $err if $err; 1260 $key = $dat; # use this as the file decryption key 1261 last; 1262 } 1263 # test for the user password 1264 $sha = Digest::SHA::sha256($password, substr($u,32,8)); 1265 if ($sha eq substr($u, 0, 32)) { 1266 $key = Digest::SHA::sha256($password, substr($u,40,8)); 1267 my $dat = ("\0" x 16) . $parm{UE}; 1268 my $err = Image::ExifTool::AES::Crypt(\$dat, $key, 0, 1); 1269 return $err if $err; 1270 $key = $dat; # use this as the file decryption key 1271 last; 1272 } 1273 } 1274 } 1275 $$encrypt{_key} = $key; # save the file-level encryption key 1276 $cryptInfo = $encrypt; # save reference to the file-level Encrypt object 868 1277 return undef; # success! 869 1278 } 870 1279 871 1280 #------------------------------------------------------------------------------ 872 # Decrypt data 873 # Inputs: 0) data reference 874 sub Decrypt($) 1281 # Decrypt/Encrypt data 1282 # Inputs: 0) data ref 1283 # 1) PDF object reference to use as crypt key extension (may be 'none' to 1284 # avoid extending the encryption key, as for streams with Crypt Filter) 1285 # 2) encrypt flag (false for decryption) 1286 sub Crypt($$;$) 875 1287 { 876 my $dataPt = shift; 877 my $key = $$cryptInfo{key}; 878 my $len = length($key) + 5; 879 return unless $lastFetched =~ /^(I\d+ )?(\d+) (\d+)/; 880 $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2); 881 $len = 16 if $len > 16; 882 $key = substr(Digest::MD5::md5($key), 0, $len); 883 RC4Crypt($dataPt, $key); 1288 return unless $cryptInfo; 1289 my ($dataPt, $keyExt, $encrypt) = @_; 1290 # do not decrypt if the key extension object is undefined 1291 # (this doubles as a flag to disable decryption/encryption) 1292 return unless defined $keyExt; 1293 my $key = $$cryptInfo{_key}; 1294 # apply the necessary crypt key extension 1295 unless ($$cryptInfo{_aesv3}) { 1296 unless ($keyExt eq 'none') { 1297 # extend crypt key using object and generation number 1298 unless ($keyExt =~ /^(I\d+ )?(\d+) (\d+)/) { 1299 $$cryptInfo{_error} = 'Invalid object reference for encryption'; 1300 return; 1301 } 1302 $key .= substr(pack('V', $2), 0, 3) . substr(pack('V', $3), 0, 2); 1303 } 1304 # add AES-128 salt if necessary (this little gem is conveniently 1305 # omitted from the Adobe PDF 1.6 documentation, causing me to 1306 # waste 12 hours trying to figure out why this wasn't working -- 1307 # it appears in ISO32000 though, so I should have been using that) 1308 $key .= 'sAlT' if $$cryptInfo{_aesv2}; 1309 my $len = length($key); 1310 $key = Digest::MD5::md5($key); # get 16-byte MD5 digest 1311 $key = substr($key, 0, $len) if $len < 16; # trim if necessary 1312 } 1313 # perform the decryption/encryption 1314 if ($$cryptInfo{_aesv2} or $$cryptInfo{_aesv3}) { 1315 require Image::ExifTool::AES; 1316 my $err = Image::ExifTool::AES::Crypt($dataPt, $key, $encrypt); 1317 $err and $$cryptInfo{_error} = $err; 1318 } else { 1319 RC4Crypt($dataPt, $key); 1320 } 1321 } 1322 1323 #------------------------------------------------------------------------------ 1324 # Decrypt/Encrypt stream data 1325 # Inputs: 0) dictionary ref, 1) PDF object reference to use as crypt key extension 1326 sub CryptStream($$) 1327 { 1328 return unless $cryptStream; 1329 my ($dict, $keyExt) = @_; 1330 my $type = $$dict{Type} || ''; 1331 # XRef streams are not encrypted (ref 3, page 50), 1332 # and Metadata may or may not be encrypted 1333 if ($cryptInfo and $type ne '/XRef' and 1334 ($$cryptInfo{_meta} or $type ne '/Metadata')) 1335 { 1336 Crypt(\$$dict{_stream}, $keyExt, $$dict{_decrypted}); 1337 # toggle _decrypted flag 1338 $$dict{_decrypted} = ($$dict{_decrypted} ? undef : 1); 1339 } else { 1340 $$dict{_decrypted} = 0; # stream should never be encrypted 1341 } 1342 } 1343 1344 #------------------------------------------------------------------------------ 1345 # Generate a new PDF tag (based on its ID) and add it to a tag table 1346 # Inputs: 0) tag table ref, 1) tag ID 1347 # Returns: tag info ref 1348 sub NewPDFTag($$) 1349 { 1350 my ($tagTablePtr, $tag) = @_; 1351 my $name = $tag; 1352 # translate URL-like escape sequences 1353 $name =~ s/#([0-9a-f]{2})/chr(hex($1))/ige; 1354 $name =~ s/[^-\w]+/_/g; # translate invalid characters to an underline 1355 $name =~ s/(^|_)([a-z])/\U$2/g; # start words with upper case 1356 my $tagInfo = { Name => $name }; 1357 Image::ExifTool::AddTagToTable($tagTablePtr, $tag, $tagInfo); 1358 return $tagInfo; 884 1359 } 885 1360 … … 888 1363 # Inputs: 0) ExifTool object reference, 1) tag table reference 889 1364 # 2) dictionary reference, 3) cross-reference table reference, 890 # 4) nesting depth 891 sub ProcessDict($$$$;$ )1365 # 4) nesting depth, 5) dictionary capture type 1366 sub ProcessDict($$$$;$$) 892 1367 { 893 my ($exifTool, $tagTablePtr, $dict, $xref, $nesting ) = @_;1368 my ($exifTool, $tagTablePtr, $dict, $xref, $nesting, $type) = @_; 894 1369 my $verbose = $exifTool->Options('Verbose'); 895 my @tags = @{$$dict{tags}}; 1370 my $unknown = $$tagTablePtr{EXTRACT_UNKNOWN}; 1371 my $embedded = (defined $unknown and not $unknown and $exifTool->Options('ExtractEmbedded')); 1372 my @tags = @{$$dict{_tags}}; 1373 my ($next, %join); 896 1374 my $index = 0; 897 my $next;898 1375 899 1376 $nesting = ($nesting || 0) + 1; 900 1377 if ($nesting > 50) { 901 WarnOnce($exifTool,'Nesting too deep (directory ignored)');1378 $exifTool->WarnOnce('Nesting too deep (directory ignored)'); 902 1379 return; 1380 } 1381 # save entire dictionary for rewriting if specified 1382 if ($$exifTool{PDF_CAPTURE} and $$tagTablePtr{VARS} and 1383 $tagTablePtr->{VARS}->{CAPTURE}) 1384 { 1385 my $name; 1386 foreach $name (@{$tagTablePtr->{VARS}->{CAPTURE}}) { 1387 next if $exifTool->{PDF_CAPTURE}->{$name}; 1388 # make sure we load the right type if indicated 1389 next if $type and $type ne $name; 1390 $exifTool->{PDF_CAPTURE}->{$name} = $dict; 1391 last; 1392 } 903 1393 } 904 1394 # … … 915 1405 last; 916 1406 } 1407 my $val = $$dict{$tag}; 917 1408 if ($$tagTablePtr{$tag}) { 918 1409 $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag); 919 } 920 my $val = $$dict{$tag}; 1410 undef $tagInfo if $$tagInfo{NoProcess}; 1411 } elsif ($embedded and $tag =~ /^(.*?)(\d+)$/ and 1412 $$tagTablePtr{$1} and ref $val eq 'SCALAR' and not $fetched{$$val}) 1413 { 1414 my ($name, $num) = ($1, $2); 1415 $join{$name} = [] unless $join{$name}; 1416 $fetched{$$val} = 1; 1417 my $obj = FetchObject($exifTool, $$val, $xref, $tag); 1418 next unless ref $obj eq 'HASH' and $$obj{_stream}; 1419 # save all the stream data to join later 1420 $join{$name}->[$num] = $$obj{_stream}; 1421 } 921 1422 if ($verbose) { 922 1423 my ($val2, $extra); … … 932 1433 $fetched{$$val} = 1; 933 1434 $val = FetchObject($exifTool, $$val, $xref, $tag); 934 $val2 = '<err>' unless defined $val; 1435 unless (defined $val) { 1436 my $str; 1437 if (defined $lastOffset) { 1438 $val2 = '<free>'; 1439 $str = 'Object was freed'; 1440 } else { 1441 $val2 = '<err>'; 1442 $str = 'Error reading object'; 1443 } 1444 $exifTool->VPrint(0, "$$exifTool{INDENT}${str}:\n"); 1445 } 935 1446 } 936 1447 } elsif (ref $val eq 'HASH') { … … 962 1473 }, 963 1474 }; 964 } elsif (ref $val eq 'ARRAY') { 965 my @list = @$val; 966 foreach (@list) { 967 $_ = "ref($$_)" if ref $_ eq 'SCALAR'; 1475 } else { 1476 if (ref $val eq 'ARRAY') { 1477 my @list = @$val; 1478 foreach (@list) { 1479 $_ = "ref($$_)" if ref $_ eq 'SCALAR'; 1480 } 1481 $val2 = '[' . join(',',@list) . ']'; 968 1482 } 969 $val2 = '[' . join(',',@list) . ']'; 1483 # generate tag info if we will use it later 1484 if (not $tagInfo and defined $val and $unknown) { 1485 $tagInfo = NewPDFTag($tagTablePtr, $tag); 1486 } 970 1487 } 971 1488 $exifTool->VerboseInfo($tag, $tagInfo, … … 974 1491 Index => $index++, 975 1492 ); 1493 next unless defined $val; 976 1494 } 977 1495 unless ($tagInfo) { 978 1496 # add any tag found in Info directory to table 979 next unless $$tagTablePtr{EXTRACT_UNKNOWN}; 980 $tagInfo = { Name => $tag }; 981 Image::ExifTool::AddTagToTable($tagTablePtr, $tag, $tagInfo); 1497 next unless $unknown; 1498 $tagInfo = NewPDFTag($tagTablePtr, $tag); 982 1499 } 983 1500 unless ($$tagInfo{SubDirectory}) { 1501 # fetch object if necessary 1502 # (OS X 10.6 writes indirect objects in the Info dictionary!) 1503 if (ref $val eq 'SCALAR') { 1504 my $prevFetched = $lastFetched; 1505 # (note: fetching the same object multiple times is OK here) 1506 $val = FetchObject($exifTool, $$val, $xref, $tag); 1507 next unless defined $val; 1508 $val = ReadPDFValue($val); 1509 # set flag to re-encrypt if necessary if rewritten 1510 $$dict{_needCrypt}{$tag} = ($lastFetched ? 0 : 1) if $cryptString; 1511 $lastFetched = $prevFetched; # restore last fetched object reference 1512 } else { 1513 $val = ReadPDFValue($val); 1514 } 1515 my $format = $$tagInfo{Format} || $$tagInfo{Writable} || 'string'; 1516 $val = ConvertPDFDate($val) if $format eq 'date'; 984 1517 # convert from UTF-16 (big endian) to UTF-8 or Latin if necessary 985 1518 # unless this is binary data (hex-encoded strings would not have been converted) 986 if ($val =~ s/^\xfe\xff// and not $$tagInfo{Binary}) { 987 $val = $exifTool->Unicode2Charset($val, 'MM'); 988 } 989 if ($$tagInfo{List}) { 990 # separate tokens in comma or whitespace delimited lists 991 my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val; 992 foreach $val (@values) { 1519 if (ref $val) { 1520 next if ref $val ne 'ARRAY'; 1521 my $v; 1522 foreach $v (@$val) { 1523 $exifTool->FoundTag($tagInfo, $v); 1524 } 1525 } else { 1526 if (not $$tagInfo{Binary} and $val =~ /[\x18-\x1f\x80-\xff]/) { 1527 # text string is already in Unicode if it starts with "\xfe\xff", 1528 # otherwise we must first convert from PDFDocEncoding 1529 $val = $exifTool->Decode($val, ($val=~s/^\xfe\xff// ? 'UCS2' : 'PDFDoc'), 'MM'); 1530 } 1531 if ($$tagInfo{List}) { 1532 # separate tokens in comma or whitespace delimited lists 1533 my @values = ($val =~ /,/) ? split /,+\s*/, $val : split ' ', $val; 1534 foreach $val (@values) { 1535 $exifTool->FoundTag($tagInfo, $val); 1536 } 1537 } else { 1538 # a simple tag value 993 1539 $exifTool->FoundTag($tagInfo, $val); 994 1540 } 995 } else {996 # a tag value997 $exifTool->FoundTag($tagInfo, $val);998 1541 } 999 1542 next; … … 1009 1552 for (;;) { 1010 1553 my $subDict = shift @subDicts or last; 1554 # save last fetched object in case we fetch another one here 1555 my $prevFetched = $lastFetched; 1011 1556 if (ref $subDict eq 'SCALAR') { 1557 # only fetch once (other copies are obsolete) 1558 next if $fetched{$$subDict}; 1012 1559 # load dictionary via an indirect reference 1013 1560 $fetched{$$subDict} = 1; 1014 $subDict = FetchObject($exifTool, $$subDict, $xref, $tag); 1015 $subDict or $exifTool->Warn("Error reading $tag object"), next; 1561 my $obj = FetchObject($exifTool, $$subDict, $xref, $tag); 1562 unless (defined $obj) { 1563 unless (defined $lastOffset) { 1564 $exifTool->Warn("Error reading $tag object ($$subDict)"); 1565 } 1566 next; 1567 } 1568 $subDict = $obj; 1016 1569 } 1017 1570 if (ref $subDict eq 'ARRAY') { 1018 1571 # convert array of key/value pairs to a hash 1019 1572 next if @$subDict < 2; 1020 my %hash = ( tags => [] );1573 my %hash = ( _tags => [] ); 1021 1574 while (@$subDict >= 2) { 1022 1575 my $key = shift @$subDict; 1023 1576 $key =~ s/^\///; 1024 push @{$hash{ tags}}, $key;1577 push @{$hash{_tags}}, $key; 1025 1578 $hash{$key} = shift @$subDict; 1026 1579 } … … 1029 1582 next unless ref $subDict eq 'HASH'; 1030 1583 } 1584 # set flag to re-crypt all strings when rewriting if the dictionary 1585 # came from an encrypted stream 1586 $$subDict{_needCrypt}{'*'} = 1 unless $lastFetched; 1031 1587 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable}); 1032 1588 if (not $verbose) { 1033 ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting); 1589 my $proc = $$subTablePtr{PROCESS_PROC} || \&ProcessDict; 1590 &$proc($exifTool, $subTablePtr, $subDict, $xref, $nesting); 1034 1591 } elsif ($next) { 1035 1592 # handle 'Next' links at this level to avoid deep recursion … … 1038 1595 $tagTablePtr = $subTablePtr; 1039 1596 $dict = $subDict; 1040 @tags = @{$$subDict{ tags}};1597 @tags = @{$$subDict{_tags}}; 1041 1598 $exifTool->VerboseDir($tag, scalar(@tags)); 1042 1599 } else { … … 1045 1602 $exifTool->{INDENT} .= '| '; 1046 1603 $exifTool->{DIR_NAME} = $tag; 1047 $exifTool->VerboseDir($tag, scalar(@{$$subDict{ tags}}));1604 $exifTool->VerboseDir($tag, scalar(@{$$subDict{_tags}})); 1048 1605 ProcessDict($exifTool, $subTablePtr, $subDict, $xref, $nesting); 1049 1606 $exifTool->{INDENT} = $oldIndent; 1050 1607 $exifTool->{DIR_NAME} = $oldDir; 1051 1608 } 1052 } 1053 } 1054 # 1055 # extract information from stream object if it exists 1056 # 1057 return unless $$dict{stream}; 1058 my $tag = 'stream'; 1609 $lastFetched = $prevFetched; 1610 } 1611 } 1612 # 1613 # extract information from joined streams if necessary 1614 # 1615 1616 if (%join) { 1617 my ($tag, $i); 1618 foreach $tag (sort keys %join) { 1619 my $list = $join{$tag}; 1620 last unless defined $$list[1] and $$list[1] =~ /^%.*?([\x0d\x0a]*)/; 1621 my $buff = "%!PS-Adobe-3.0$1"; # add PS header with same line break 1622 for ($i=1; defined $$list[$i]; ++$i) { 1623 $buff .= $$list[$i]; 1624 undef $$list[$i]; # free memory 1625 } 1626 $exifTool->HandleTag($tagTablePtr, $tag, $buff); 1627 } 1628 } 1629 # 1630 # extract information from stream object if it exists (ie. Metadata stream) 1631 # 1632 return unless $$dict{_stream}; 1633 my $tag = '_stream'; 1059 1634 # add Subtype (if it exists) to stream name and remove leading '/' 1060 ($tag = "$$dict{Subtype}_$tag") =~ s/^\/// if $$dict{Subtype};1635 ($tag = $$dict{Subtype} . $tag) =~ s/^\/// if $$dict{Subtype}; 1061 1636 return unless $$tagTablePtr{$tag}; 1062 1637 my $tagInfo = $exifTool->GetTagInfo($tagTablePtr, $tag); … … 1065 1640 # extract information from stream 1066 1641 my %dirInfo = ( 1067 DataPt => \$$dict{stream},1068 DataLen => length $$dict{stream},1642 DataPt => \$$dict{_stream}, 1643 DataLen => length $$dict{_stream}, 1069 1644 DirStart => 0, 1070 DirLen => length $$dict{stream},1071 Parent => 'PDF',1645 DirLen => length $$dict{_stream}, 1646 Parent => 'PDF', 1072 1647 ); 1073 1648 my $subTablePtr = GetTagTable($tagInfo->{SubDirectory}->{TagTable}); … … 1086 1661 my $raf = $$dirInfo{RAF}; 1087 1662 my $verbose = $exifTool->Options('Verbose'); 1088 my ($ data, $encrypt, $id);1663 my ($buff, $encrypt, $id); 1089 1664 # 1090 1665 # validate PDF file 1091 1666 # 1092 $raf->Read($data, 4) == 4 or return 0; 1093 $data eq '%PDF' or return 0; 1667 # (linearization dictionary must be in the first 1024 bytes of the file) 1668 $raf->Read($buff, 1024) >= 8 or return 0; 1669 $buff =~ /^%PDF-(\d+\.\d+)/ or return 0; 1670 $pdfVer = $1; 1094 1671 $exifTool->SetFileType(); # set the FileType tag 1095 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main'); 1672 $exifTool->Warn("May not be able to read a PDF version $pdfVer file") if $pdfVer >= 2.0; 1673 # store PDFVersion tag 1674 my $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Root'); 1675 $exifTool->HandleTag($tagTablePtr, 'Version', $pdfVer); 1676 $tagTablePtr = GetTagTable('Image::ExifTool::PDF::Main'); 1677 # 1678 # check for a linearized PDF (only if reading) 1679 # 1680 my $capture = $$exifTool{PDF_CAPTURE}; 1681 unless ($capture) { 1682 my $lin = 'false'; 1683 if ($buff =~ /<</g) { 1684 $buff = substr($buff, pos($buff) - 2); 1685 my $dict = ExtractObject($exifTool, \$buff); 1686 if (ref $dict eq 'HASH' and $$dict{Linearized} and $$dict{L}) { 1687 if (not $$exifTool{VALUE}{FileSize}) { 1688 undef $lin; # can't determine if it is linearized 1689 } elsif ($$dict{L} == $$exifTool{VALUE}{FileSize}) { 1690 $lin = 'true'; 1691 } 1692 } 1693 } 1694 $exifTool->HandleTag($tagTablePtr, '_linearized', $lin) if $lin; 1695 } 1096 1696 # 1097 1697 # read the xref tables referenced from startxref at the end of the file … … 1103 1703 $len = 1024 if $len > 1024; 1104 1704 $raf->Seek(-$len, 2) or return -2; 1105 $raf->Read($data, $len) == $len or return -3; 1106 $data =~ /.*startxref(\x0d\x0a|\x0a\x0a|\x0d|\x0a)(\d+)\1%%EOF/s or return -4; 1107 $/ = $1; # set input record separator 1108 push @xrefOffsets, $2; 1109 my (%xref, @mainDicts, %loaded); 1705 $raf->Read($buff, $len) == $len or return -3; 1706 # find the last xref table in the file (may be multiple %%EOF marks) 1707 $buff =~ /.*startxref *(\x0d\x0a|\x0d|\x0a)\s*?(\d+)\s+%%EOF/s or return -4; 1708 local $/ = $1; # set input record separator 1709 push @xrefOffsets, $2, 'Main'; 1710 my (%xref, @mainDicts, %loaded, $mainFree); 1711 # initialize variables to capture when rewriting 1712 if ($capture) { 1713 $capture->{startxref} = $2; 1714 $capture->{xref} = \%xref; 1715 $capture->{newline} = $/; 1716 $capture->{mainFree} = $mainFree = { }; 1717 } 1110 1718 while (@xrefOffsets) { 1111 1719 my $offset = shift @xrefOffsets; 1720 my $type = shift @xrefOffsets; 1112 1721 next if $loaded{$offset}; # avoid infinite recursion 1113 1722 unless ($raf->Seek($offset, 0)) { … … 1116 1725 next; 1117 1726 } 1118 unless ($raf->ReadLine($data)) { 1727 # Note: care must be taken because ReadLine may read more than we want if 1728 # the newline sequence for this table is different than the rest of the file 1729 unless ($raf->ReadLine($buff)) { 1119 1730 %loaded or return -6; 1120 1731 $exifTool->Warn('Bad offset for secondary xref table'); … … 1122 1733 } 1123 1734 my $loadXRefStream; 1124 if ($ data eq "xref$/") {1735 if ($buff =~ s/^xref\s+//s) { 1125 1736 # load xref table 1126 1737 for (;;) { 1127 $raf->ReadLine($data) or return -6; 1128 last if $data eq "trailer$/"; 1129 my ($start, $num) = $data =~ /(\d+)\s+(\d+)/; 1130 $num or return -4; 1738 # read another line if necessary (skipping blank lines) 1739 $raf->ReadLine($buff) or return -6 until $buff =~ /\S/; 1740 last if $buff =~ s/^\s*trailer\s+//s; 1741 $buff =~ s/\s*(\d+)\s+(\d+)\s+//s or return -4; 1742 my ($start, $num) = ($1, $2); 1743 $raf->Seek(-length($buff), 1) or return -4; 1131 1744 my $i; 1132 1745 for ($i=0; $i<$num; ++$i) { 1133 $raf->Read($data, 20) == 20 or return -6; 1134 $data =~ /^(\d{10}) (\d{5}) (f|n)/ or return -4; 1135 next if $3 eq 'f'; # ignore free entries 1136 # save the object offset keyed by its reference 1137 my $ref = ($start + $i) . ' ' . int($2) . ' R'; 1138 $xref{$ref} = int($1); 1746 $raf->Read($buff, 20) == 20 or return -6; 1747 $buff =~ /^\s*(\d{10}) (\d{5}) (f|n)/s or return -4; 1748 my $num = $start + $i; 1749 # save offset for newest copy of all objects 1750 # (or next object number for free objects) 1751 unless (defined $xref{$num}) { 1752 my ($offset, $gen) = (int($1), int($2)); 1753 $xref{$num} = $offset; 1754 if ($3 eq 'f') { 1755 # save free objects in last xref table for rewriting 1756 $$mainFree{$num} = [ $offset, $gen, 'f' ] if $mainFree; 1757 next; 1758 } 1759 # also save offset keyed by object reference string 1760 $xref{"$num $gen R"} = $offset; 1761 } 1139 1762 } 1140 } 1141 %xref or return -4; 1142 $data = ''; 1143 } elsif ($data =~ s/^(\d+)\s+(\d+)\s+obj//) { 1763 # (I have a sample from Adobe which has an empty xref table) 1764 # %xref or return -4; # xref table may not be empty 1765 $buff = ''; 1766 } 1767 undef $mainFree; # only do this for the last xref table 1768 } elsif ($buff =~ s/^(\d+)\s+(\d+)\s+obj//s) { 1144 1769 # this is a PDF-1.5 cross-reference stream dictionary 1145 1770 $loadXRefStream = 1; … … 1149 1774 next; 1150 1775 } 1151 my $mainDict = ExtractObject($exifTool, \$ data, $raf, \%xref);1152 unless ( $mainDict) {1776 my $mainDict = ExtractObject($exifTool, \$buff, $raf, \%xref); 1777 unless (ref $mainDict eq 'HASH') { 1153 1778 %loaded or return -8; 1154 1779 $exifTool->Warn('Error loading secondary dictionary'); … … 1157 1782 if ($loadXRefStream) { 1158 1783 # decode and save our XRef stream from PDF-1.5 file 1159 # (parse it later as required to avoid wasting time) 1784 # (but parse it later as required to save time) 1785 # Note: this technique can potentially result in an old object 1786 # being used if the file was incrementally updated and an older 1787 # object from an xref table was replaced by a newer object in an 1788 # xref stream. But doing so isn't a good idea (if allowed at all) 1789 # because a PDF 1.4 consumer would also make this same mistake. 1160 1790 if ($$mainDict{Type} eq '/XRef' and $$mainDict{W} and 1161 1791 @{$$mainDict{W}} > 2 and $$mainDict{Size} and … … 1164 1794 # create Index entry if it doesn't exist 1165 1795 $$mainDict{Index} or $$mainDict{Index} = [ 0, $$mainDict{Size} ]; 1166 # create ' entry_size' entry for internal use1796 # create '_entry_size' entry for internal use 1167 1797 my $w = $$mainDict{W}; 1168 1798 my $size = 0; 1169 1799 foreach (@$w) { $size += $_; } 1170 $$mainDict{ entry_size} = $size;1800 $$mainDict{_entry_size} = $size; 1171 1801 # save this stream dictionary to use later if required 1172 1802 $xref{dicts} = [] unless $xref{dicts}; … … 1179 1809 $loaded{$offset} = 1; 1180 1810 # load XRef stream in hybrid file if it exists 1181 push @xrefOffsets, $$mainDict{XRefStm} if $$mainDict{XRefStm};1811 push @xrefOffsets, $$mainDict{XRefStm}, 'XRefStm' if $$mainDict{XRefStm}; 1182 1812 $encrypt = $$mainDict{Encrypt} if $$mainDict{Encrypt}; 1183 1813 if ($$mainDict{ID} and ref $$mainDict{ID} eq 'ARRAY') { 1184 $id = $mainDict->{ID}->[0];1185 } 1186 push @mainDicts, $mainDict ;1814 $id = ReadPDFValue($mainDict->{ID}->[0]); 1815 } 1816 push @mainDicts, $mainDict, $type; 1187 1817 # load previous xref table if it exists 1188 push @xrefOffsets, $$mainDict{Prev} if $$mainDict{Prev};1818 push @xrefOffsets, $$mainDict{Prev}, 'Prev' if $$mainDict{Prev}; 1189 1819 } 1190 1820 # … … 1197 1827 # generate Encryption tag information 1198 1828 my $err = DecryptInit($exifTool, $encrypt, $id); 1199 $err and $exifTool->Warn($err), return -1; 1829 if ($err) { 1830 $exifTool->Warn($err); 1831 $$capture{Error} = $err if $capture; 1832 return -1; 1833 } 1200 1834 } 1201 1835 # 1202 1836 # extract the information beginning with each of the main dictionaries 1203 1837 # 1204 my $dict; 1205 foreach $dict (@mainDicts) { 1838 while (@mainDicts) { 1839 my $dict = shift @mainDicts; 1840 my $type = shift @mainDicts; 1206 1841 if ($verbose) { 1207 my $n = scalar(@{$$dict{ tags}});1842 my $n = scalar(@{$$dict{_tags}}); 1208 1843 $exifTool->VPrint(0, "PDF dictionary with $n entries:\n"); 1209 1844 } 1210 ProcessDict($exifTool, $tagTablePtr, $dict, \%xref); 1845 ProcessDict($exifTool, $tagTablePtr, $dict, \%xref, 0, $type); 1846 } 1847 # handle any decryption errors 1848 if ($encrypt) { 1849 my $err = $$encrypt{_error}; 1850 if ($err) { 1851 $exifTool->Warn($err); 1852 $$capture{Error} = $err if $capture; 1853 return -1; 1854 } 1211 1855 } 1212 1856 return 1; … … 1235 1879 my ($exifTool, $dirInfo) = @_; 1236 1880 1237 my $oldsep = $/; 1881 undef $cryptInfo; # (must not delete after returning so writer can use it) 1882 undef $cryptStream; 1883 undef $cryptString; 1238 1884 my $result = ReadPDF($exifTool, $dirInfo); 1239 $/ = $oldsep; # restore input record separator in case it was changed1240 1885 if ($result < 0) { 1241 1886 $exifTool->Warn($pdfWarning{$result}) if $pdfWarning{$result}; … … 1243 1888 } 1244 1889 # clean up and return 1245 undef %warnedOnce;1246 1890 undef %streamObjs; 1247 1891 undef %fetched; 1248 undef $cryptInfo;1249 1892 return $result; 1250 1893 } … … 1267 1910 This code reads meta information from PDF (Adobe Portable Document Format) 1268 1911 files. It supports object streams introduced in PDF-1.5 but only with a 1269 limited set of Filter and Predictor algorithms, and it decodes encrypted 1270 information but only with a limited number of algorithms. 1912 limited set of Filter and Predictor algorithms, however all standard 1913 encryption methods through PDF-1.7 extension level 3 are supported, 1914 including AESV2 (AES-128) and AESV3 (AES-256). 1271 1915 1272 1916 =head1 AUTHOR 1273 1917 1274 Copyright 2003-20 07, Phil Harvey (phil at owl.phy.queensu.ca)1918 Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca) 1275 1919 1276 1920 This library is free software; you can redistribute it and/or modify it … … 1283 1927 =item L<http://partners.adobe.com/public/developer/pdf/index_reference.html> 1284 1928 1285 =item L<http://www.cr0.net:8040/code/crypto/rc4/> 1929 =item L<Crypt::RC4|Crypt::RC4> 1930 1931 =item L<http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf> 1932 1933 =item L<http://www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000.pdf> 1934 1935 =item L<http://tools.ietf.org/search/rfc3454> 1936 1937 =item L<http://www.armware.dk/RFC/rfc/rfc4013.html> 1286 1938 1287 1939 =back
Note:
See TracChangeset
for help on using the changeset viewer.