#
# PDF::Parse.pm, version 1.11 February 2000 antro
#
# Copyright (c) 1998 - 2000 Antonio Rosella Italy antro@tiscalinet.it, Johannes Blach dw235@yahoo.com
#
# Free usage under the same Perl Licence condition.
#

package PDF::Parse;

$PDF::Parse::VERSION = "1.11";

=pod

=head1 NAME

PDF::Parse - Library with parsing functions for PDF library

=head1 SYNOPSIS

  use PDF::Parse;

  $pdf->TargetFile($filename);
  $pdf->LoadPageInfo;

  $version = $pdf->Version;
  $bool = $pdf->IsaPDF;
  $bool = $pdf->IscryptPDF;

  $info = $pdf->GetInfo ($key);
  $pagenum = $pdf->Pages;
  @size = $pdf->PageSize ($page);
  # or
  @size = $pdf->PageSize;
  $rotation = $pdf->PageRotation ($page);
  # or
  $rotation = $pdf->PageRotation;

=head1 DESCRIPTION

The main purpose of the PDF::Parse library is to provide parsing functions
for the more general PDF library.

=head1 Methods

The available methods are:

=cut

require 5.005;
#require PDF::Core;
require Core;

use strict;
use Carp;
use Exporter ();

use vars qw(@ISA @EXPORT_OK);

@ISA = qw(Exporter PDF::Core);

@EXPORT_OK = qw( LoadPageInfo GetInfo TargetFile Pages PageSize PageRotation IsaPDF Version IscryptPDF );

#################################################################
#
# ReadCrossReference_pass1 ( fd, offset, self )
#
# Reads the cross-reference section found at byte position "offset" of the
# file handle "fd" and records every entry in the descriptor "self":
#
#   $self->{Objects}[n]   integer value of the first 10 characters of the
#                         entry for object n (negated for freed entries)
#   $self->{Gen_Num}[n]   characters 11-15 of the entry
#
# It then looks at the trailer that follows the section and, if the trailer
# has a numeric "/Prev" entry, calls itself for that older section.
#
# Returns the number of entries counted in this section plus all sections
# reached through "/Prev".  Dies if the first line read does not contain
# "xref".
#
# The routine works on the global $_ and does not localise it.
#
sub ReadCrossReference_pass1 {
    my $fd = shift;
    my $offset = shift;
    my $self = shift;

    my $initial_number;           # first object number of the current subsection
    my $obj_counter = 0;          # entries seen in the current subsection
    my $global_obj_counter = 0;   # entries seen in all subsections so far
    my $buf;

    binmode $fd;

    $_ = PDF::Core::PDFGetline ($fd,\$offset);

    die "Can't read cross-reference section, according to trailer\n" if ! /xref\r?\n?/ ;

    while () {
        $_ = PDF::Core::PDFGetline ($fd,\$offset);
        s/^\n//;
        s/^\r//;

        last if (m/\btrailer\b/) ;

        #
        # An Object
        #
        /^\d+\s+\d+\s+n\r?\n?/ && do {
            my $buf = $_;
            my $ind = $initial_number + ($obj_counter++);
            # An entry that is already defined is kept.  Sections reached
            # through "/Prev" are read after the section that points to them,
            # so the entry read first wins.
            ( not defined $self->{Objects}[$ind] ) && do {
                $self->{Objects}[$ind] = int substr($buf,0,10);
                $self->{Gen_Num}[$ind] = int substr($buf,11,5);
            };
            $_ = $buf;
            s/^.{18}//;
            next ;
        };

        #
        # A Freed Object
        #
        /^\d+\s+\d+\s+f\r?\n?/ && do {
            my $buf = $_;
            my $objects_generation_nr = substr($buf,11,5);
            my $Num = substr($buf,0,10);
            my $ind = $initial_number + ($obj_counter++);
            # $ind = $ind . "_" . $objects_generation_nr;
            # Unlike in-use entries, freed entries are stored unconditionally.
            $self->{Objects}[$ind] = - $Num;
            $self->{Gen_Num}[$ind] = $objects_generation_nr;
            $_ = $buf;
            s/^.{18}//;
            next ;
        };

        #
        # A subsection
        #
        /^\d+\s+\d+\r?\n?/ && do {
            my $buf = $_ ;
            $initial_number = $buf;
            $initial_number =~ s/^(\d+)\s+\d+\r?\n?.*/$1/;
            $global_obj_counter += $obj_counter;
            $obj_counter = 0;
            next ;
        };
    }
    $global_obj_counter += $obj_counter;

    #
    # Now the trailer for updates
    #

    #
    # Skip to start of dictionary.
    #
    # NOTE(review): in the copy of this file under review, the text between
    # the "<" of the m/<</ test and the "$self->" of the "/Root" check below
    # is missing (it looks as if it was stripped like an HTML tag).  The loop
    # body is reconstructed from the line-reading pattern used earlier in
    # this routine.  The statement(s) that parse the trailer dictionary into
    # %trailer could NOT be recovered and must be restored from the upstream
    # PDF::Parse 1.11 sources.
    #
    until (m/<</) {
        $_ = PDF::Core::PDFGetline ($fd,\$offset);
    }

    # FIXME(review): fill %trailer from the trailer dictionary here (lost code).
    my %trailer;

    # Fail loudly instead of continuing with an empty trailer.
    die "Can't read trailer dictionary\n" unless %trailer;

    if ($self->{"Trailer"}{"/Root"} eq "") {
        $self->{"Trailer"} = \%trailer;

        #
        # This code is here for backward compatibility only. If the content
        # of the root trailer is needed, use $self->{"Trailer"} instead.
        #
        $self->{"Cross_Reference_Size"} = $trailer{"/Size"};
        $self->{"Root_Object"} = $trailer{"/Root"};
        $self->{"Crypt_Object"} = $trailer{"/Encrypt"};
    }

    if ($trailer{"/Prev"} =~ m/^\d+$/) {
        $self->{"Updated"} = 1;
        # Remember the file position so that it can be restored after the
        # older cross-reference section has been read.
        my $old_seek = tell $fd;
        $global_obj_counter += ReadCrossReference_pass1 ($fd, $trailer{"/Prev"}, $self );
        seek $fd, $old_seek, 0;
    }
    return $global_obj_counter;
}

#################################################################
#
# LoadPageSubtree ( ref [, inheritance ] )
#
# Walks the page tree below the object "ref" and appends one hash per page
# to @{$self->{"Page"}}.  "inheritance" is a key/value list with the
# attributes collected from the ancestors of "ref"; it is copied, updated
# with the attributes of this node and passed on to the kids.
#
# Each page hash holds the inherited attributes, "Resource_Object" (the
# "/Resources" value of the nearest node that has one) and "Page_Object"
# (the reference of the page itself).
#
# Objects are fetched with $self->GetObject, which is not defined in this
# file (presumably inherited from PDF::Core -- confirm).
#
sub LoadPageSubtree (\*$;%) {
    my $self = shift;
    my $ref = shift;
    my %inheritance = @_ ;

    my $data = $self->GetObject ($ref);

    # Check which attributes are inherited. Adobe did not add any new
    # inherited attributes in version 1.2 or later, so this list is
    # complete.

    # Do simple values.
    foreach my $key ("/Rotate", "/Dur", "/Hid", "/Trans", "/MediaBox", "/CropBox") {
        next unless defined ($data->{$key});

        # Check if it is an indirect reference
        if ($data->{$key} =~ m/^\d+ \d+ R$/) {
            # Follow the chain of references until a real value is found.
            my $dataref = $data->{$key};
            do {
                $dataref = $self->GetObject ($dataref);
            } while ($dataref =~ m/^\d+ \d+ R$/);

            # The resolved object ($dataref) is what has to be inspected
            # here; $data is the hash of the page node and is never an array.
            if (UNIVERSAL::isa ($dataref, "ARRAY")) {
                my @resolved;
                foreach my $element (@{$dataref}) {
                    # Work on a copy so that the fetched object itself is
                    # not modified through the loop alias.
                    my $value = $element;
                    # Each element may be a reference.
                    while ($value =~ m/^\d+ \d+ R$/) {
                        $value = $self->GetObject ($value);
                    }
                    push @resolved, $value;
                }
                $inheritance{$key} = \@resolved;
            } else {
                $inheritance{$key} = $dataref;
            }
        } else {
            # not an indirect reference
            $inheritance{$key} = $data->{$key};
        }
    }

    # If this object contains resources, replace information in inheritance
    $inheritance{"Resource_Object"} = $data->{"/Resources"}
        if (defined ($data->{"/Resources"}));

    if ($data->{"/Type"} eq "/Pages") {
        # It's just an intermediate Node
        foreach my $kid (@{$data->{"/Kids"}}) {
            $self->LoadPageSubtree ($kid, %inheritance);
        }
    } elsif ($data->{"/Type"} eq "/Page") {
        # We have a real page!
        $inheritance{"Page_Object"} = $ref;
        push @{$self->{"Page"}}, +{ %inheritance };
    } else {
        # Strange stuff. Complain and discard.
        carp "While loading pages got object of type '", $data->{"/Type"}, "'";
    }
}

#################################################################

=pod

=head2 TargetFile ( filename )

This method links the filename to the pdf descriptor and parses all kind
of header information.

=cut
=pod

B<TargetFile> returns 1 when the file has been linked and 0 when the file
does not start with the C<%PDF> signature; in the latter case the descriptor
stays unlinked. It croaks when no file name is given, when the descriptor is
already linked to a file, when the file cannot be opened, or when no
C<startxref> entry is found at the end of the file.

=cut

sub TargetFile {
    my $self = shift;
    my $file = shift;

    croak "Already linked to the file ",$self->{File_Name},"\n"
        if $self->{File_Name} ;

    croak "I need a file name (!)" unless $file;

    # Every descriptor gets a file handle of its own.  A shared bareword
    # handle would be re-opened (and the previous file closed) as soon as a
    # second descriptor is linked.  The "r" mode also keeps special
    # characters in the file name from being interpreted by open().
    require IO::File;
    my $fh = IO::File->new($file, "r")
        or croak "can't open $file: $!";
    binmode $fh;

    # The file has to start with "%PDF".
    my $buf;
    read($fh, $buf, 4);
    if ( $buf ne "%PDF" ) {
        print "File $file is not PDF compliant !\n" if $PDF::Verbose ;
        # Nothing has been stored in the descriptor yet, so it can still be
        # linked to another file.
        close $fh;
        return 0 ;
    }

    # The next four bytes hold "-" and the version, e.g. "-1.2".
    read($fh, $buf, 4);
    $buf =~ s/-//;
    my $version = $buf;

    # The offset of the last cross-reference section is looked up in the
    # last 50 bytes of the file.  Files shorter than that are read from the
    # start.
    seek($fh, -50, 2) or seek($fh, 0, 0);
    my $tail;
    read($fh, $tail, 50);
    my ($offset) = $tail =~ m/startxref\s*(\d+)\s*%%EOF/;
    croak "can't find startxref in $file" unless defined $offset;

    $self->{File_Name} = $file ;
    $self->{File_Handler} = $fh;
    $self->{Header} = $version;
    $self->{"Last_XRef_Offset"} = $offset;

    ReadCrossReference_pass1 ($fh, $offset, $self);

    $self->{"Info"} = $self->GetObject ($self->{"Trailer"}{"/Info"});
    $self->{"Catalog"} = $self->GetObject ($self->{"Trailer"}{"/Root"});
    $self->{"PageTree"} = $self->GetObject ($self->{"Catalog"}{"/Pages"});

    return 1;
}

#################################################################

=pod

=head2 LoadPageInfo

This function loads the information for all pages. This process can take
some time for big PDF-files.

=cut

sub LoadPageInfo (\*) {
    my $self = shift;

    # Reset Page Array
    $#{$self->{"Page"}} = -1;

    # Recurse, starting at the root of the page tree
    $self->LoadPageSubtree ($self->{"Catalog"}{"/Pages"});
}

#################################################################

=pod

=head2 Version

Returns the PDF version used for writing the object file.

=cut

sub Version {
    return ($_[0]->{Header});
}

#################################################################

=pod

=head2 IsaPDF

Returns true, if the file could be parsed and is a PDF-file.

=cut

sub IsaPDF {
    # The header is only stored for files that start with "%PDF".  Test
    # whether it is defined instead of comparing it numerically with undef,
    # which treats every header that numifies to 0 as missing.
    return (defined $_[0]->{Header});
}

#################################################################

=pod

=head2 IscryptPDF

Returns true if the PDF contains a crypt object. This indicates that the
data of the PDF-File is encrypted. In this case, not all functions work as
expected.

=cut
=pod

B<IscryptPDF> reports whether a C<Crypt_Object> entry is defined for the
pdf descriptor.

=cut

sub IscryptPDF {
    # Test for definedness instead of comparing numerically with undef,
    # which only works for values that happen to numify to non-zero.
    return (defined $_[0]->{Crypt_Object});
}

#################################################################

=pod

=head2 GetInfo ( key )

Returns the various information contained in the info section of a PDF
file (if present). A PDF file can have:

  a title             ==> GetInfo ("Title")
  a subject           ==> GetInfo ("Subject")
  an author           ==> GetInfo ("Author")
  a creation date     ==> GetInfo ("CreationDate")
  a creator           ==> GetInfo ("Creator")
  a producer          ==> GetInfo ("Producer")
  a modification date ==> GetInfo ("ModDate")
  some keywords       ==> GetInfo ("Keywords")

=cut

sub GetInfo (\*$) {
    my $self = shift;
    my $type = shift;

    # The keys of the info dictionary carry a leading "/".
    return PDF::Core::UnQuoteString ($self->{"Info"}{"/" . $type})
}

#################################################################

=pod

=head2 Pages

Returns the number of pages of the PDF-file.

=cut

sub Pages {
    my $self = shift;

    return $self->{"PageTree"}{"/Count"};
}

#################################################################

=pod

=head2 PageSize ( [ page ] )

Returns the size of a page in the PDF-file. If no parameter is given, the
default size of the root page will be returned. This value may be
overridden for any page.

If the size of an individual page is requested and the page data is not
already loaded, the method B<LoadPageInfo> will be executed. This may take
some time for large PDF-files. The size of the root page is always
available and will never execute B<LoadPageInfo>.

Returns undef if the page number is larger than the number of pages or if
no size is stored for the requested page.

=cut

sub PageSize (;$) {
    my ($self, $page) = @_;

    my $box;
    if ($page > 0) {
        return undef if ($page > $self->{"PageTree"}{"/Count"});

        # Load the page data on first use.
        $self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

        # Pages are counted from 1, the array from 0.
        $box = $self->{"Page"}[$page - 1]{"/MediaBox"};
    } else {
        $box = $self->{"PageTree"}{"/MediaBox"};
    }

    return @{$box} if (defined $box);
    return undef;
}

#################################################################

=pod

=head2 PageRotation ( [ page ] )

Returns the rotation of a page in the PDF-file.

=cut
=pod

If no parameter is given, the default rotation of the root page will be
returned. This value may be overridden for any page.

If the rotation of an individual page is requested and the page data is
not already loaded, the method B<LoadPageInfo> will be executed. This may
take some time for large PDF-files. The rotation of the root page is always
available and will never execute B<LoadPageInfo>.

=cut

sub PageRotation (;$) {
    my ($self, $page) = @_;

    my $rotate;
    if ($page > 0) {
        return undef if ($page > $self->{"PageTree"}{"/Count"});

        # Load the page data on first use.
        $self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

        # Pages are counted from 1, the array from 0.
        $rotate = $self->{"Page"}[$page - 1]{"/Rotate"};
    } else {
        $rotate = $self->{"PageTree"}{"/Rotate"};
    }

    # Adding 0 turns a missing value into the number 0.
    my $degrees = 0 + $rotate;

    print "Rotation ", $degrees if ($PDF::Verbose);

    return $degrees;
}

#################################################################

1;

__END__

=head1 Variables

The only available variable is :

=over

=item B<$PDF::Parse::VERSION>

Contains the version of the library installed

=back

=head1 Copyright

  Copyright (c) 1998 - 2000 Antonio Rosella Italy antro@tiscalinet.it, Johannes Blach dw235@yahoo.com

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 Availability

The latest version of this library is likely to be available from:

http://www.geocities.com/CapeCanaveral/Hangar/4794/

=cut