1 | #
|
---|
2 | # PDF::Parse.pm, version 1.11 February 2000 antro
|
---|
3 | #
|
---|
4 | # Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
|
---|
5 | #
|
---|
6 | # Free usage under the same Perl Licence condition.
|
---|
7 | #
|
---|
8 |
|
---|
9 | package PDF::Parse;
|
---|
10 |
|
---|
11 | $PDF::Parse::VERSION = "1.11";
|
---|
12 |
|
---|
13 | =pod
|
---|
14 |
|
---|
15 | =head1 NAME
|
---|
16 |
|
---|
17 | PDF::Parse - Library with parsing functions for PDF library
|
---|
18 |
|
---|
19 | =head1 SYNOPSIS
|
---|
20 |
|
---|
21 | use PDF::Parse;
|
---|
22 |
|
---|
23 | $pdf->TargetFile($filename);
|
---|
24 | $pdf->LoadPageInfo;
|
---|
25 |
|
---|
26 | $version = $pdf->Version;
|
---|
27 | $bool = $pdf->IsaPDF;
|
---|
28 | $bool = $pdf->IscryptPDF;
|
---|
29 |
|
---|
30 | $info = $pdf->GetInfo ($key);
|
---|
31 | $pagenum = $pdf->Pages;
|
---|
32 |
|
---|
33 | @size = $pdf->PageSize ($page);
|
---|
34 | # or
|
---|
35 | @size = $pdf->PageSize;
|
---|
36 |
|
---|
37 | $rotation = $pdf->PageRotation ($page);
|
---|
38 | # or
|
---|
39 | $rotation = $pdf->PageRotation;
|
---|
40 |
|
---|
41 | =head1 DESCRIPTION
|
---|
42 |
|
---|
43 | The main purpose of the PDF::Parse library is to provide parsing functions
|
---|
44 | for the more general PDF library.
|
---|
45 |
|
---|
46 | =head1 Methods
|
---|
47 |
|
---|
48 | The available methods are:
|
---|
49 |
|
---|
50 | =cut
|
---|
51 |
|
---|
52 | require 5.005;
|
---|
53 | #require PDF::Core;
|
---|
54 | require Core;
|
---|
55 |
|
---|
56 | use strict;
|
---|
57 | use Carp;
|
---|
58 | use Exporter ();
|
---|
59 |
|
---|
# Package variables consulted by Perl's method resolution (@ISA) and
# by Exporter (@EXPORT_OK).
use vars qw(@ISA @EXPORT_OK);

# Inherit from Exporter and from PDF::Core.
@ISA = ('Exporter', 'PDF::Core');

# Names that a caller may import on request.
@EXPORT_OK = qw(
    LoadPageInfo
    GetInfo
    TargetFile
    Pages
    PageSize
    PageRotation
    IsaPDF
    Version
    IscryptPDF
);
67 |
|
---|
68 | #################################################################
|
---|
#
# ReadCrossReference_pass1 ( fd, offset, self )
#
# Reads the cross-reference section that starts at "offset" in the
# file handle "fd", stores its entries in "self" and then follows the
# "/Prev" entry of the trailer (if any) by calling itself.
#
# Data stored in $self:
#   {Objects}[n]   first ten characters of the entry for object n as an
#                  integer; negated for entries marked free ("f")
#   {Gen_Num}[n]   characters 11-15 of the entry (generation number)
#   {Trailer}      the first trailer dictionary read, plus the legacy
#                  copies {Cross_Reference_Size}, {Root_Object} and
#                  {Crypt_Object}
#   {Updated}      set to 1 when a trailer carries a numeric /Prev
#
# Sections are visited starting with the one passed in and then along
# the /Prev chain, so a slot that is already defined is never
# overwritten by a section read later.
#
# Returns the number of entries read in this section and in all
# sections reached through /Prev.
#
# Dies if the first line read at "offset" does not contain "xref".
#
# NOTE(review): both read loops end only when a line containing
# "trailer" (respectively "<<") is returned by PDF::Core::PDFGetline;
# what that helper returns at end of file is not visible here, so a
# truncated file may not terminate -- confirm against PDF::Core.
#
sub ReadCrossReference_pass1 {
    my $fd     = shift;
    my $offset = shift;
    my $self   = shift;

    my $initial_number     = 0;    # first object number of the current subsection
    my $obj_counter        = 0;    # entries seen in the current subsection
    my $global_obj_counter = 0;    # entries seen in finished subsections

    # A lexical line buffer is used instead of $_ so that the caller's
    # $_ (possibly aliased to the caller's own data) is left untouched.
    my $line;

    binmode $fd;

    $line = PDF::Core::PDFGetline ($fd,\$offset);

    die "Can't read cross-reference section, according to trailer\n"
        if $line !~ /xref\r?\n?/;

    while (1) {
        $line = PDF::Core::PDFGetline ($fd,\$offset);
        $line =~ s/^\n//;
        $line =~ s/^\r//;
        last if $line =~ m/\btrailer\b/;

        #
        # An Object
        #
        if ($line =~ /^\d+\s+\d+\s+n\r?\n?/) {
            my $ind = $initial_number + ($obj_counter++);
            if (not defined $self->{Objects}[$ind]) {
                $self->{Objects}[$ind] = int substr($line,0,10);
                $self->{Gen_Num}[$ind] = int substr($line,11,5);
            }
            next;
        }

        #
        # A Freed Object
        #
        if ($line =~ /^\d+\s+\d+\s+f\r?\n?/) {
            my $objects_generation_nr = substr($line,11,5);
            my $Num = substr($line,0,10);
            my $ind = $initial_number + ($obj_counter++);
            # Same rule as for in-use entries: an entry recorded from a
            # section read earlier must not be replaced by this one.
            if (not defined $self->{Objects}[$ind]) {
                $self->{Objects}[$ind] = - $Num;
                $self->{Gen_Num}[$ind] = $objects_generation_nr;
            }
            next;
        }

        #
        # A subsection: "first_object_number entry_count"
        #
        if ($line =~ /^(\d+)\s+\d+\r?\n?/) {
            $initial_number = $1;
            $global_obj_counter += $obj_counter;
            $obj_counter = 0;
            next;
        }
    }

    $global_obj_counter += $obj_counter;

    #
    # Now the trailer for updates.  Skip to the start of the dictionary;
    # the line holding "trailer" may already contain "<<".
    #
    until ($line =~ m/<</) {
        $line = PDF::Core::PDFGetline ($fd,\$offset);
    }

    #
    # Read the dictionary
    #
    my %trailer = ( PDF::Core::PDFGetPrimitive ($fd, $offset) );

    if ($self->{"Trailer"}{"/Root"} eq "") {
        $self->{"Trailer"} = \%trailer;
        #
        # This code is here for backward compatibility only. If the content
        # of the root trailer is needed, use $self->{"Trailer"} instead.
        #
        $self->{"Cross_Reference_Size"} = $trailer{"/Size"};
        $self->{"Root_Object"}          = $trailer{"/Root"};
        $self->{"Crypt_Object"}         = $trailer{"/Encrypt"};
    }

    if ($trailer{"/Prev"} =~ m/^\d+$/) {
        $self->{"Updated"} = 1;
        # The recursive call moves the file position; restore it afterwards.
        my $old_seek = tell $fd;
        $global_obj_counter += ReadCrossReference_pass1 ($fd,
                                                         $trailer{"/Prev"}, $self );
        seek $fd, $old_seek, 0;
    }

    return $global_obj_counter;
}
171 |
|
---|
172 | #################################################################
|
---|
#
# LoadPageSubtree ( ref [, inheritance ] )
#
# Walks the page tree below the object reference "ref".  "inheritance"
# is a key/value list of attributes collected from the ancestors; it
# is copied, updated with the attributes found in this node and passed
# on to every kid.  For each node of type /Page a hash holding the
# accumulated attributes, the key "Resource_Object" (if any node on the
# path had /Resources) and the key "Page_Object" (the page's own
# reference) is appended to @{$self->{"Page"}}.
#
# Nodes whose /Type is neither /Pages nor /Page are reported with carp
# and ignored.
#
sub LoadPageSubtree (\*$;%)
{
    my $self = shift;
    my $ref = shift;
    my %inheritance = @_ ;

    my $data = $self->GetObject ($ref);

    # Check which attributes are inherited. Adobe did not add any new
    # inherited attributes in version 1.2 or later, so this list is
    # complete.
    foreach my $key ("/Rotate", "/Dur", "/Hid", "/Trans",
                     "/MediaBox", "/CropBox") {

        next unless defined ($data->{$key});

        # A directly stored value is taken over as it is.
        if ($data->{$key} !~ m/^\d+ \d+ R$/) {
            $inheritance{$key} = $data->{$key};
            next;
        }

        # Indirect reference: follow it until a real value is reached.
        my $dataref = $data->{$key};
        do {
            $dataref = $self->GetObject ($dataref);
        } while ($dataref =~ m/^\d+ \d+ R$/);

        if (UNIVERSAL::isa ($dataref, "ARRAY")) {
            # The resolved value (not the page dictionary) is the array
            # to inspect.  Each element may be a reference as well.
            $inheritance{$key} = [];

            foreach my $element (@{$dataref}) {
                # Work on a copy: the loop variable aliases the array
                # returned by GetObject, which must stay unmodified.
                my $value = $element;
                while ($value =~ m/^\d+ \d+ R$/) {
                    $value = $self->GetObject ($value);
                }
                push @{$inheritance{$key}}, $value;
            }
        } else {
            $inheritance{$key} = $dataref;
        }
    }

    # If this object contains resources, replace the information in the
    # inheritance.
    $inheritance{"Resource_Object"} = $data->{"/Resources"}
        if (defined ($data->{"/Resources"}));

    if ($data->{"/Type"} eq "/Pages")
    {
        # It's just an intermediate Node
        foreach my $kid (@{$data->{"/Kids"}})
        {
            $self->LoadPageSubtree ($kid, %inheritance);
        }
    }
    elsif ($data->{"/Type"} eq "/Page")
    {
        # We have a real page!
        $inheritance{"Page_Object"} = $ref;
        push @{$self->{"Page"}}, +{ %inheritance };
    }
    else
    {
        # Strange stuff. Complain and discard.
        carp "While loading pages got object of type '", $data->{"/Type"}, "'";
    }
}
247 |
|
---|
248 | #################################################################
|
---|
249 | =pod
|
---|
250 |
|
---|
251 | =head2 TargetFile ( filename )
|
---|
252 |
|
---|
253 | This method links the filename to the pdf descriptor and parses all
|
---|
254 | kinds of header information.
|
---|
255 |
|
---|
256 | =cut
|
---|
257 |
|
---|
#
# TargetFile ( filename )
#
# Opens "filename", checks the "%PDF" signature, stores the four bytes
# following it (with the "-" removed) in $self->{Header}, locates the
# "startxref" value in the last 50 bytes of the file and loads the
# cross-reference data, the info dictionary, the catalog and the root
# of the page tree through ReadCrossReference_pass1 and GetObject.
#
# Returns 1 on success and 0 if the file does not start with "%PDF".
# Croaks if the object is already linked to a file, if no file name is
# given, if the file cannot be opened or if no "startxref" entry is
# found in the last 50 bytes.
#
sub TargetFile {
    my $self = shift;
    my $file = shift;

    croak "Already linked to the file ",$self->{File_Name},"\n"
        if $self->{File_Name} ;

    croak "I need a file name (!)" unless $file;

    # Every object gets its own anonymous handle.  A single bareword
    # handle would be shared by all objects, so that linking a second
    # file would silently re-open the handle of the first one.
    require Symbol;
    my $fh = Symbol::gensym ();

    # The two-argument form of open is retained because the module
    # declares "require 5.005".
    open($fh, "< $file") or croak "can't open $file: $!";
    binmode $fh;
    $self->{File_Name} = $file ;
    $self->{File_Handler} = $fh;

    my $buf;
    read($fh,$buf,4);
    if ( $buf ne "%PDF" ) {
        print "File $file is not PDF compliant !\n" if $PDF::Verbose ;
        return 0 ;
    }

    # The next four bytes are expected to look like "-1.3".
    read($fh,$buf,4);
    $buf =~ s/-//;
    $self->{Header}= $buf;

    # The offset of the last cross-reference section is announced at
    # the very end of the file: startxref <offset> %%EOF
    my $tail;
    seek $fh,-50,2;
    read( $fh, $tail, 50 );
    my ($offset) = $tail =~ m/startxref\s*(\d+)\s*%%EOF/;
    croak "can't find the startxref entry at the end of $file"
        unless defined $offset;

    $self->{"Last_XRef_Offset"} = $offset;
    ReadCrossReference_pass1 ($fh, $offset, $self);
    $self->{"Info"} = $self->GetObject ($self->{"Trailer"}{"/Info"});
    $self->{"Catalog"} = $self->GetObject ($self->{"Trailer"}{"/Root"});
    $self->{"PageTree"} = $self->GetObject ($self->{"Catalog"}{"/Pages"});
    return 1;
}
295 |
|
---|
296 | #################################################################
|
---|
297 | =pod
|
---|
298 |
|
---|
299 | =head2 LoadPageInfo
|
---|
300 |
|
---|
301 | This function loads the information for all pages. This process can
|
---|
302 | take some time for big PDF-files.
|
---|
303 |
|
---|
304 | =cut
|
---|
305 |
|
---|
# Rebuilds $self->{"Page"}: the list is emptied and then refilled by
# LoadPageSubtree, starting at the /Pages entry of the catalog.  The
# value returned by LoadPageSubtree is handed back to the caller.
sub LoadPageInfo (\*)
{
    my ($self) = @_;

    # Forget everything collected by an earlier run.
    @{$self->{"Page"}} = ();

    # Descend from the root of the page tree.
    return $self->LoadPageSubtree ($self->{"Catalog"}{"/Pages"});
}
316 |
|
---|
317 |
|
---|
318 |
|
---|
319 | #################################################################
|
---|
320 | =pod
|
---|
321 |
|
---|
322 | =head2 Version
|
---|
323 |
|
---|
324 | Returns the PDF version used for writing the object file.
|
---|
325 |
|
---|
326 | =cut
|
---|
327 |
|
---|
# Returns the value stored under the key "Header" of the object, or
# undef if no header has been recorded.
sub Version {
    my ($self) = @_;

    my $header = $self->{Header};
    return $header;
}
331 |
|
---|
332 | #################################################################
|
---|
333 | =pod
|
---|
334 |
|
---|
335 | =head2 IsaPDF
|
---|
336 |
|
---|
337 | Returns true, if the file could be parsed and is a PDF-file.
|
---|
338 |
|
---|
339 | =cut
|
---|
340 |
|
---|
# Returns true if a header has been recorded for the object and its
# numeric value is not zero, false otherwise.  The undefined case is
# tested explicitly, so no "uninitialized value" warning is produced.
sub IsaPDF {
    my $header = $_[0]->{Header};

    # No header recorded: hand back Perl's canonical false value.
    return !1 unless defined $header;

    return ($header != 0);
}
344 |
|
---|
345 | #################################################################
|
---|
346 | =pod
|
---|
347 |
|
---|
348 | =head2 IscryptPDF
|
---|
349 |
|
---|
350 | Returns true if the PDF contains a crypt object. This indicates that
|
---|
351 | the data of the PDF-File is encrypted. In this case, not all functions
|
---|
352 | work as expected.
|
---|
353 |
|
---|
354 | =cut
|
---|
355 |
|
---|
# Returns true if a value is stored under the key "Crypt_Object" of
# the object, false otherwise.  The value is only tested for being
# defined; it is never interpreted as a number.
sub IscryptPDF {
    return defined $_[0]->{Crypt_Object};
}
359 |
|
---|
360 | #################################################################
|
---|
361 | =pod
|
---|
362 |
|
---|
363 | =head2 GetInfo ( key )
|
---|
364 |
|
---|
365 | Returns the various information contained in the info section of a PDF
|
---|
366 | file (if present). A PDF file can have:
|
---|
367 |
|
---|
368 | a title ==> GetInfo ("Title")
|
---|
369 | a subject ==> GetInfo ("Subject")
|
---|
370 | an author ==> GetInfo("Author")
|
---|
371 | a creation date ==> GetInfo("CreationDate")
|
---|
372 | a creator ==> GetInfo("Creator")
|
---|
373 | a producer ==> GetInfo("Producer")
|
---|
374 | a modification date ==> GetInfo("ModDate")
|
---|
375 | some keywords ==> GetInfo("Keywords")
|
---|
376 |
|
---|
377 | =cut
|
---|
378 |
|
---|
# Looks up the entry "/<key>" in $self->{"Info"} and returns it after
# passing it through PDF::Core::UnQuoteString.
sub GetInfo (\*$)
{
    my ($self, $type) = @_;

    # Keys of the info dictionary carry a leading slash.
    my $key = "/" . $type;

    return PDF::Core::UnQuoteString ($self->{"Info"}{$key})
}
386 |
|
---|
387 | #################################################################
|
---|
388 | =pod
|
---|
389 |
|
---|
390 | =head2 Pages
|
---|
391 |
|
---|
392 | Returns the number of pages of the PDF-file.
|
---|
393 |
|
---|
394 | =cut
|
---|
395 |
|
---|
# Returns the /Count entry of the root of the page tree as stored in
# $self->{"PageTree"}.
sub Pages
{
    my ($self) = @_;

    my $count = $self->{"PageTree"}{"/Count"};
    return $count;
}
402 |
|
---|
403 |
|
---|
404 |
|
---|
405 | #################################################################
|
---|
406 | =pod
|
---|
407 |
|
---|
408 | =head2 PageSize ( [ page ] )
|
---|
409 |
|
---|
410 | Returns the size of a page in the PDF-file. If no parameter is given,
|
---|
411 | the default size of the root page will be returned. This value may be
|
---|
412 | overridden for any page.
|
---|
413 |
|
---|
414 | If the size of an individual page is requested and the page data is
|
---|
415 | not already loaded, the method B<LoadPageInfo> will be executed. This
|
---|
416 | may take some time for large PDF-files. The size of the root page is
|
---|
417 | always available and will never execute B<LoadPageInfo>.
|
---|
418 |
|
---|
419 | =cut
|
---|
420 |
|
---|
# Returns the elements of the /MediaBox array of page number "page"
# (counted from 1), or of the root of the page tree when no positive
# page number is given.  Returns undef if the page number exceeds the
# /Count of the page tree or if no /MediaBox is stored.  The page list
# is filled through LoadPageInfo when it is still empty.
sub PageSize (;$)
{
    my ($self, $page) = @_;

    unless ($page > 0)
    {
        # No page requested: answer for the root of the page tree.
        if (defined $self->{"PageTree"}{"/MediaBox"})
        {
            return @{$self->{"PageTree"}{"/MediaBox"}};
        }
        return undef;
    }

    # Page numbers beyond the end of the document have no size.
    if ($page > $self->{"PageTree"}{"/Count"})
    {
        return undef;
    }

    # Collect the page data on first use.
    $self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

    if (defined $self->{"Page"}[$page - 1]{"/MediaBox"})
    {
        return @{$self->{"Page"}[$page - 1]{"/MediaBox"}};
    }

    return undef;
}
442 |
|
---|
443 |
|
---|
444 | #################################################################
|
---|
445 | =pod
|
---|
446 |
|
---|
447 | =head2 PageRotation ( [ page ] )
|
---|
448 |
|
---|
449 | Returns the rotation of a page in the PDF-file. If no parameter is given,
|
---|
450 | the default rotation of the root page will be returned. This value may be
|
---|
451 | overridden for any page.
|
---|
452 |
|
---|
453 | If the rotation of an individual page is requested and the page data is
|
---|
454 | not already loaded, the method B<LoadPageInfo> will be executed. This
|
---|
455 | may take some time for large PDF-files. The rotation of the root page is
|
---|
456 | always available and will never execute B<LoadPageInfo>.
|
---|
457 |
|
---|
458 | =cut
|
---|
# Returns the /Rotate value of page number "page" (counted from 1), or
# of the root of the page tree when no positive page number is given.
# The value is returned as a number, so a missing entry yields 0.
# Returns undef if the page number exceeds the /Count of the page
# tree.  The page list is filled through LoadPageInfo when it is still
# empty.  With $PDF::Verbose set, the rotation is also printed.
sub PageRotation (;$)
{
    my ($self, $page) = @_;

    my $rotate;

    unless ($page > 0)
    {
        # No page requested: use the root of the page tree.
        $rotate = $self->{"PageTree"}{"/Rotate"};
    }
    else
    {
        if ($page > $self->{"PageTree"}{"/Count"})
        {
            return undef;
        }

        # Collect the page data on first use.
        $self->LoadPageInfo if ($#{$self->{"Page"}} < 0);

        $rotate = $self->{"Page"}[$page - 1]{"/Rotate"};
    }

    # Force a numeric result.
    my $angle = 0 + $rotate;

    print "Rotation ", $angle if ($PDF::Verbose);

    return $angle;
}
482 | #################################################################
|
---|
483 | 1;
|
---|
484 | __END__
|
---|
485 |
|
---|
486 | =head1 Variables
|
---|
487 |
|
---|
488 | The only available variable is :
|
---|
489 |
|
---|
490 | =over
|
---|
491 |
|
---|
492 | =item B<$PDF::Parse::VERSION>
|
---|
493 |
|
---|
494 | Contains the version of the library installed
|
---|
495 |
|
---|
496 | =back
|
---|
497 |
|
---|
498 |
|
---|
499 | =head1 Copyright
|
---|
500 |
|
---|
501 | Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
|
---|
502 |
|
---|
503 | This library is free software; you can redistribute it and/or
|
---|
504 | modify it under the same terms as Perl itself.
|
---|
505 |
|
---|
506 | =head1 Availability
|
---|
507 |
|
---|
508 | The latest version of this library is likely to be available from:
|
---|
509 |
|
---|
510 | http://www.geocities.com/CapeCanaveral/Hangar/4794/
|
---|
511 |
|
---|
512 | =cut
|
---|
513 |
|
---|