source: trunk/gsdl/bin/script/indexes/Parse.pm@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1#
2# PDF::Parse.pm, version 1.11 February 2000 antro
3#
4# Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
5#
6# Free usage under the same Perl Licence condition.
7#
8
9package PDF::Parse;
10
11$PDF::Parse::VERSION = "1.11";
12
13=pod
14
15=head1 NAME
16
17PDF::Parse - Library with parsing functions for PDF library
18
19=head1 SYNOPSIS
20
21 use PDF::Parse;
22
23 $pdf->TargetFile($filename);
24 $pdf->LoadPageInfo;
25
26 $version = $pdf->Version;
27 $bool = $pdf->IsaPDF;
28 $bool = $pdf->IscryptPDF;
29
30 $info = $pdf->GetInfo ($key);
31 $pagenum = $pdf->Pages;
32
33 @size = $pdf->PageSize ($page);
34 # or
35 @size = $pdf->PageSize;
36
37 $rotation = $pdf->PageRotation ($page);
38 # or
39 $rotation = $pdf->PageRotation;
40
41=head1 DESCRIPTION
42
43The main purpose of the PDF::Parse library is to provide parsing functions
44for the more general PDF library.
45
46=head1 Methods
47
48The available methods are:
49
50=cut
51
52require 5.005;
53#require PDF::Core;
54require Core;
55
56use strict;
57use Carp;
58use Exporter ();
59
# Package globals.  They are declared through "use vars" because the file
# only asks for perl 5.005, where "our" is not available.
use vars qw(@EXPORT_OK @ISA);

# Inherit from Exporter and from PDF::Core.
@ISA = ('Exporter', 'PDF::Core');

# Names that can be imported on request.
@EXPORT_OK = (
    'LoadPageInfo', 'GetInfo',  'TargetFile',
    'Pages',        'PageSize', 'PageRotation',
    'IsaPDF',       'Version',  'IscryptPDF',
);
67
68#################################################################
#
# ReadCrossReference_pass1 ( fd, offset, self )
#
# Reads the cross-reference section that starts at byte OFFSET of the file
# handle FD and stores what it finds in the hash SELF:
#
#   $self->{Objects}[n]   value of the 10 character offset field of entry n,
#                         negated for entries flagged "f" (free)
#   $self->{Gen_Num}[n]   value of the 5 character generation field
#   $self->{Trailer}      the first trailer dictionary encountered, with
#                         copies of /Size, /Root and /Encrypt stored in
#                         Cross_Reference_Size, Root_Object and Crypt_Object
#   $self->{Updated}      set to 1 when a trailer carries a numeric /Prev
#
# When the trailer has a numeric /Prev entry the function calls itself for
# that offset and restores the file position afterwards.  An entry that was
# already filled in by an earlier pass is never overwritten, so the section
# that is read first wins; this now holds for free entries as well as for
# entries in use.
#
# Returns the number of entries read, including those of the sections
# reached through /Prev.  Dies if the line at OFFSET does not contain "xref".
#
sub ReadCrossReference_pass1 {
    my $fd     = shift;
    my $offset = shift;
    my $self   = shift;

    # Number of the first object of the current subsection.  Starts at 0 so
    # that an entry met before any subsection header is indexed with a
    # defined value.
    my $initial_number     = 0;
    my $obj_counter        = 0;    # entries seen in the current subsection
    my $global_obj_counter = 0;    # entries seen in finished subsections

    # The current line lives in the global $_ ; localise it so the caller's
    # $_ (possibly aliased to a loop element) is not overwritten.
    # NOTE(review): PDF::Core::PDFGetPrimitive is called right after the
    # line holding "<<" was loaded into $_ and may depend on it.  That is
    # why $_ is localised rather than replaced by a lexical - confirm
    # against PDF::Core before changing this.
    local $_;

    binmode $fd;

    $_ = PDF::Core::PDFGetline ($fd, \$offset);

    die "Can't read cross-reference section, according to trailer\n"
        if ! /xref\r?\n?/;

    # NOTE(review): neither loop below checks for end of file; whether
    # PDFGetline signals it is not visible from this file.
    while () {
        $_ = PDF::Core::PDFGetline ($fd, \$offset);
        s/^\n//;
        s/^\r//;
        last if (m/\btrailer\b/);

        if (/^\d+\s+\d+\s+n\r?\n?/) {
            #
            # An object in use
            #
            my $ind = $initial_number + ($obj_counter++);
            if (not defined $self->{Objects}[$ind]) {
                $self->{Objects}[$ind] = int substr($_, 0, 10);
                $self->{Gen_Num}[$ind] = int substr($_, 11, 5);
            }
        }
        elsif (/^\d+\s+\d+\s+f\r?\n?/) {
            #
            # A freed object.  Guarded like the branch above so that an
            # entry recorded by an earlier pass is kept.
            #
            my $objects_generation_nr = substr($_, 11, 5);
            my $Num = substr($_, 0, 10);
            my $ind = $initial_number + ($obj_counter++);
            if (not defined $self->{Objects}[$ind]) {
                $self->{Objects}[$ind] = - $Num;
                $self->{Gen_Num}[$ind] = $objects_generation_nr;
            }
        }
        elsif (/^(\d+)\s+\d+\r?\n?/) {
            #
            # A subsection header: "<first object number> <entry count>"
            #
            $initial_number      = $1;
            $global_obj_counter += $obj_counter;
            $obj_counter         = 0;
        }
    }

    $global_obj_counter += $obj_counter;

    #
    # Now the trailer for updates: skip to the start of the dictionary ...
    #
    until (m/<</) {
        $_ = PDF::Core::PDFGetline ($fd, \$offset);
    }

    #
    # ... and read the dictionary.
    #
    my %trailer = ( PDF::Core::PDFGetPrimitive ($fd, $offset) );

    if ($self->{"Trailer"}{"/Root"} eq "") {
        $self->{"Trailer"} = \%trailer;
        #
        # This code is here for backward compatibility only. If the content
        # of the root trailer is needed, use $self->{"Trailer"} instead.
        #
        $self->{"Cross_Reference_Size"} = $trailer{"/Size"};
        $self->{"Root_Object"}          = $trailer{"/Root"};
        $self->{"Crypt_Object"}         = $trailer{"/Encrypt"};
    }

    if ($trailer{"/Prev"} =~ m/^\d+$/) {
        # The file has been updated: also read the previous section, then
        # put the file position back where it was.
        $self->{"Updated"} = 1;
        my $old_seek = tell $fd;
        $global_obj_counter +=
            ReadCrossReference_pass1 ($fd, $trailer{"/Prev"}, $self);
        seek $fd, $old_seek, 0;
    }

    return $global_obj_counter;
}
171
172#################################################################
#
# LoadPageSubtree ( ref [, inheritance ] )
#
# Walks the page tree below the object referenced by REF.  INHERITANCE is a
# key/value list with the attributes collected from the ancestors.  For
# every /Page object reached, a hash holding the collected attributes plus
# "Page_Object" (the reference of the page) is pushed onto
# @{$self->{"Page"}}.  /Pages objects are descended through their /Kids.
# Any other /Type is reported with carp and skipped.
#
# Called as a method; the object is fetched with $self->GetObject.
#
sub LoadPageSubtree {
    my $self        = shift;
    my $ref         = shift;
    my %inheritance = @_;

    my $data = $self->GetObject ($ref);

    # Check which attributes are inherited. Adobe did not add any new
    # inherited attributes in version 1.2 or later, so this list is
    # complete.
    foreach my $key ("/Rotate", "/Dur", "/Hid", "/Trans",
                     "/MediaBox", "/CropBox") {

        next unless defined $data->{$key};

        my $value = $data->{$key};

        # A direct value is taken over as it is.
        if ($value !~ m/^\d+ \d+ R$/) {
            $inheritance{$key} = $value;
            next;
        }

        # An indirect reference: follow it until a real value shows up.
        do {
            $value = $self->GetObject ($value);
        } while ($value =~ m/^\d+ \d+ R$/);

        if (UNIVERSAL::isa ($value, "ARRAY")) {
            # Each element may be a reference as well.  The resolved
            # elements go into a new array, the fetched one is not changed.
            my @resolved;
            foreach my $element (@{$value}) {
                my $item = $element;
                while ($item =~ m/^\d+ \d+ R$/) {
                    $item = $self->GetObject ($item);
                }
                push @resolved, $item;
            }
            $inheritance{$key} = \@resolved;
        }
        else {
            $inheritance{$key} = $value;
        }
    }

    # If this object contains resources, replace the information in
    # inheritance.
    $inheritance{"Resource_Object"} = $data->{"/Resources"}
        if (defined ($data->{"/Resources"}));

    if ($data->{"/Type"} eq "/Pages") {
        # It's just an intermediate node.
        foreach my $kid (@{$data->{"/Kids"}}) {
            $self->LoadPageSubtree ($kid, %inheritance);
        }
    }
    elsif ($data->{"/Type"} eq "/Page") {
        # We have a real page!
        $inheritance{"Page_Object"} = $ref;
        push @{$self->{"Page"}}, +{ %inheritance };
    }
    else {
        # Strange stuff. Complain and discard.
        carp "While loading pages got object of type '", $data->{"/Type"}, "'";
    }
}
247
248#################################################################
249=pod
250
251=head2 TargetFile ( filename )
252
253This method links the filename to the PDF descriptor and parses all
254kinds of header information.
255
256=cut
257
#
# TargetFile ( filename )
#
# Links FILENAME to the object and reads the header information:
# the version bytes after "%PDF" go to $self->{Header}, the startxref
# offset to $self->{"Last_XRef_Offset"}, then the cross-reference data is
# read and the Info, Catalog and PageTree objects are fetched.
#
# Returns 1 on success and 0 when the file does not start with "%PDF".
# Croaks when the object is already linked to a file, when no file name is
# given, when the file cannot be opened or when no startxref offset can be
# found near the end of the file.
#
sub TargetFile {
    my $self = shift;
    my $file = shift;

    croak "Already linked to the file ",$self->{File_Name},"\n"
        if $self->{File_Name};

    croak "I need a file name (!)" unless $file;

    # Every object gets a handle of its own, so that linking a second file
    # does not close the handle of the first one.  Symbol and Fcntl are
    # loaded here because the file does not import them; sysopen takes the
    # name literally instead of parsing it for a mode.
    require Symbol;
    require Fcntl;

    my $fh = Symbol::gensym();
    sysopen($fh, $file, Fcntl::O_RDONLY()) or croak "can't open $file: $!";
    binmode $fh;
    $self->{File_Name} = $file;

    my $buf = "";
    read($fh, $buf, 4);
    if ( $buf ne "%PDF" ) {
        print "File $file is not PDF compliant !\n" if $PDF::Verbose;
        close $fh;    # nothing more will be read from it
        return 0;
    }
    $self->{File_Handler} = $fh;

    # The next four bytes hold "-x.y"; keep them without the dash.
    read($fh, $buf, 4);
    $buf =~ s/-//;
    $self->{Header} = $buf;

    # Look for the last "startxref <offset> %%EOF" in the final 1024 bytes,
    # or in the whole file when it is shorter than that.
    seek $fh, 0, 2;
    my $size     = tell $fh;
    my $tail_len = $size < 1024 ? $size : 1024;
    my $tail     = "";
    seek $fh, -$tail_len, 2;
    read($fh, $tail, $tail_len);

    my ($offset) = $tail =~ m/.*startxref\s*(\d+)\s*%%EOF/s;
    croak "can't find the startxref offset in $file"
        unless defined $offset;

    $self->{"Last_XRef_Offset"} = $offset;
    ReadCrossReference_pass1 ($fh, $offset, $self);
    $self->{"Info"}     = $self->GetObject ($self->{"Trailer"}{"/Info"});
    $self->{"Catalog"}  = $self->GetObject ($self->{"Trailer"}{"/Root"});
    $self->{"PageTree"} = $self->GetObject ($self->{"Catalog"}{"/Pages"});
    return 1;
}
295
296#################################################################
297=pod
298
299=head2 LoadPageInfo
300
301This function loads the information for all pages. This process can
302take some time for big PDF-files.
303
304=cut
305
#
# LoadPageInfo
#
# Empties @{$self->{"Page"}} and fills it again by walking the page tree
# from the /Pages entry of the catalog with LoadPageSubtree.
#
# The former prototype (\*) has been dropped: it is ignored for method
# calls and only prevented calling the function directly with an object.
#
sub LoadPageInfo {
    my $self = shift;

    # Reset Page Array
    $#{$self->{"Page"}} = -1;

    # Recurse
    $self->LoadPageSubtree ($self->{"Catalog"}{"/Pages"});
}
316
317
318
319#################################################################
320=pod
321
322=head2 Version
323
324Returns the PDF version used for writing the object file.
325
326=cut
327
# Returns the value stored under the "Header" key of the object, i.e. the
# version bytes TargetFile read from the start of the file.
sub Version {
    my ($self) = @_;

    return ($self->{Header});
}
331
332#################################################################
333=pod
334
335=head2 IsaPDF
336
337Returns true, if the file could be parsed and is a PDF-file.
338
339=cut
340
# Returns true when the "Header" entry of the object is defined and has a
# non-zero numeric value.  This is what the former comparison
# "!= undef" effectively tested, now without comparing against undef.
sub IsaPDF {
    my $header = $_[0]->{Header};

    return (defined $header && $header != 0);
}
344
345#################################################################
346=pod
347
348=head2 IscryptPDF
349
350Returns true if the PDF contains a crypt object. This indicates that
351the data of the PDF-File is encrypted. In this case, not all functions
352work as expected.
353
354=cut
355
# Returns true when the "Crypt_Object" entry of the object is defined and
# has a non-zero numeric value.  This is what the former comparison
# "!= undef" effectively tested, now without comparing against undef.
sub IscryptPDF {
    my $crypt_object = $_[0]->{Crypt_Object};

    return (defined $crypt_object && $crypt_object != 0);
}
359
360#################################################################
361=pod
362
363=head2 GetInfo ( key )
364
365Returns the various information contained in the info section of a PDF
366file (if present). A PDF file can have:
367
368 a title ==> GetInfo ("Title")
369 a subject ==> GetInfo ("Subject")
370 an author ==> GetInfo("Author")
371 a creation date ==> GetInfo("CreationDate")
372 a creator ==> GetInfo("Creator")
373 a producer ==> GetInfo("Producer")
374 a modification date ==> GetInfo("ModDate")
375 some keywords ==> GetInfo("Keywords")
376
377=cut
378
#
# GetInfo ( key )
#
# Looks up "/" . KEY in $self->{"Info"} and returns the value after passing
# it through PDF::Core::UnQuoteString.
#
# The former prototype (\*$) has been dropped: it is ignored for method
# calls and only prevented calling the function directly with an object.
#
sub GetInfo {
    my $self = shift;
    my $type = shift;

    return PDF::Core::UnQuoteString ($self->{"Info"}{"/" . $type});
}
386
387#################################################################
388=pod
389
390=head2 Pages
391
392Returns the number of pages of the PDF-file.
393
394=cut
395
# Returns the /Count entry of the page tree stored in the object.
sub Pages {
    my ($self) = @_;

    return $self->{"PageTree"}{"/Count"};
}
402
403
404
405#################################################################
406=pod
407
408=head2 PageSize ( [ page ] )
409
410Returns the size of a page in the PDF-file. If no parameter is given,
411the default size of the root page will be returned. This value may be
412overridden for any page.
413
414If the size of an individual page is requested and the page data is
415not already loaded, the method B<LoadPageInfo> will be executed. This
416may take some time for large PDF-files. The size of the root page is
417always available and will never execute B<LoadPageInfo>.
418
419=cut
420
#
# PageSize ( [ page ] )
#
# With PAGE greater than 0: returns the elements of the /MediaBox collected
# for that page, loading the page information with LoadPageInfo first when
# $self->{"Page"} is still empty.  Returns undef when PAGE is larger than
# the /Count of the page tree or when the page has no /MediaBox.
#
# Without PAGE (or with a value that is not greater than 0): returns the
# elements of the /MediaBox of the page tree, or undef if there is none.
#
# "return undef" is kept as it was, so in list context a failure still
# yields a one-element list holding undef.
#
# The former prototype (;$) has been dropped: it is ignored for method
# calls and made a direct call with a page number a compile error.
#
sub PageSize {
    my $self = shift;
    my $page = shift;

    if ($page > 0) {
        return undef if ($page > $self->{"PageTree"}{"/Count"});
        $self->LoadPageInfo unless ($#{$self->{"Page"}} >= 0);

        return @{$self->{"Page"}[$page - 1]{"/MediaBox"}}
            if (defined $self->{"Page"}[$page - 1]{"/MediaBox"});
    }
    else {
        return @{$self->{"PageTree"}{"/MediaBox"}}
            if (defined $self->{"PageTree"}{"/MediaBox"});
    }

    return undef;
}
442
443
444#################################################################
445=pod
446
447=head2 PageRotation ( [ page ] )
448
449Returns the rotation of a page in the PDF-file. If no parameter is given,
450the default rotation of the root page will be returned. This value may be
451overridden for any page.
452
453If the rotation of an individual page is requested and the page data is
454not already loaded, the method B<LoadPageInfo> will be executed. This
455may take some time for large PDF-files. The rotation of the root page is
456always available and will never execute B<LoadPageInfo>.
457
458=cut
#
# PageRotation ( [ page ] )
#
# With PAGE greater than 0: returns the /Rotate value collected for that
# page as a number, loading the page information with LoadPageInfo first
# when $self->{"Page"} is still empty.  Returns undef when PAGE is larger
# than the /Count of the page tree.
#
# Without PAGE (or with a value that is not greater than 0): returns the
# /Rotate value of the page tree as a number.
#
# A missing /Rotate gives 0.  The value is also printed when $PDF::Verbose
# is true.
#
# The former prototype (;$) has been dropped: it is ignored for method
# calls and made a direct call with a page number a compile error.
#
sub PageRotation {
    my $self = shift;
    my $page = shift;

    my $rotate = 0;

    if ($page > 0) {
        return undef if ($page > $self->{"PageTree"}{"/Count"});
        $self->LoadPageInfo unless ($#{$self->{"Page"}} >= 0);

        $rotate = $self->{"Page"}[$page - 1]{"/Rotate"};
    }
    else {
        $rotate = $self->{"PageTree"}{"/Rotate"};
    }

    print "Rotation ", 0 + $rotate if ($PDF::Verbose);

    return 0 + $rotate;
}
482#################################################################
4831;
484__END__
485
486=head1 Variables
487
488The only available variable is :
489
490=over
491
492=item B<$PDF::Parse::VERSION>
493
494Contains the version of the library installed
495
496=back
497
498
499=head1 Copyright
500
501 Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
502
503This library is free software; you can redistribute it and/or
504modify it under the same terms as Perl itself.
505
506=head1 Availability
507
508The latest version of this library is likely to be available from:
509
510http://www.geocities.com/CapeCanaveral/Hangar/4794/
511
512=cut
513
Note: See TracBrowser for help on using the repository browser.