source: gsdl/trunk/bin/script/indexes/Core.pm@ 18440

Last change on this file since 18440 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1#
2# PDF::Core.pm, version 1.11 February 2000 antro
3#
4# Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
5#
6# Free usage under the same Perl Licence condition.
7#
8
9package PDF::Core;
10
11$PDF::Core::VERSION = "1.11";
12
13=pod
14
15=head1 NAME
16
17PDF::Core - Core Library for PDF library
18
19=head1 SYNOPSIS
20
21 use PDF::Core;
22
23 $pdf=PDF::Core->new ;
24 $pdf=PDF->new(filename);
25
26 $res= $pdf->GetObject($ref);
27
28 $name = UnQuoteName($pdfname);
29 $string = UnQuoteString($pdfstring);
30
31 $pdfname = QuoteName($name);
32 $pdfhexstring = QuoteHexString($string);
33 $pdfstring = QuoteString($string);
34
35 $obj = PDFGetPrimitive (filehandle, \$offset);
36 $line = PDFGetLine (filehandle, \$offset);
37
38
39=head1 DESCRIPTION
40
41The main purpose of the PDF::Core library is to provide the data structure
42and the constructor for the more general PDF library.
43
44=cut
45
46require 5.005;
47use strict;
48use Carp;
49use Exporter ();
50
51use vars qw(@ISA @EXPORT_OK $UseObjectCache);
52
53@ISA = qw(Exporter);
54
55@EXPORT_OK = qw( GetObject );
56
57#
58# Object caching
59#
60# If this variable is true, all processed objects will be added to the
61# object cache. If only header information of a PDF are read or very
62# big PDF are processed, turning off the cache reduces the memory usage.
63#
64$UseObjectCache = 1;
65
66
67#################################################################
68#
69# Helper functions
70#
71#################################################################
72
73=pod
74
75=head1 Helper functions
76
77This functions are not part of the class, but perform useful services.
78
79=cut
80
81#
82# Modification by johi: 18.12.1999
83#
84
85#################################################################
86=pod
87
88=head2 UnQuoteName ( string )
89
90This function processes quoted characters in a PDF-name. PDF-names returned by
91B<GetObject> are already processed by this function.
92
93Returns a string.
94
95=cut
96
97sub UnQuoteName ($)
98 {
99 my $value = shift;
100 $value =~ s/#([\da-f]{2})/chr(hex($1))/ige;
101 return $value;
102 }
103
104#################################################################
105=pod
106
107=head2 UnQuoteString ( string )
108
109This function extracts the text from PDF-strings and PDF-hexstrings.
110It will process all quoted characters and remove the enclosing braces.
111
112WARNING: The current version doesn't handle unicode strings properly.
113
114Returns a string.
115
116=cut
117
118sub UnQuoteString ($)
119 {
120#
121# Translate quoted character.
122#
123 my $param = shift;
124 my $value;
125 if (($value) = $param =~ m/^<(.*)>$/)
126 {
127 $value =~ tr/0-9A-Fa-f//cd;
128 $value .= "0" if (length ($value) % 2);
129 $value =~ s/([\da-f]{2})/chr(hex($1))/ige;
130 }
131 elsif (($value) = $param =~ m/^\((.*)\)$/)
132 {
133 my %quoted = ("n" => "\n", "r" => "\r",
134 "t" => "\t", "b" => "\b",
135 "f" => "\f", "\\" => "\\",
136 "(" => "(", ")" => ")");
137 $value =~ s/\\([nrtbf\\()]|[0-7]{1,3})/
138 defined ($quoted{$1}) ? $quoted{$1} : chr(oct($1))/gex;
139 }
140 else
141 {
142 $value = $param;
143 }
144
145 return $value;
146 }
147
148#################################################################
149=pod
150
151=head2 QuoteName ( string )
152
153This function quotes problematic characters in a PDF-name. This
154function should be used before writing a PDF-name back to a PDF-file.
155
156Returns a string.
157
158=cut
159
160sub QuoteName ($)
161 {
162 my $value = shift;
163 $value =~ s/(?<!\A)([\x00-\x20\x7f-\xff%()\[\]<>\/{}#])/
164 sprintf ("#%2.2X", ord($1))/gex;
165 return $value;
166 }
167
168#################################################################
169=pod
170
171=head2 QuoteHexString ( string )
172
173This function translates a string into a PDF-hexstring.
174
175Returns a string.
176
177=cut
178
179sub QuoteHexString ($)
180 {
181 my $value = shift;
182
183 $value =~ s/(.)/sprintf ("%2.2X", ord($1))/ge;
184 return ("<" . $value . ">");
185 }
186
187#################################################################
188=pod
189
190=head2 QuoteString ( string )
191
192This function translates a string into a PDF-string. Problematic
193character will be quoted.
194
195WARNING: The current version doesn't handle unicode strings properly.
196
197Returns a string.
198
199=cut
200
201sub QuoteString ($)
202 {
203 #
204 # Only \character style quotes will be added. The really important
205 # characters to quote are: ()\
206 #
207 my $value = shift;
208
209 my %quote = ("\n" => "\\n", "\r" => "\\r",
210 "\t" => "\\t", "\b" => "\\b",
211 "\f" => "\\f", "\\" => "\\\\",
212 "(" => "\\(", ")" => "\\)");
213 $value =~ s/([\n\r\t\b\f\\()])/$quote{$1}/g;
214 return ("(" . $value . ")");
215 }
216
217#################################################################
218=pod
219
220=head2 PDFGetPrimitive ( filehandle, offset )
221
222This internal function is used while parsing a PDF-file. If you are
223not writing extentions for this library and are parsing some special
224parts of the PDF-file, stay away and use B<GetObject> instead.
225
226This function has many quirks and limitations. Check the source for details.
227
228=cut
229
230sub PDFGetPrimitive (*\$)
231 {
232 my $fd = shift;
233 my $offset = shift;
234
235 binmode $fd;
236 seek $fd, $$offset, 0;
237
238 my $state = 0;
239 my $buffer;
240 my @collector;
241 my $lastchar;
242
243 while ()
244 {
245 # File offset is positioned on start of stream.
246 last if ($state == -4);
247
248 $state = 0;
249
250 # Process last element
251 if ($#collector >= 0)
252 {
253 my $lastvalue = $collector[$#collector];
254
255 if ($lastvalue eq "R")
256 {
257 # Process references
258 if ($#collector >= 2
259 && $collector[$#collector - 1] =~ m/\d+/
260 && $collector[$#collector - 2] =~ m/\d+/)
261 {
262 $collector[$#collector - 2] .= join (" ",
263 "", @collector[$#collector - 1, $#collector]);
264 $#collector -= 2;
265 }
266 else
267 {
268 carp "Bad reference at offset ", $$offset;
269 }
270 }
271 elsif ($lastvalue eq "endobj")
272 {
273 # End of object
274 last;
275 }
276 elsif ($lastvalue eq "stream")
277 {
278 # End of object
279 $state = -4;
280 }
281 }
282
283 # Set state for next element
284 if ($buffer eq "[")
285 {
286 # Read array
287 $buffer = "";
288 push @collector, [ PDFGetPrimitive ($fd, $offset) ];
289 }
290 elsif ($buffer eq "<<")
291 {
292 # Read dictionary
293 $buffer = "";
294 push @collector, { PDFGetPrimitive ($fd, $offset) };
295 }
296 elsif ($buffer eq "(")
297 {
298 # Here comes a string
299 $state = 1;
300 $lastchar = "";
301 }
302 elsif ($buffer eq "<")
303 {
304 # Here comes a hex string
305 $state = -1;
306 }
307 elsif ($buffer eq ">")
308 {
309 # Wait for next > to terminate dictionary
310 $state = -2;
311 }
312 elsif ($buffer eq "%")
313 {
314 # Skip comments
315 $state = -3;
316 $buffer = "";
317 }
318 elsif ($buffer eq "]")
319 {
320 last;
321 }
322 elsif ($buffer eq ">>")
323 {
324 last;
325 }
326
327 # Read next item
328 while (read ($fd, $_, 1))
329 {
330 $$offset++;
331
332 if ($state == 0)
333 {
334 # Normal mode
335 if (m/[^\x00-\x20\x7f-\xff%()\[\]<>\/]/)
336 {
337 # Normal character inside a name or number
338 $buffer .= $_;
339 }
340 elsif (m/[\/\(\[\]\<\>%]/)
341 {
342 if ($buffer ne "")
343 {
344 # A new item starts
345 if ($buffer =~ m/^\//)
346 {
347 push @collector, UnQuoteName ($buffer);
348 }
349 else
350 {
351 push @collector, $buffer;
352 }
353 }
354 $buffer = $_;
355 last;
356 }
357 elsif (m/\s/)
358 {
359 # All kind of whitespaces are ignored
360 if ($buffer ne "")
361 {
362 # The old item is done starts
363 if ($buffer =~ m/^\//)
364 {
365 push @collector, UnQuoteName ($buffer);
366 }
367 else
368 {
369 push @collector, $buffer;
370 }
371 $buffer = "";
372 last;
373 }
374 }
375 else
376 {
377 # Strange character. Should not exist.
378 # Complain and move on.
379 carp "Strange character '", $_, "' at offset ",
380 $$offset, " in mode ", $state, " detected";
381 $buffer .= $_;
382 }
383 }
384 elsif ($state > 0)
385 {
386 # We have a string
387
388 if ($lastchar =~ m/\\[\r\n]+/ && m/[^\r\n]/)
389 {
390 # Clean up after line continuation
391 $lastchar = "";
392 }
393
394 if ($lastchar =~ m/\\[\r\n]*/)
395 {
396 # Process character after backslash
397 if (m/[\r\n]/)
398 {
399 # end of line
400 $lastchar .= $_;
401 }
402 else
403 {
404 # Just a quote
405 $buffer .= $lastchar . $_;
406 $lastchar = "";
407 }
408 }
409 else
410 {
411 if ($_ eq "\\")
412 {
413 # Quoted string starts
414 $lastchar = $_;
415 }
416 elsif ($_ eq "(")
417 {
418 # Count braces
419 $buffer .= $_;
420 $state ++;
421 }
422 elsif ($_ eq ")")
423 {
424 # End of string
425 $buffer .= $_;
426 unless (-- $state)
427 {
428 push @collector, $buffer;
429 $buffer = "";
430 last;
431 }
432 }
433 else
434 {
435 $buffer .= $_;
436 }
437 }
438 }
439 elsif ($state == -1)
440 {
441 if (m/[0-9a-f\s]/i)
442 {
443 # Hex character
444 $buffer .= $_;
445 }
446 elsif ($_ eq ">")
447 {
448 # End of string
449 $buffer .= $_;
450 push @collector, $buffer;
451 $buffer = "";
452 last;
453 }
454 elsif ($_ eq "<" && $buffer eq "<")
455 {
456 # This is not a string, but a dictionary instead
457 $buffer .= $_;
458 last;
459 }
460 else
461 {
462 # Should not be there. Complain and add it to the $buffer
463 carp "Bad character '", $_ , "' in hex string";
464 $buffer .= $_;
465 }
466 }
467 elsif ($state == -2)
468 {
469 # Wait for second > to terminate dictionary
470
471 # Some sanity checks
472 carp "Character '", $_, "' appeared while waiting for '>'"
473 if ($_ ne ">");
474 carp "Buffer contains '", $buffer, "' and not '>'"
475 if ($buffer ne ">");
476
477 $buffer = ">>";
478 last;
479 }
480 elsif ($state == -3)
481 {
482 # Skip comments;
483 last if (m/[\r\n]/);
484 }
485 elsif ($state == -4)
486 {
487 # Wait for newline to start stream
488
489 if ($_ eq "\n")
490 {
491 # Some sanity checks
492 carp "Text '", $buffer,
493 "' appeared while waiting for start of stream"
494 if ($buffer ne "");
495
496 $buffer = "";
497 last;
498 }
499 elsif (m/\S/)
500 {
501 $buffer .= $_;
502 }
503 }
504 else
505 {
506 # Unhandled status. Complain and reset
507 carp "Unhandled status ", $state;
508 }
509 }
510 if ($_ eq "")
511 {
512 # Unhandled status. Complain and reset
513 carp "Premature end of file reached";
514
515 if ($buffer ne "")
516 {
517 push @collector, $buffer;
518 $buffer = "";
519 }
520 last;
521 }
522 }
523
524 return @collector;
525 }
526
527#################################################################
528=pod
529
530=head2 PDFGetline ( filehandle, offset )
531
532This internal function was used to read a line from a PDF-file. It has
533many limitations and you should stay away from it, if you don't know
534what you are doing. Use B<GetObject> or B<PDFGetPrimitive> instead.
535
536=cut
537
538sub PDFGetline {
539#
540# BUG WARNING:
541#
542# This function returns only one line, which doesn't mean anything most of the
543# time. Except for the fileheader and the xref-table, linebreaks can (and will!)
544# occur everywhere in a PDF and are just whitespace. You may find only part of a
545# PDF-primitve on one line, or more than one of them.
546#
547# If you want to read PDF-Primitves, use the function PDFGetPrimitive instead.
548#
549 my $fd = shift;
550 my $offset=shift;
551
552 my $buffer;
553 my $endflag=1;
554
555 binmode $fd;
556 seek $fd, $$offset, 0;
557
558 read($fd,$buffer,2);
559 $buffer =~ s/^\r?\n?// ;
560
561 $$offset +=2;
562
563 while ($endflag) {
564 read($fd,$_,1);
565 $$offset++;
566 $endflag = 0 if ( $_ eq "\r" || $_ eq "\n");
567 $buffer = $buffer . $_ ;
568 }
569 return $buffer;
570 }
571
572#################################################################
573#
574# Constructors
575#
576#################################################################
577
578=pod
579
580=head1 Constructor
581
582=cut
583
584#################################################################
585=pod
586
587=head2 new ( [ filename ] )
588
589This is the constructor of a new PDF object. If the filename is
590missing, it returns an empty PDF descriptor ( can be filled with
591$pdf->TargetFile). Otherwise, It acts as the B<PDF::Parse::TargetFile>
592method.
593
594=cut
595
596sub new {
597
598 my %PDF_Fields = (
599 File_Name => undef, # Name of file
600 File_Handler => undef, # Open handle to file
601 Header => undef, # Identification string
602
603 Objects => [], # Offset of objects
604 Gen_Num => [], # Genereation number of objects
605 Object_Length => [], # Length of processed objects
606 Object_Cache => {}, # Cache for objects.
607 Page => [], # Information about all pages. Useful.
608
609 Updated => 0, # Is the PDF updated
610 Last_XRef_Offset => undef, # File offset of active Xref table
611 Trailer => {}, # Content of active trailer
612 Info => {}, # Content of active info object
613 Catalog => {}, # Content of catalog
614 PageTree => {}, # Content of root page
615 );
616my $that = shift;
617my $class=ref($that) || $that ;
618 my $self = \%PDF_Fields ;
619 my $buf2=bless $self, $class;
620 if ( @_ ) { # I have the filename
621 $buf2->TargetFile($_[0]) ;
622 }
623 return bless $self, $class;
624};
625
626#################################################################
627sub DESTROY {
628#
629# Close the file if not empty
630#
631 my $self = shift;
632 close ( $self->{File_Handler} ) if $self->{File_Handler} ;
633}
634
635#################################################################
636#
637# Methods
638#
639#################################################################
640
641=pod
642
643=head1 Methods
644
645The available methods are:
646
647=cut
648
649#################################################################
650=pod
651
652=head2 GetObject (reference)
653
654This methods returns the PDF-object for B<reference>. The string
655B<reference> must match the regular expression /^\d+ \d+ R$/,
656where the first number is the object number, the second number the
657generation number.
658
659The return value is a PDF-primitive, the type depends on the content
660of the object:
661
662=over
663
664=item B<undef>
665
666The object could not be found or an error. Not all referenced objects
667need to be present in a PDF-file. This value can be ignored.
668
669=item B<Hash Reference>
670
671If (UNIVERSAL::isa ($retval, "HASH") is true, the object is a
672PDF-dictionary. The keys of the hash should be either a PDF name (eg:
673/MediaBox) or a generated value like Stream_Offset. Everything else is
674an error.
675
676The values of the hash can be any PDF-primitive, including PDF-arrays
677and other dictionaries.
678
679This is the most common value returned by GetObject. If the key
680Stream_Offset exists, the dictionary is followed by stream data,
681starting at the file offeset indicated by this value.
682
683=item B<Array Reference>
684
685If (UNIVERSAL::isa ($retval, "ARRAY") is true, the object is a
686PDF-array. Each element may be of a different type, and may contain
687further references to arrays or any other PDF-primitive.
688
689=item B<String matching /^\d+ \d+ R$/>
690
691This is a reference to another PDF-Object. This value can be passed to
692GetObject. This kind of value may appear instead of most other types.
693Some PDF-writing programs seem to have special fun writing references
694when a simple number is expected. If the final number is need, use
695code like this to resolve references:
696
697while ($len =~ m/^\d+ \d+ R$/) {$len = $self->GetObject ($len); }
698
699Example: 22 0 R
700
701=item B<String matching /^\//>
702
703This is a Name in a PDF dictionary. This string is already processed
704by B<UnQuotName> and may differ from the value in the PDF-file. In
705some very old andstrange non-standard PDF-files, this may lead to
706confusion.
707
708Example: /MediaBox
709
710=item B<String matching /^\(.*\)$/>
711
712This is a string. It may contain newlines, quoted characters und other
713strange stuff. Use PDF::UnQuoteString to extract the text.
714
715Example: (This is\na string with two \(2\) lines.)
716
717=item B<String matching /^E<lt>.*E<gt>$/>
718
719This is a hex encoded string. Use PDF::UnQuoteString to extract the text.
720
721Example: E<lt>48 45 4c4C4 F1cE<gt>
722
723=item B<String matching /^[\d.\+\-]+$/>
724
725This is probably a number.
726
727Example: 611
728
729=item B<String matching none of the above>
730
731this is either a PDF bareword (eg. true, false, ...) or a value
732generated by this method like Stream_Offset.
733
734Example: true
735
736=back
737
738To improve performance GetObject uses an internal cache for objects.
739Repeated requests for the same objects are not read form the file but
740satisfied from the cache. With the Variable B<$PDF::Core::UseObjectCache>,
741the caching mechanism can be turned off.
742
743B<WARNING>
744
745Special care must be taken, when returned objects are modified. If the
746object contains sub-objects, the sub-objects are not duplicated and
747all changes affect all other copies of this object. Use your own copy,
748if you need to modify those values.
749
750=cut
751
752sub GetObject (\*$;$)
753 {
754 my $self = shift;
755 my $ref = shift;
756 my $force = shift;
757
758#
759# Is PDF file open?
760#
761 croak "PDF-file not open." unless ($self->{"File_Handler"});
762
763#
764# Check reference
765#
766 my ($ind, $gen);
767 unless (($ind,$gen) = $ref =~ m/^(\d+) (\d+) R$/)
768 {
769 carp "Bad object reference '", $_, "'";
770 return undef;
771 }
772 if ($ind > $#{$self->{"Gen_Num"}} || $self->{"Gen_Num"}[$ind] != $gen)
773 {
774 #
775 # The page does not exist. According to the PDF specification,
776 # this is not an error.
777 #
778 return undef;
779 }
780
781 # Remove leading zero for cache key.
782 $ind += 0;
783 # Check cache
784 if ($UseObjectCache && ! $force
785 && defined($self->{"Object_Cache"}{$ind}))
786 {
787 return $self->{"Object_Cache"}{$ind};
788 }
789
790 my $offset = $self->{"Objects"}[$ind];
791 my @data = PDFGetPrimitive ($self->{"File_Handler"}, $offset);
792
793 unless ($#data == 4 && $data[0] == $ind
794 && $data[1] == $gen && $data[2] eq "obj")
795 {
796 carp "Object mismatch: Got '", join (" ", @data[0..2]),
797 "' instead of '", join (" ", $ind, $gen, "obj"), "'";
798 return;
799 }
800
801 #
802 # An object is not always a dictionary. In such cases,
803 # adding additional keys breaks the content.
804 #
805 if (UNIVERSAL::isa ($data[3], "HASH"))
806 {
807 if ($data[4] eq "stream")
808 {
809 #
810 # Find end of a stream object
811 #
812 $data[3]{"Stream_Offset"} = $offset;
813 my $len = $data[3]{"/Length"};
814
815 # Length can be a reference to another object.
816 # Resolve references in this case till something else appears.
817 while ($len =~ m/^\d+ \d+ R$/)
818 {
819 $len = $self->GetObject ($len);
820 }
821
822 # Skip stream
823 if ($len =~ m/^\d+$/)
824 {
825 $offset += $len;
826 }
827 else
828 {
829 carp "Strange: /Length resolves to '", $len, "' in object ",
830 join (" ", @data[0..2]);
831 }
832
833 my @enddata = PDFGetPrimitive ($self->{"File_Handler"}, $offset);
834 $data[4] = $enddata[$#enddata];
835 }
836 }
837
838 #
839 # Save length of object.
840 #
841 $self->{"Object_Length"}[$ind] = $offset - $self->{"Objects"}[$ind];
842
843 carp "Bad object termination '", $data[4], "' in object ",
844 join (" ", @data[0..2]) if ($data[4] ne "endobj");
845
846
847 # Update cache
848 $self->{"Object_Cache"}{$ind} = $data[3] if ($UseObjectCache);
849
850 return $data[3];
851 }
852
853#
854# End of Modification by johi: 18.12.1999
855#
856#################################################################
857
858
8591;
860__END__
861
862=pod
863
864=head1 Variables
865
866Available variables are:
867
868=over 4
869
870=item B<$PDF::Core::VERSION>
871
872Contains the version of the library installed
873
874=item B<$PDF::Core::UseObjectCache>
875
876If this variable is true, all processed objects will be added to the
877object cache. If only header information of a PDF are read or very big
878PDF are processed, turning off the cache reduces the memory usage.
879
880=back 4
881
882=head1 Copyright
883
884 Copyright (c) 1998 - 2000 Antonio Rosella Italy [email protected], Johannes Blach [email protected]
885
886This library is free software; you can redistribute it and/or
887modify it under the same terms as Perl itself.
888
889=head1 Availability
890
891The latest version of this library is likely to be available from:
892
893http://www.geocities.com/CapeCanaveral/Hangar/4794/
894
895=cut
Note: See TracBrowser for help on using the repository browser.