source: trunk/gsdl/bin/script/indexes/amend_pdf.pl@ 1971

Last change on this file since 1971 was 1971, checked in by jmt14, 23 years ago

added files: Core.pm PDF.pm Parse.pm amend_pdf.pl

buildkpi.pl buildkpiS.pl buildkpiK.pl relation.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.7 KB
Line 
1#!/usr/local/bin/perl
2#Jessica Thurston
3#usage: perl amend_pdf.pl filename
4#This program takes a pdf file 'filename'
5#and modifys it to include outline entries
6#(bookmarks).
7#This is achieved using a pdf library which
8#parses the pdf file into objects allowing
9#this script to more easily manipulate the pdf
10#objects and then modify the file by means
11#of incremental updates. (modifys the necessary
12#pdf objects and appends them to the end of
13#the file, then updates the xreference table and
14#trailer). This library can be downloaded at
15# http://www.sanface.com/PDF-lib/
16#A file url.txt stored in the same
17#directory is read containing two columns of data:
18#title of related pdf document url of related pdf document
19#The pdf document is modified so that the title of the
20#related pdf documents are displayed as bookmarks and
21#when the user clicks on it the web browser opens at
22#the url stored in the same column of url.txt.
23#
24#This program was designed to be used with the Greenstone
25#digital library in the following ways:
26#-When a user has built a collection containing
27# pdf documents, they have a choice of either displaying
28# the page in the web browser in gml form or saving the
29# original pdf document to view in an acrobat pdf reader.
30#-The related document file url.txt can be downloaded
31# in the same way if the user has included in the format
32# string for that page (ie search list) in the collection
33# configuration file [urllink] related documents [/urllink].
34# This will create a html link 'related documents' which
35# when clicked on will enable the user to save a file url.txt
36# containing the related documents for that particular document.
37#-This file has been written upon running the perl script
38# relationPDF.pl which will also amend the gml documents to
39# include the 'urllink' information enabling the user to
40# include the related document links in config file format
41# strings.
42#-So once this related doc info has been included the user
43# can download both the pdf file (doc.pdf) and the related
44# document file for that particluar document (url.txt).
45#-perl amend_pdf.pl doc.pdf is then run in the command line
46# modifying the pdf document to the desired effect.
47#bugs:
48#-cannot be run more than once on the same file
49#-doesn't make use of free objects (free objects
50# are quite unlikely though, I didn't come across
51# any while looking through pdfs)
52#-many other things I haven't bothered testing
53
54use Carp;
55use Getopt::Long;
56use PDF;
57
58$objects = 0;
59$offsets = 1;
60
61
62#for each command line argument add the outlines
63#contained in url.txt
64#foreach (@ARGV) {
65# create_outlines($_);
66#}
67
68if(defined $ARGV[0]){
69 $pdf_document = $ARGV[0];
70} else {
71 print STDERR "You must supply pdf document filename\n";
72 return;
73}
74if(defined $ARGV[1]){
75 $related_docs = $ARGV[1];
76} else {
77print STDERR "You must supply related document filename\n";
78return;
79}
80create_outlines($pdf_document);
81
82exit(1);
83
84
85#this function first checks to see is the file is indeed
86#a pdf file - if not then exits. If it is a pdf then it
87#checks to see if it is encrypted. If it is then exits.
88#Next we check if the outlines dictionary exists in the
89#document catalog.
90#-If the outline entry exists but there are no actual outlines
91# present then call add_outlines but pass on the existing
92# outline dictionary.
93#-otherwise call modify_outlines to modify the existing outline
94# structure to include new outlines.
95#If outlines dictionary doesn't exist call add_outlines.
96sub create_outlines {
97
98 my $filename = shift;
99
100 #parse pdf filename
101 my @filelist = split("/", $filename);
102 my $file = pop @filelist;
103
104 print STDERR "pdf file to be manipulated: $file\n";
105 print STDERR "pdf file to be saved: tmp_$file\n";
106
107 #make a copy of that file in this directory
108 print STDERR "cp $filename $file\n";
109 `cp $filename $file\n`;
110 `cp $filename tmp_$file\n`;
111
112 #create and parse a new 'pdf' object
113 my $PDFfile = PDF->new($file);
114
115 #if it is a pdf file
116 if ($PDFfile->{"Header"}){
117
118 if($PDFfile->IscryptPDF){
119 #an encrypted file so cannot continue with
120 #adding the related doc outlines to it
121 print STDERR "file \"$file\" is encrypted \n";
122 return;
123 }
124
125 #display some relevant information about the document
126 print STDERR "Author: ", $PDFfile->GetInfo ("Author"), "\n";
127 print STDERR "Title: ", $PDFfile->GetInfo ("Title"), "\n";
128 print STDERR "Subject: ", $PDFfile->GetInfo ("Subject"), "\n";
129 print STDERR "Updated: ", $PDFfile->{"Updated"}, "\n";
130
131 #if the pdf document already includes an outline dictionary
132 if (defined $PDFfile->{"Catalog"}{"/Outlines"}) {
133
134 #get the outline object from the indirect ref
135 my $outline_data = $PDFfile->GetObject($PDFfile->{"Catalog"}{"/Outlines"});
136 #obtain the number of existing outlines
137 $PDFfile->{"Outlines"}{"/Count"} = $outline_data->{"/Count"};
138 print STDERR "number of existing outlines: $PDFfile->{\"Outlines\"}{\"/Count\"}\n";
139
140 #this means that the pdf file had an outline dictionary but
141 #did not actually include any outlines.
142 if($PDFfile->{"Outlines"}{"/Count"} == 0){
143 #obtain object number of outline dictionary and pass to add_outlines
144 my $dictionary = split(/\s/, $PDFfile->{"Catalog"}{"/Outlines"});
145
146 add_outlines($PDFfile, $file, $dictionary);
147 } else {
148 #collect other outline data to pass to modify_outlines
149 $PDFfile->{"Outlines"}{"/First"} = $outline_data->{"/First"};
150 $PDFfile->{"Outlines"}{"/Last"} = $outline_data->{"/Last"};
151
152 #modify last outline entry
153 modify_outlines($outline_data, $PDFfile, $file);
154 }
155 } else { #there was no outline dictionary thus no outlines so add some
156 print STDERR "no bookmarks in \"$file\" \n";
157 add_outlines($PDFfile, $file, 0);
158 }
159 } else { #the file was not a pdf file
160 print STDERR "$file is not a pdf file!!\n";
161 }
162
163
164 #copy back the file we modified and the saved copy
165 my $copypath = join("/", @filelist);
166 print STDERR "cp $file $filename\n";
167 print STDERR "cp tmp_$file $copypath/tmp_$file\n";
168 `cp $file $filename\n`;
169 `cp tmp_$file $copypath/tmp_$file\n`;
170
171 #remove the copy of that file in this directory
172 #print STDERR "rm $file\n";
173 `rm $file\n`;
174 `rm tmp_$file\n`;
175}
176
177
178#add_outlines function recieves following params
179#PDF file information, filename of pdf file, outline dictionary
180#(if outline dict. is 0 that means it must be created).
181#First the function decides if we are to create new dictionary
182#or not. Next read in the url file. Open up the pdf document.
183#Modify the catalog so that it includes the outline dictionary
184#(this is done even if the catalog doesn't need modifying
185# so that the objects in the objects/offset table will not
186# be all mucked up when it comes time to write the xreference
187# table)
188#Then append outline dictionary to end of file (either modified
189#or created). Then append the top heirarchy related document
190#outline. Then the new file url bookmarks. Call xref_table
191#to write new xreference table. Call trailer to write new trailer.
192
193#table[0][0] - document catalog
194#table[0][1] - document catalog offset
195#table[1][0] - outline dictionary
196#table[1][1] - outline dictionary offset
197#table[2][0] - related document outline
198#table[2][1] - related document outline offset
199#table[3][0] etc - url outlines, read from file
200#table[3][1] etc - url outlines offset, read from file
201
202sub add_outlines (*\$){
203
204 my $PDFfile = shift;
205 my $file = shift;
206
207 my @table;
208 $table[1][$objects] = shift;
209
210 #read in file of url's
211 my @urls = read_file();
212 my $url_num = $#urls + 1;
213
214 #if outline dictionary was not present in catalog
215 if ($table[1][$objects] == 0){ #get new object number for new dictionary
216 $table[1][$objects] = $PDFfile->{"Trailer"}{"/Size"}; #outline dictionary to be created
217 $table[2][$objects] = $table[1][$objects] + 1; #related document outline
218 } else { #else get object number for related doc outline to be appended
219 $table[2][$objects] = $PDFfile->{"Trailer"}{"/Size"}; #related document outline
220 }
221
222 #open up pdf file for appending
223 open(FILE, ">> $file") or croak "can't open $file: $!";
224 binmode \*FILE;
225
226 #obtain object number and offset for the document catalog
227 $table[0][$objects] = split(/\s/, $PDFfile->{"Trailer"}{"/Root"});
228 $table[0][$offsets] = tell \*FILE;
229
230 #print the modified or original catalog back to the file (appended)
231 print FILE "$table[0][$objects] 0 obj
232";
233 print FILE "<<
234";
235 print FILE "/Pages $PDFfile->{\"Catalog\"}{\"/Pages\"}
236";
237 print FILE "/Outlines $table[1][$objects] 0 R
238"; #only line actually added
239 print FILE "/Type /Catalog
240";
241 print FILE "/DefaultGray $PDFfile->{\"Catalog\"}{\"/DefaultGray\"}
242"
243 if (defined ($PDFfile->{"Catalog"}{"/DefaultGray"}));
244 print FILE "/DefaultRGB $PDFfile->{\"Catalog\"}{\"/DefaultRGB\"}
245"
246 if (defined ($PDFfile->{"Catalog"}{"/DefaultRGB"}));
247 print FILE "/PageLabels $PDFfile->{\"Catalog\"}{\"/PageLabels\"}
248"
249 if (defined ($PDFfile->{"Catalog"}{"/PageLabels"}));
250 print FILE "/Names $PDFfile->{\"Catalog\"}{\"/Names\"}
251"
252 if (defined ($PDFfile->{"Catalog"}{"/Names"}));
253 print FILE "/Dests $PDFfile->{\"Catalog\"}{\"/Dests\"}
254"
255 if (defined ($PDFfile->{"Catalog"}{"/Dests"}));
256 print FILE "/ViewerPreferences $PDFfile->{\"Catalog\"}{\"/ViewerPreferences\"}
257"
258 if (defined ($PDFfile->{"Catalog"}{"/ViewerPreferences"}));
259 print FILE "/PageLayout $PDFfile->{\"Catalog\"}{\"/PageLayout\"}
260"
261 if (defined ($PDFfile->{"Catalog"}{"/PageLayout"}));
262 print FILE "/PageMode $PDFfile->{\"Catalog\"}{\"/PageMode\"}
263"
264 if (defined ($PDFfile->{"Catalog"}{"/PageMode"}));
265 print FILE "/Threads $PDFfile->{\"Catalog\"}{\"/Threads\"}
266"
267 if (defined ($PDFfile->{"Catalog"}{"/Threads"}));
268 print FILE "/OpenAction $PDFfile->{\"Catalog\"}{\"/OpenAction\"}
269"
270 if (defined ($PDFfile->{"Catalog"}{"/OpenAction"}));
271 print FILE "/URI $PDFfile->{\"Catalog\"}{\"/URI\"}
272"
273 if (defined ($PDFfile->{"Catalog"}{"/URI"}));
274 print FILE "/Acroform $PDFfile->{\"Catalog\"}{\"/Acroform\"}
275"
276 if (defined ($PDFfile->{"Catalog"}{"/Acroform"}));
277 print FILE "/StructTreeRoot $PDFfile->{\"Catalog\"}{\"/StructTreeRoot\"}
278"
279 if (defined ($PDFfile->{"Catalog"}{"/StructTreeRoot"}));
280 print FILE "/SpiderInfo $PDFfile->{\"Catalog\"}{\"/SpiderInfo\"}
281"
282 if (defined ($PDFfile->{"Catalog"}{"/SpiderInfo"}));
283 print FILE ">>
284";
285 print FILE "endobj
286";
287
288 #obtain offset for outline dictionary
289 $table[1][$offsets] = tell \*FILE;
290
291 #append newly created outline dictionary
292 print FILE "$table[1][$objects] 0 obj
293";
294 print FILE "<<
295";
296 print FILE "/Type /Outlines
297";
298 print FILE "/Count ", $url_num + 1, "
299";
300 print FILE "/First $table[2][$objects] 0 R
301";
302 print FILE "/Last $table[2][$objects] 0 R
303";
304 print FILE ">>
305";
306 print FILE "endobj
307";
308
309
310 #get the related document outline object num and offset
311 my $obj = $table[2][$objects];
312 $table[2][$offsets] = tell \*FILE;
313
314 #append the top heirarchy related document outline to file
315 print FILE "$table[2][$objects] 0 obj
316";
317 print FILE "<<
318";
319 print FILE "/Title (Related Documents)
320";
321 print FILE "/Parent $table[1][$objects] 0 R
322";
323 print FILE "/Count ", $url_num, "
324";
325 print FILE "/First ", $obj + 1, " 0 R
326";
327 print FILE "/Last ", $table[2][$objects] + $url_num, " 0 R
328";
329 print FILE ">>
330";
331 print FILE "endobj
332";
333
334 my $ind = 3;
335 $obj++;
336
337
338 #store the object nums and offsets of the new related document
339 #outlines and write them to the file (must be outline with
340 #an action eg go to specific url)
341 for $i ( 0 .. $#urls ) {
342 $table[$ind][$offsets] = tell \*FILE;
343 $table[$ind][$objects] = $obj;
344 print FILE "$obj 0 obj
345";
346 print FILE "<<
347";
348 print FILE "/Title ($urls[$i][0])
349";
350 print FILE "/Parent $table[2][$objects] 0 R
351";
352 print FILE "/Next ", $obj + 1, " 0 R
353" if (($obj + 1) <= ($table[2][$objects] + $url_num));
354 print FILE "/Prev ", $obj - 1, " 0 R
355" if(($obj - 1) != ($table[2][$objects]));
356 print FILE "/A << /Type /Action
357";
358 print FILE "/S /URI
359";
360 print FILE "/URI ($urls[$i][1])
361";
362 print FILE ">>
363";
364 print FILE ">>
365";
366 print FILE "endobj
367";
368 $obj++;
369 $ind++;
370 }
371
372
373 #append new X-reference table
374 my $xref_offset = tell \*FILE;
375 xref_table(\*FILE, $url_num, @table);
376
377 #print trailer
378 trailer(\*FILE, $PDFfile, $obj);
379 print FILE "$xref_offset
380";
381 print FILE "%%EOF
382";
383
384 #close FILE;
385}
386
387#modify_outlines function recieves following params
388#PDF file information, outline data, pdf filename
389#First the function obtains the object data for the
390#last outline entry. Next read in the url file.
391#Open up the pdf document.Then append the modified
392#outline dictionary to end of file. Next append the
393#modified last outline. Then append the
394#top heirarchy related document outline. Then the
395#new file url bookmarks. Call xref_table
396#to write new xreference table.
397#Call trailer to write new trailer.
398
399#table[0][0] - outline dictionary
400#table[0][1] - outline dictionary offset
401#table[1][0] - last outline
402#table[1][1] - last outline offset
403#table[2][0] - related document outline
404#table[2][1] - related document outline offset
405#table[3][0] etc - url outlines, read from file
406#table[3][1] etc - url outlines offset, read from file
407
408sub modify_outlines (*\$){
409
410 my ($outline_data, $PDFfile, $file) = @_;
411
412 #collect the data for the last outline (which must be modified)
413 $PDFfile->{"Outlines"}{"/Last"} = $outline_data->{"/Last"};
414 my $last_data = $PDFfile->GetObject($PDFfile->{"Outlines"}{"/Last"});
415
416 #read in file of url's
417 my @urls = read_file();
418 my $url_num = $#urls + 1;
419
420 my @table;
421
422 #This number is the number to use for the next created object
423 #eg the related doc heirarchy
424 $table[2][$objects] = $PDFfile->{"Trailer"}{"/Size"};
425
426 #open up the pdf file for appending
427 open(FILE, ">> $file") or croak "can't open $file: $!";
428 binmode \*FILE;
429
430 #store the object number and offset of the outline dictionary
431 $table[0][$objects] = split(/\s/, $PDFfile->{"Catalog"}{"/Outlines"});
432 $table[0][$offsets] = tell \*FILE;
433
434 #write the outline dictionary back to the file (appending)
435 print FILE "$table[0][$objects] 0 obj
436";
437 print FILE "<<
438";
439 print FILE "/Type /Outlines
440";
441 print FILE "/Count ", $PDFfile->{"Outlines"}{"/Count"} + 1, "
442"
443 if (defined ($PDFfile->{"Outlines"}{"/Count"}));
444 print FILE "/First $PDFfile->{\"Outlines\"}{\"/First\"}
445"
446 if (defined ($PDFfile->{"Outlines"}{"/First"}));
447 print FILE "/Last $table[2][$objects] 0 R
448";
449 print FILE ">>
450";
451 print FILE "endobj
452";
453
454 #store the last outline entry object num and file offset
455 my @last_entry = split(/\s/, $PDFfile->{"Outlines"}{"/Last"});
456 $table[1][$objects] = $last_entry[0];
457 $table[1][$offsets] = tell \*FILE;
458
459 #append modified last outline entry
460 print FILE "$table[1][$objects] 0 obj
461";
462 print FILE "<<
463";
464 print FILE "/Title $last_data->{\"/Title\"}
465";
466 print FILE "/Dest $last_data->{\"/Dest\"}
467" if(defined $last_data->{"/Dest"});
468 print FILE "/Parent $last_data->{\"/Parent\"}
469";
470 print FILE "/Prev $last_data->{\"/Prev\"}
471";
472 print FILE "/Next $table[2][$objects] 0 R
473";
474 print FILE "/First $last_data->{\"/First\"}
475" if(defined $last_data->{"/First"});
476 print FILE "/Last $last_data->{\"/Last\"}
477" if(defined $last_data->{"/Last"});
478 print FILE "/Count $last_data->{\"/Count\"}
479"if(defined $last_data->{"/Count"});
480 print FILE "/A $last_data->{\"/A\"}
481" if(defined $last_data->{"/A"});
482 print FILE "/SE $last_data->{\"/SE\"}
483" if(defined $last_data->{"/SE"});
484 print FILE ">>
485";
486 print FILE "endobj
487";
488
489 #store the object num and offset of the related
490 #document top level outline
491 my $obj = $table[2][$objects] + 1;
492 $table[2][$offsets] = tell \*FILE;
493
494 #append related document top level outline
495 print FILE "$table[2][$objects] 0 obj
496";
497 print FILE "<<
498";
499 print FILE "/Title (Related Documents)
500";
501 print FILE "/Parent $last_data->{\"/Parent\"}
502";
503 print FILE "/Count ", $url_num, "
504";
505 print FILE "/First $obj 0 R
506";
507 print FILE "/Last ", $table[2][$objects] + $url_num , " 0 R
508";
509 print FILE ">>
510";
511 print FILE "endobj
512";
513
514
515 my $ind = 3;
516
517 #store the object nums and offsets of the new related document
518 #outlines and write them to the file (must be outline with
519 #an action eg go to specific url)
520 for $i ( 0 .. $#urls ) {
521 $table[$ind][$objects] = $obj;
522 $table[$ind][$offsets] = tell \*FILE;
523 print FILE "$obj 0 obj
524";
525 print FILE "<<
526";
527 print FILE "/Title ($urls[$i][0])
528";
529 print FILE "/Parent $table[2][$objects] 0 R
530";
531 print FILE "/Next ", $obj + 1, " 0 R
532" if (($obj + 1) <= ($table[2][$objects] + $url_num));
533 print FILE "/Prev ", $obj - 1, " 0 R
534" if (($obj - 1) != ($table[2][$objects]));
535 print FILE "/A << /Type /Action
536";
537 print FILE "/S /URI
538";
539 print FILE "/URI ($urls[$i][1])
540";
541 print FILE ">>
542";
543 print FILE ">>
544";
545 print FILE "endobj
546";
547 $obj++;
548 $ind++;
549 }
550
551 #append new X-reference table
552 my $xref_offset = tell \*FILE;
553 xref_table(\*FILE, $url_num, @table);
554
555 #print trailer
556 trailer(\*FILE, $PDFfile, $obj);
557 print FILE "$xref_offset
558";
559 print FILE "%%EOF
560";
561
562 close FILE;
563
564}
565
566
567#This function reads a file 'url.txt' which contains
568#two columns of data in the following format:
569#related document title related document url
570#each array of title, url is stored in an array
571#(to obtain the title of the first related
572#document in the file)
573#eg table[1st related document][title]
574#(to obtain the url of the second related
575#document in the file)
576#eg table[2nd related document][url]
577#this table is then returned to the calling
578#function.
579sub read_file {
580
581 #create two-dimensional array for urls
582 my @urls;
583
584 #open up url.txt to start reading
585 open(URL, "$related_docs") or croak "Can't open $related_docs: $!";
586
587 my $counter = 0;
588
589 #read in url.txt and store into the arry
590 while (<URL>){
591 chomp;
592 if(/([^\t]*)\t(.*)/){
593 push @urls, [ split ]; #split on /s into arrays of arrays
594 }
595 }
596
597 close URL;
598
599 return (@urls);
600
601}
602
603
604#this function takes as parameters the filehandle
605#to the pdf document, the number of related documents
606#to this pdf doc and a table of object numbers and
607#their offsets. Using this information it writes
608#(appends) a new xreference table to the pdf
609#document.
610
611sub xref_table (*\$) {
612
613 my ($fd, $num, @table) = @_;
614 my $offset;
615
616 #print the new xref table (append to file)
617 print $fd "xref
618";
619 print $fd "0 1
620";
621 print $fd "0000000000 65535 f
622";
623 print $fd "$table[0][$objects] 1
624";
625 $offset = '0'x(10-length($table[0][$offsets])).$table[0][$offsets];
626 print $fd "$offset 00000 n
627";
628 print $fd "$table[1][$objects] 1
629";
630 $offset = '0'x(10-length($table[1][$offsets])).$table[1][$offsets];
631 print $fd "$offset 00000 n
632";
633 print $fd "$table[2][$objects] ", $num + 1, "
634";
635 for $i ( 2 .. ($num + 2) ) { #add 2 on because already written 2 to file
636 $offset = '0'x(10-length($table[$i][$offsets])).$table[$i][$offsets];
637 print $fd "$offset 00000 n
638";
639 }
640
641}
642
643
644#This function recieves as parameters the filehandle
645#to the pdf document, parsed information about the
646#document and the new size (last object number + 1)
647#of the pdf file. Using this information it appends
648#a new trailer to the end of the pdf document.
649
650sub trailer (*\$) {
651
652 my ($fd, $PDFfile, $new_size) = @_;
653
654 #append the new trailer to the end of the file
655 print $fd "trailer
656";
657 print $fd "<<
658";
659 print $fd "/Size ", $new_size, "
660";
661 print $fd "/Root $PDFfile->{\"Trailer\"}{\"/Root\"}
662";
663 print $fd "/Info $PDFfile->{\"Trailer\"}{\"/Info\"}
664"
665 if (defined ($PDFfile->{"Trailer"}{"/Info"}));
666 print $fd "/ID [$PDFfile->{\"Trailer\"}{\"/ID\"}[0]$PDFfile->{\"Trailer\"}{\"/ID\"}[1]]
667"
668 if (defined ($PDFfile->{"Trailer"}{"/ID"}));
669 print $fd "/Prev $PDFfile->{\"Last_XRef_Offset\"}
670";
671 print $fd "/Encrypt $PDFfile->{\"Trailer\"}{\"/Encrypt\"}
672"
673 if (defined ($PDFfile->{"Trailer"}{"/Encrypt"}));
674 print $fd ">>
675";
676 print $fd "startxref
677";
678
679}
680
681
682
Note: See TracBrowser for help on using the repository browser.