Context Navigation

WritePDF.pl@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago
Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.
Property svn:executable set to ``*
File size: 29.0 KB

Line
1	#------------------------------------------------------------------------------
2	# File: WritePDF.pl
3	#
4	# Description: Write PDF meta information
5	#
6	# Revisions: 12/08/2007 - P. Harvey Created
7	#
8	# References: 1) http://partners.adobe.com/public/developer/pdf/index_reference.html
9	#
10	# Notes: The special "PDF-update" group can be deleted to revert exiftool updates
11	#------------------------------------------------------------------------------
12	package Image::ExifTool::PDF;
13
14	use strict;
15	use vars qw($lastFetched);
16
17	sub WriteObject($$);
18	sub EncodeString($);
19	sub CryptObject($);
20
21	# comments to mark beginning and end of ExifTool incremental update
22	my $beginComment = '%BeginExifToolUpdate';
23	my $endComment = '%EndExifToolUpdate ';
24
25	my $keyExt; # crypt key extension
26	my $pdfVer; # version of PDF file we are currently writing
27
28	# internal tags used in dictionary objects
29	my %myDictTags = (
30	_tags => 1, _stream => 1, _decrypted => 1, _needCrypt => 1,
31	_filtered => 1, _entry_size => 1, _table => 1,
32	);
33
34	# map for directories that we can add
35	my %pdfMap = (
36	XMP => 'PDF',
37	);
38
39	#------------------------------------------------------------------------------
40	# Validate raw PDF values for writing (string date integer real boolean name)
41	# Inputs: 0) ExifTool object ref, 1) tagInfo hash ref, 2) raw value ref
42	# Returns: error string or undef (and possibly changes value) on success
43	sub CheckPDF($$$)
44	{
45	my ($et, $tagInfo, $valPtr) = @_;
46	my $format = $$tagInfo{Writable} \|\| $tagInfo->{Table}->{WRITABLE};
47	if (not $format) {
48	return 'No writable format';
49	} elsif ($format eq 'string') {
50	# (encode later because list-type string tags need to be encoded as a unit)
51	} elsif ($format eq 'date') {
52	# be flexible about this for now
53	return 'Bad date format' unless $$valPtr =~ /^\d{4}/;
54	} elsif ($format eq 'integer') {
55	return 'Not an integer' unless Image::ExifTool::IsInt($$valPtr);
56	} elsif ($format eq 'real') {
57	return 'Not a real number' unless $$valPtr =~ /^[+-]?(?=\d\|\.\d)\d(\.\d)?$/;
58	} elsif ($format eq 'boolean') {
59	$$valPtr = ($$valPtr and $$valPtr !~ /^f/i) ? 'true' : 'false';
60	} elsif ($format eq 'name') {
61	return 'Invalid PDF name' if $$valPtr =~ /\0/;
62	} else {
63	return "Invalid PDF format '${format}'";
64	}
65	return undef; # value is OK
66	}
67
68	#------------------------------------------------------------------------------
69	# Format value for writing to PDF file
70	# Inputs: 0) ExifTool ref, 1) value, 2) format string (string,date,integer,real,boolean,name)
71	# Returns: formatted value or undef on error
72	# Notes: Called at write time, so $pdfVer may be checked
73	sub WritePDFValue($$$)
74	{
75	my ($et, $val, $format) = @_;
76	if (not $format) {
77	return undef;
78	} elsif ($format eq 'string') {
79	# encode as UCS2 if it contains any special characters
80	$val = "\xfe\xff" . $et->Encode($val,'UCS2','MM') if $val =~ /[\x80-\xff]/;
81	EncodeString(\$val);
82	} elsif ($format eq 'date') {
83	# convert date to "D:YYYYmmddHHMMSS+-HH'MM'" format
84	$val =~ s/([-+]\d{2}):(\d{2})/${1}'${2}'/; # change timezone delimiters if necessary
85	$val =~ tr/ ://d; # remove spaces and colons
86	$val = "D:$val"; # add leading "D:"
87	EncodeString(\$val);
88	} elsif ($format =~ /^(integer\|real\|boolean)$/) {
89	# no reformatting necessary
90	} elsif ($format eq 'name') {
91	return undef if $val =~ /\0/;
92	if ($pdfVer >= 1.2) {
93	$val =~ s/([\t\n\f\r ()<>[\]{}\/%#])/sprintf('#%.2x',ord $1)/sge;
94	} else {
95	return undef if $val =~ /[\t\n\f\r ()<>[\]{}\/%]/;
96	}
97	$val = "/$val"; # add leading '/'
98	} else {
99	return undef;
100	}
101	return $val;
102	}
103
104	#------------------------------------------------------------------------------
105	# Encode PDF string
106	# Inputs: 0) reference to PDF string
107	# Returns: (updates string with encoded data)
108	sub EncodeString($)
109	{
110	my $strPt = shift;
111	if (ref $$strPt eq 'ARRAY') {
112	my $str;
113	foreach $str (@{$$strPt}) {
114	EncodeString(\$str);
115	}
116	return;
117	}
118	Crypt($strPt, $keyExt, 1); # encrypt if necessary
119	# encode as hex if we have any control characters (except tab)
120	if ($$strPt=~/[\x00-\x08\x0a-\x1f\x7f\xff]/) {
121	# encode as hex
122	my $str='';
123	my $len = length $$strPt;
124	my $i = 0;
125	for (;;) {
126	my $n = $len - $i or last;
127	$n = 40 if $n > 40; # break into reasonable-length lines
128	$str .= $/ if $i;
129	$str .= unpack('H*', substr($$strPt, $i, $n));
130	$i += $n;
131	}
132	$$strPt = "<$str>";
133	} else {
134	$$strPt =~ s/([()\\])/\\$1/g; # must escape round brackets and backslashes
135	$$strPt = "($$strPt)";
136	}
137	}
138
139	#------------------------------------------------------------------------------
140	# Encrypt an object
141	# Inputs: 0) PDF object (encrypts in place)
142	# Notes: Encrypts according to "_needCrypt" dictionary entry,
143	# then deletes "_needCrypt" when done
144	sub CryptObject($)
145	{
146	my $obj = $_[0];
147	if (not ref $obj) {
148	# only literal strings and hex strings are encrypted
149	if ($obj =~ /^[(<]/) {
150	undef $lastFetched; # (reset this just in case)
151	my $val = ReadPDFValue($obj);
152	EncodeString(\$val);
153	$_[0] = $val;
154	}
155	} elsif (ref $obj eq 'HASH') {
156	my $tag;
157	my $needCrypt = $$obj{_needCrypt};
158	foreach $tag (keys %$obj) {
159	next if $myDictTags{$tag};
160	# re-encrypt necessary objects only (others are still encrypted)
161	# (this is really annoying, but is necessary because objects stored
162	# in encrypted streams are decrypted when extracting, but strings stored
163	# as direct objects are decrypted later since they must be decoded
164	# before being decrypted)
165	if ($needCrypt) {
166	next unless defined $$needCrypt{$tag} ? $$needCrypt{$tag} : $$needCrypt{'*'};
167	}
168	CryptObject($$obj{$tag});
169	}
170	delete $$obj{_needCrypt}; # avoid re-re-crypting
171	} elsif (ref $obj eq 'ARRAY') {
172	my $val;
173	foreach $val (@$obj) {
174	CryptObject($val);
175	}
176	}
177	}
178
179	#------------------------------------------------------------------------------
180	# Get free entries from xref stream dictionary that we wrote previously
181	# Inputs: 0) xref dictionary reference
182	# Returns: free entry hash (keys are object numbers, values are xref entry list refs)
183	sub GetFreeEntries($)
184	{
185	my $dict = shift;
186	my %xrefFree;
187	# from the start we have only written xref stream entries in 'CNn' format,
188	# so we can simplify things for now and only support this type of entry
189	my $w = $$dict{W};
190	if (ref $w eq 'ARRAY' and "@$w" eq '1 4 2') {
191	my $size = $$dict{_entry_size}; # this will be 7 for 'CNn'
192	my $index = $$dict{Index};
193	my $len = length $$dict{_stream};
194	# scan the table for free objects
195	my $num = scalar(@$index) / 2;
196	my $pos = 0;
197	my ($i, $j);
198	for ($i=0; $i<$num; ++$i) {
199	my $start = $$index[$i*2];
200	my $count = $$index[$i*2+1];
201	for ($j=0; $j<$count; ++$j) {
202	last if $pos + $size > $len;
203	my @t = unpack("x$pos CNn", $$dict{_stream});
204	# add entry if object was free
205	$xrefFree{$start+$j} = [ $t[1], $t[2], 'f' ] if $t[0] == 0;
206	$pos += $size; # step to next entry
207	}
208	}
209	}
210	return \%xrefFree;
211	}
212
213	#------------------------------------------------------------------------------
214	# Write PDF object
215	# Inputs: 0) output file or scalar ref, 1) PDF object
216	# Returns: true on success
217	# Notes: inserts white space before object, but none afterward
218	sub WriteObject($$)
219	{
220	my ($outfile, $obj) = @_;
221	if (ref $obj eq 'SCALAR') {
222	Write($outfile, ' ', $$obj) or return 0;
223	} elsif (ref $obj eq 'ARRAY') {
224	# write array
225	Write($outfile, @$obj > 10 ? $/ : ' ', '[') or return 0;
226	my $item;
227	foreach $item (@$obj) {
228	WriteObject($outfile, $item) or return 0;
229	}
230	Write($outfile, ' ]') or return 0;
231	} elsif (ref $obj eq 'HASH') {
232	# write dictionary
233	my $tag;
234	Write($outfile, $/, '<<') or return 0;
235	# prepare object as required if it has a stream
236	if ($$obj{_stream}) {
237	# encrypt stream if necessary (must be done before determining Length)
238	CryptStream($obj, $keyExt) if $$obj{_decrypted};
239	# write "Length" entry in dictionary
240	$$obj{Length} = length $$obj{_stream};
241	push @{$$obj{_tags}}, 'Length';
242	# delete Filter-related entries since we don't yet write filtered streams
243	delete $$obj{Filter};
244	delete $$obj{DecodeParms};
245	delete $$obj{DL};
246	}
247	# don't write my internal entries
248	my %wrote = %myDictTags;
249	# write tags in original order, adding new ones later alphabetically
250	foreach $tag (@{$$obj{_tags}}, sort keys %$obj) {
251	# ignore already-written or missing entries
252	next if $wrote{$tag} or not defined $$obj{$tag};
253	Write($outfile, $/, "/$tag") or return 0;
254	WriteObject($outfile, $$obj{$tag}) or return 0;
255	$wrote{$tag} = 1;
256	}
257	Write($outfile, $/, '>>') or return 0;
258	if ($$obj{_stream}) {
259	# write object stream
260	# (a single 0x0d may not follow 'stream', so use 0x0d+0x0a here to be sure)
261	Write($outfile, $/, "stream\x0d\x0a") or return 0;
262	Write($outfile, $$obj{_stream}, $/, 'endstream') or return 0;
263	}
264	} else {
265	# write string, number, name or object reference
266	Write($outfile, ' ', $obj);
267	}
268	return 1;
269	}
270
271	#------------------------------------------------------------------------------
272	# Write PDF File
273	# Inputs: 0) ExifTool object reference, 1) dirInfo reference
274	# Returns: 1 on success, 0 if not valid PDF file, -1 on write error
275	# Notes: dictionary structure: Main --+--> Info
276	# +--> Root --> Metadata
277	sub WritePDF($$)
278	{
279	my ($et, $dirInfo) = @_;
280	my $raf = $$dirInfo{RAF};
281	my $outfile = $$dirInfo{OutFile};
282	my ($buff, %capture, %newXRef, %newObj, $objRef);
283	my ($out, $id, $gen, $obj);
284
285	# make sure this is a PDF file
286	my $pos = $raf->Tell();
287	$raf->Read($buff, 1024) >= 8 or return 0;
288	$buff =~ /^(\s*)%PDF-(\d+\.\d+)/ or return 0;
289	$$et{PDFBase} = length $1;
290	$raf->Seek($pos, 0);
291
292	# create a new ExifTool object and use it to read PDF and XMP information
293	my $newTool = new Image::ExifTool;
294	$newTool->Options(List => 1);
295	$newTool->Options(Password => $et->Options('Password'));
296	$$newTool{PDF_CAPTURE} = \%capture;
297	my $info = $newTool->ImageInfo($raf, 'XMP', 'PDF:*', 'Error', 'Warning');
298	# not a valid PDF file unless we got a version number
299	# (note: can't just check $$info{PDFVersion} due to possibility of XMP-pdf:PDFVersion)
300	my $vers = $newTool->GetInfo('PDF:PDFVersion');
301	# take highest version number if multiple versions in an incremental save
302	($pdfVer) = sort { $b <=> $a } values %$vers;
303	$pdfVer or $et->Error('Missing PDF:PDFVersion'), return 0;
304	# check version number
305	if ($pdfVer > 1.7) {
306	$et->Warn("The PDF $pdfVer specification is not freely available", 1);
307	# (so writing by ExifTool is based on trial and error)
308	}
309	# fail if we had any serious errors while extracting information
310	if ($capture{Error} or $$info{Error}) {
311	$et->Error($capture{Error} \|\| $$info{Error});
312	return 1;
313	}
314	# make sure we have everything we need to rewrite this file
315	foreach $obj (qw(Main Root xref)) {
316	next if $capture{$obj};
317	# any warning we received may give a clue about why this object is missing
318	$et->Error($$info{Warning}) if $$info{Warning};
319	$et->Error("Can't find $obj object");
320	return 1;
321	}
322	$et->InitWriteDirs(\%pdfMap, 'XMP');
323
324	# copy file up to start of previous exiftool update or end of file
325	# (comment, startxref & EOF with 11-digit offsets and 2-byte newlines is 63 bytes)
326	$raf->Seek(-64,2) and $raf->Read($buff,64) and $raf->Seek(0,0) or return -1;
327	my $rtn = 1;
328	my $prevUpdate;
329	# (now $endComment is before "startxref", but pre-7.41 we wrote it after the EOF)
330	if ($buff =~ /$endComment(\d+)\s+(startxref\s+\d+\s+%%EOF\s+)?$/s) {
331	$prevUpdate = $1;
332	# rewrite the file up to the original EOF
333	Image::ExifTool::CopyBlock($raf, $outfile, $prevUpdate + $$et{PDFBase}) or $rtn = -1;
334	# verify that we are now at the start of an ExifTool update
335	unless ($raf->Read($buff, length $beginComment) and $buff eq $beginComment) {
336	$et->Error('Previous ExifTool update is corrupted');
337	return $rtn;
338	}
339	$raf->Seek($prevUpdate+$$et{PDFBase}, 0) or $rtn = -1;
340	if ($$et{DEL_GROUP}{'PDF-update'}) {
341	$et->VPrint(0, " Reverted previous ExifTool updates\n");
342	++$$et{CHANGED};
343	return $rtn;
344	}
345	} elsif ($$et{DEL_GROUP}{'PDF-update'}) {
346	$et->Error('File contains no previous ExifTool update');
347	return $rtn;
348	} else {
349	# rewrite the whole file
350	while ($raf->Read($buff, 65536)) {
351	Write($outfile, $buff) or $rtn = -1;
352	}
353	}
354	$out = $et->Options('TextOut') if $et->Options('Verbose');
355	#
356	# create our new PDF objects to write
357	#
358	my $xref = $capture{xref};
359	my $mainDict = $capture{Main};
360	my $metaRef = $capture{Root}->{Metadata};
361	my $nextObject;
362
363	# start by finding reference for info object in case it was deleted
364	# in a previous edit so we can re-use it here if adding PDF Info
365	my $prevInfoRef;
366	if ($prevUpdate) {
367	unless ($capture{Prev}) {
368	$et->Error("Can't locate trailer dictionary prior to last edit");
369	return $rtn;
370	}
371	$prevInfoRef = $capture{Prev}->{Info};
372	# start from previous size so the xref table doesn't continue
373	# to grow if we repeatedly add and delete the Metadata object
374	$nextObject = $capture{Prev}->{Size};
375	# don't re-use Meta reference if object was added in a previous update
376	undef $metaRef if $metaRef and $$metaRef=~/^(\d+)/ and $1 >= $nextObject;
377	} else {
378	$prevInfoRef = $$mainDict{Info};
379	$nextObject = $$mainDict{Size};
380	}
381
382	# delete entire PDF group if specified
383	my $infoChanged = 0;
384	if ($$et{DEL_GROUP}{PDF} and $capture{Info}) {
385	delete $capture{Info};
386	$info = { XMP => $$info{XMP} }; # remove extracted PDF tags
387	print $out " Deleting PDF Info dictionary\n" if $out;
388	++$infoChanged;
389	}
390
391	# create new Info dictionary if necessary
392	$capture{Info} = { _tags => [ ] } unless $capture{Info};
393	my $infoDict = $capture{Info};
394
395	# must pre-determine Info reference to be used in encryption
396	my $infoRef = $prevInfoRef \|\| \ "$nextObject 0 R";
397	$keyExt = $$infoRef;
398
399	# must encrypt all values in dictionary if they came from an encrypted stream
400	CryptObject($infoDict) if $$infoDict{_needCrypt};
401
402	# must set line separator before calling WritePDFValue()
403	local $/ = $capture{newline};
404
405	# rewrite PDF Info tags
406	my $newTags = $et->GetNewTagInfoHash(\%Image::ExifTool::PDF::Info);
407	my $tagID;
408	foreach $tagID (sort keys %$newTags) {
409	my $tagInfo = $$newTags{$tagID};
410	my $nvHash = $et->GetNewValueHash($tagInfo);
411	my (@vals, $deleted);
412	my $tag = $$tagInfo{Name};
413	my $val = $$info{$tag};
414	my $tagKey = $tag;
415	unless (defined $val) {
416	# must check for tag key with copy number
417	($tagKey) = grep /^$tag/, keys %$info;
418	$val = $$info{$tagKey} if $tagKey;
419	}
420	if (defined $val) {
421	my @oldVals;
422	if (ref $val eq 'ARRAY') {
423	@oldVals = @$val;
424	$val = shift @oldVals;
425	}
426	for (;;) {
427	if ($et->IsOverwriting($nvHash, $val) > 0) {
428	$deleted = 1;
429	$et->VerboseValue("- PDF:$tag", $val);
430	++$infoChanged;
431	} else {
432	push @vals, $val;
433	}
434	last unless @oldVals;
435	$val = shift @oldVals;
436	}
437	# don't write this out if we deleted all values
438	delete $$infoDict{$tagID} unless @vals;
439	} elsif ($$nvHash{EditOnly}) {
440	next;
441	}
442	# decide whether we want to write this tag
443	# (native PDF information is always preferred, so don't check IsCreating)
444	next unless $deleted or $$tagInfo{List} or not exists $$infoDict{$tagID};
445
446	# add new values to existing ones
447	my @newVals = $et->GetNewValue($nvHash);
448	if (@newVals) {
449	push @vals, @newVals;
450	++$infoChanged;
451	if ($out) {
452	foreach $val (@newVals) {
453	$et->VerboseValue("+ PDF:$tag", $val);
454	}
455	}
456	}
457	unless (@vals) {
458	# remove this entry from the Info dictionary if no values remain
459	delete $$infoDict{$tagID};
460	next;
461	}
462	# format value(s) for writing to PDF file
463	my $writable = $$tagInfo{Writable} \|\| $Image::ExifTool::PDF::Info{WRITABLE};
464	if (not $$tagInfo{List}) {
465	$val = WritePDFValue($et, shift(@vals), $writable);
466	} elsif ($$tagInfo{List} eq 'array') {
467	foreach $val (@vals) {
468	$val = WritePDFValue($et, $val, $writable);
469	defined $val or undef(@vals), last;
470	}
471	$val = @vals ? \@vals : undef;
472	} else {
473	$val = WritePDFValue($et, join($et->Options('ListSep'), @vals), $writable);
474	}
475	if (defined $val) {
476	$$infoDict{$tagID} = $val;
477	++$infoChanged;
478	} else {
479	$et->Warn("Error converting $$tagInfo{Name} value");
480	}
481	}
482	if ($infoChanged) {
483	$$et{CHANGED} += $infoChanged;
484	} elsif ($prevUpdate) {
485	# must still write Info dictionary if it was previously updated
486	my $oldPos = LocateObject($xref, $$infoRef);
487	$infoChanged = 1 if $oldPos and $oldPos > $prevUpdate;
488	}
489
490	# create new Info dictionary if necessary
491	if ($infoChanged) {
492	# increment object count if we used a new object here
493	if (scalar(keys %{$capture{Info}}) > 1) {
494	$newObj{$$infoRef} = $capture{Info};# save to write later
495	$$mainDict{Info} = $infoRef; # add reference to trailer dictionary
496	++$nextObject unless $prevInfoRef;
497	} else {
498	# remove Info from Main (trailer) dictionary
499	delete $$mainDict{Info};
500	# write free entry in xref table if Info existed prior to all edits
501	$newObj{$$infoRef} = '' if $prevInfoRef;
502	}
503	}
504
505	# rewrite XMP
506	my %xmpInfo = (
507	DataPt => $$info{XMP},
508	Parent => 'PDF',
509	);
510	my $xmpTable = Image::ExifTool::GetTagTable('Image::ExifTool::XMP::Main');
511	my $oldChanged = $$et{CHANGED};
512	my $newXMP = $et->WriteDirectory(\%xmpInfo, $xmpTable);
513	$newXMP = $$info{XMP} ? ${$$info{XMP}} : '' unless defined $newXMP;
514
515	# WriteDirectory() will increment CHANGED erroneously if non-existent
516	# XMP is deleted as a block -- so check for this
517	unless ($newXMP or $$info{XMP}) {
518	$$et{CHANGED} = $oldChanged;
519	$et->VPrint(0, " (XMP not changed -- still empty)\n");
520	}
521	my ($metaChanged, $rootChanged);
522
523	if ($$et{CHANGED} != $oldChanged and defined $newXMP) {
524	$metaChanged = 1;
525	} elsif ($prevUpdate and $capture{Root}->{Metadata}) {
526	# must still write Metadata dictionary if it was previously updated
527	my $oldPos = LocateObject($xref, ${$capture{Root}->{Metadata}});
528	$metaChanged = 1 if $oldPos and $oldPos > $prevUpdate;
529	}
530	if ($metaChanged) {
531	if ($newXMP) {
532	unless ($metaRef) {
533	# allocate new PDF object
534	$metaRef = \ "$nextObject 0 R";
535	++$nextObject;
536	$capture{Root}->{Metadata} = $metaRef;
537	$rootChanged = 1; # set flag to replace Root dictionary
538	}
539	# create the new metadata dictionary to write later
540	$newObj{$$metaRef} = {
541	Type => '/Metadata',
542	Subtype => '/XML',
543	# Length => length $newXMP, (set by WriteObject)
544	_tags => [ qw(Type Subtype) ],
545	_stream => $newXMP,
546	_decrypted => 1, # (this will be ignored if EncryptMetadata is false)
547	};
548	} elsif ($capture{Root}->{Metadata}) {
549	# free existing metadata object
550	$newObj{${$capture{Root}->{Metadata}}} = '';
551	delete $capture{Root}->{Metadata};
552	$rootChanged = 1; # set flag to replace Root dictionary
553	}
554	}
555	# add new Root dictionary if necessary
556	my $rootRef = $$mainDict{Root};
557	unless ($rootRef) {
558	$et->Error("Can't find Root dictionary");
559	return $rtn;
560	}
561	if (not $rootChanged and $prevUpdate) {
562	# must still write Root dictionary if it was previously updated
563	my $oldPos = LocateObject($xref, $$rootRef);
564	$rootChanged = 1 if $oldPos and $oldPos > $prevUpdate;
565	}
566	$newObj{$$rootRef} = $capture{Root} if $rootChanged;
567	#
568	# write incremental update if anything was changed
569	#
570	if ($$et{CHANGED}) {
571	# remember position of original EOF
572	my $oldEOF = Tell($outfile) - $$et{PDFBase};
573	Write($outfile, $beginComment) or $rtn = -1;
574
575	# write new objects
576	foreach $objRef (sort keys %newObj) {
577	$objRef =~ /^(\d+) (\d+)/ or $rtn = -1, last;
578	($id, $gen) = ($1, $2);
579	if (not $newObj{$objRef}) {
580	++$gen if $gen < 65535;
581	# write free entry in xref table
582	$newXRef{$id} = [ 0, $gen, 'f' ];
583	next;
584	}
585	# create new entry for xref table
586	$newXRef{$id} = [ Tell($outfile) - $$et{PDFBase} + length($/), $gen, 'n' ];
587	$keyExt = "$id $gen obj"; # (must set for stream encryption)
588	Write($outfile, $/, $keyExt) or $rtn = -1;
589	WriteObject($outfile, $newObj{$objRef}) or $rtn = -1;
590	Write($outfile, $/, 'endobj') or $rtn = -1;
591	}
592
593	# Prev points to old xref table
594	$$mainDict{Prev} = $capture{startxref} unless $prevUpdate;
595
596	# add xref entry for head of free-object list
597	$newXRef{0} = [ 0, 65535, 'f' ];
598
599	# must insert free xref entries from previous exiftool update if applicable
600	if ($prevUpdate) {
601	my $mainFree;
602	# extract free entries from our previous Main xref stream
603	if ($$mainDict{Type} and $$mainDict{Type} eq '/XRef') {
604	$mainFree = GetFreeEntries($xref->{dicts}->[0]);
605	} else {
606	# free entries from Main xref table already captured for us
607	$mainFree = $capture{mainFree};
608	}
609	foreach $id (sort { $a <=> $b } keys %$mainFree) {
610	$newXRef{$id} = $$mainFree{$id} unless $newXRef{$id};
611	}
612	}
613
614	# connect linked list of free object in our xref table
615	my $prevFree = 0;
616	foreach $id (sort { $b <=> $a } keys %newXRef) { # (reverse sort)
617	next unless $newXRef{$id}->[2] eq 'f'; # skip if not free
618	# no need to add free entry for objects added by us
619	# in previous edits then freed again
620	if ($id >= $nextObject) {
621	delete $newXRef{$id}; # Note: deleting newXRef entry!
622	next;
623	}
624	$newXRef{$id}->[0] = $prevFree;
625	$prevFree = $id;
626	}
627
628	# prepare our main dictionary for writing
629	$$mainDict{Size} = $nextObject; # update number of objects
630	# must change the ID if it exists
631	if (ref $$mainDict{ID} eq 'ARRAY' and @{$$mainDict{ID}} > 1) {
632	# increment first byte since this is an easy change to make
633	$id = $mainDict->{ID}->[1];
634	if ($id =~ /^<([0-9a-f]{2})/i) {
635	my $byte = unpack('H2',chr((hex($1) + 1) & 0xff));
636	substr($id, 1, 2) = $byte;
637	} elsif ($id =~ /^\((.)/s and $1 ne '\\' and $1 ne ')' and $1 ne '(') {
638	my $ch = chr((ord($1) + 1) & 0xff);
639	# avoid generating characters that could cause problems
640	$ch = 'a' if $ch =~ /[()\\\x00-\x08\x0a-\x1f\x7f\xff]/;
641	substr($id, 1, 1) = $ch;
642	}
643	$mainDict->{ID}->[1] = $id;
644	}
645
646	# remember position of xref table in file (we will write this next)
647	my $startxref = Tell($outfile) - $$et{PDFBase} + length($/);
648
649	# must write xref as a stream in xref-stream-only files
650	if ($$mainDict{Type} and $$mainDict{Type} eq '/XRef') {
651
652	# create entry for the xref stream object itself
653	$newXRef{$nextObject++} = [ Tell($outfile) - $$et{PDFBase} + length($/), 0, 'n' ];
654	$$mainDict{Size} = $nextObject;
655	# create xref stream and Index entry
656	$$mainDict{W} = [ 1, 4, 2 ]; # int8u, int32u, int16u ('CNn')
657	$$mainDict{Index} = [ ];
658	$$mainDict{_stream} = '';
659	my @ids = sort { $a <=> $b } keys %newXRef;
660	while (@ids) {
661	my $startID = $ids[0];
662	for (;;) {
663	$id = shift @ids;
664	my ($pos, $gen, $type) = @{$newXRef{$id}};
665	if ($pos > 0xffffffff) {
666	$et->Error('Huge files not yet supported');
667	last;
668	}
669	$$mainDict{_stream} .= pack('CNn', $type eq 'f' ? 0 : 1, $pos, $gen);
670	last if not @ids or $ids[0] != $id + 1;
671	}
672	# add Index entries for this section of the xref stream
673	push @{$$mainDict{Index}}, $startID, $id - $startID + 1;
674	}
675	# write the xref stream object
676	$keyExt = "$id 0 obj"; # (set anyway, but xref stream should NOT be encrypted)
677	Write($outfile, $/, $keyExt) or $rtn = -1;
678	WriteObject($outfile, $mainDict) or $rtn = -1;
679	Write($outfile, $/, 'endobj') or $rtn = -1;
680
681	} else {
682
683	# write new xref table
684	Write($outfile, $/, 'xref', $/) or $rtn = -1;
685	# lines must be exactly 20 bytes, so pad newline if necessary
686	my $endl = (length($/) == 1 ? ' ' : '') . $/;
687	my @ids = sort { $a <=> $b } keys %newXRef;
688	while (@ids) {
689	my $startID = $ids[0];
690	$buff = '';
691	for (;;) {
692	$id = shift @ids;
693	$buff .= sprintf("%.10d %.5d %s%s", @{$newXRef{$id}}, $endl);
694	last if not @ids or $ids[0] != $id + 1;
695	}
696	# write this (contiguous-numbered object) section of the xref table
697	Write($outfile, $startID, ' ', $id - $startID + 1, $/, $buff) or $rtn = -1;
698	}
699
700	# write main (trailer) dictionary
701	Write($outfile, 'trailer') or $rtn = -1;
702	WriteObject($outfile, $mainDict) or $rtn = -1;
703	}
704	# write trailing comment (marker to allow edits to be reverted)
705	Write($outfile, $/, $endComment, $oldEOF, $/) or $rtn = -1;
706
707	# write pointer to main xref table and EOF marker
708	Write($outfile, 'startxref', $/, $startxref, $/, '%%EOF', $/) or $rtn = -1;
709
710	} elsif ($prevUpdate) {
711
712	# nothing new changed, so copy over previous incremental update
713	$raf->Seek($prevUpdate+$$et{PDFBase}, 0) or $rtn = -1;
714	while ($raf->Read($buff, 65536)) {
715	Write($outfile, $buff) or $rtn = -1;
716	}
717	}
718	if ($rtn > 0 and $$et{CHANGED} and ($$et{DEL_GROUP}{PDF} or $$et{DEL_GROUP}{XMP})) {
719	$et->Warn('ExifTool PDF edits are reversible. Deleted tags may be recovered!', 1);
720	}
721	undef $newTool;
722	undef %capture;
723	return $rtn;
724	}
725
726
727	1; # end
728
729	__END__
730
731	=head1 NAME
732
733	Image::ExifTool::WritePDF.pl - Write PDF meta information
734
735	=head1 SYNOPSIS
736
737	These routines are autoloaded by Image::ExifTool::PDF.
738
739	=head1 DESCRIPTION
740
741	This file contains routines to write PDF metadata.
742
743	=head1 NOTES
744
745	When writing a PDF, exiftool does not modify the existing data. Instead,
746	the PDF file is appended with an incremental update which can easily be
747	removed to revert the file (by using ExifTool to delete the special
748	C<PDF-update> pseudo group).
749
750	=head1 AUTHOR
751
752	Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
753
754	This library is free software; you can redistribute it and/or modify it
755	under the same terms as Perl itself.
756
757	=head1 REFERENCES
758
759	=over 4
760
761	=item L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
762
763	=back
764
765	=head1 SEE ALSO
766
767	L<Image::ExifTool::PDF(3pm)\|Image::ExifTool::PDF>,
768	L<Image::ExifTool(3pm)\|Image::ExifTool>
769
770	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/WritePDF.pl@ 34921

Download in other formats: