source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/DjVu.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to, and which previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

  • Property svn:executable set to *
File size: 12.4 KB
Line 
1#------------------------------------------------------------------------------
2# File: DjVu.pm
3#
4# Description: Read DjVu archive meta information
5#
6# Revisions: 09/25/2008 - P. Harvey Created
7#
8# References: 1) http://djvu.sourceforge.net/ (DjVu v3 specification, Nov 2005)
9# 2) http://www.djvu.org/
10#
11# Notes: DjVu files are recognized and the IFF structure is processed
12# by Image::ExifTool::AIFF
13#------------------------------------------------------------------------------
14
15package Image::ExifTool::DjVu;
16
17use strict;
18use vars qw($VERSION);
19use Image::ExifTool qw(:DataAccess :Utils);
20
21$VERSION = '1.06';
22
23sub ParseAnt($);
24sub ProcessAnt($$$);
25sub ProcessMeta($$$);
26sub ProcessBZZ($$$);
27
# Top-level DjVu tag table, keyed by 4-character chunk ID.  Chunks with a
# SubDirectory entry are handed to the named tag table for further processing.
%Image::ExifTool::DjVu::Main = (
    GROUPS => { 2 => 'Image' },
    NOTES => q{
 Information is extracted from the following chunks in DjVu images. See
 L<http://www.djvu.org/> for the DjVu specification.
 },
    # simple chunk: value is stored directly under this tag name
    INCL => 'IncludedFileID',
    INFO => {
        SubDirectory => {
            TagTable => 'Image::ExifTool::DjVu::Info',
        },
    },
    FORM => {
        # extract chunk type only, then descend into chunk
        TypeOnly     => 1,
        SubDirectory => {
            TagTable => 'Image::ExifTool::DjVu::Form',
        },
    },
    # plain annotation chunk
    ANTa => {
        SubDirectory => {
            TagTable => 'Image::ExifTool::DjVu::Ant',
        },
    },
    # annotation chunk that is run through ProcessBZZ() before the Ant table
    ANTz => {
        Name         => 'CompressedAnnotation',
        SubDirectory => {
            TagTable    => 'Image::ExifTool::DjVu::Ant',
            ProcessProc => \&ProcessBZZ,
        },
    },
);
54
# Binary layout of the DjVu INFO chunk.  Keys are byte offsets into the chunk;
# entries without an explicit Format use the table default (int8u).
%Image::ExifTool::DjVu::Info = (
    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
    GROUPS       => { 2 => 'Image' },
    FORMAT       => 'int8u',
    PRIORITY     => 0,  # first INFO block takes priority
    0 => { Name => 'ImageWidth',  Format => 'int16u' },
    2 => { Name => 'ImageHeight', Format => 'int16u' },
    4 => {
        Name        => 'DjVuVersion',
        Description => 'DjVu Version',
        Format      => 'int8u[2]',
        # the two bytes are joined as "second.first"; a single value (as with
        # version 0.16) is reported as "0.value"
        ValueConv   => '$val=~/(\d+) (\d+)/ ? "$2.$1" : "0.$val"',
    },
    6 => {
        Name      => 'SpatialResolution',
        Format    => 'int16u',
        # swap the two bytes of the value (stored little-endian!)
        ValueConv => '(($val & 0xff)<<8) + ($val>>8)',
    },
    8 => { Name => 'Gamma', ValueConv => '$val / 10' },
    9 => {
        Name      => 'Orientation',
        Mask      => 0x07,  # (upper 5 bits reserved)
        PrintConv => {
            1 => 'Horizontal (normal)',
            2 => 'Rotate 180',
            5 => 'Rotate 90 CW',
            6 => 'Rotate 270 CW',
        },
    },
);
96
# Tag table for the DjVu FORM chunk: the first 4 bytes give the form type,
# which is reported as SubfileType
%Image::ExifTool::DjVu::Form = (
    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
    GROUPS       => { 2 => 'Image' },
    0 => {
        Name      => 'SubfileType',
        Format    => 'undef[4]',
        Priority  => 0,
        # human-readable descriptions of the known form types
        PrintConv => {
            BM44 => 'Grayscale IW44',
            DJVI => 'Shared component',
            DJVM => 'Multi-page document',
            DJVU => 'Single-page image',
            PM44 => 'Color IW44',
            THUM => 'Thumbnail image',
        },
    },
);
115
# Tag table for the DjVu annotation chunk (ANTz or ANTa), keyed by the name
# of the annotation entry
%Image::ExifTool::DjVu::Ant = (
    PROCESS_PROC => \&Image::ExifTool::DjVu::ProcessAnt,
    GROUPS       => { 2 => 'Image' },
    NOTES        => 'Information extracted from annotation chunks.',
    # Note: For speed, ProcessAnt() pre-scans the data for the known tag ID's
    # listed here, so any tag added to this table must also be added to that
    # pre-scan check
    xmp => {
        Name         => 'XMP',
        SubDirectory => { TagTable => 'Image::ExifTool::XMP::Main' },
    },
    metadata => {
        SubDirectory => { TagTable => 'Image::ExifTool::DjVu::Meta' },
    },
);
131
# Tag table for entries of the DjVu annotation "metadata" list.  An empty
# hash means the tag is extracted with the table defaults.
%Image::ExifTool::DjVu::Meta = (
    PROCESS_PROC => \&Image::ExifTool::DjVu::ProcessMeta,
    GROUPS => { 1 => 'DjVu-Meta', 2 => 'Image' },
    NOTES => q{
 This table lists the standard DjVu metadata tags, but ExifTool will extract
 any tags that exist even if they don't appear here. The DjVu v3
 documentation endorses tags borrowed from two standards: 1) BibTeX
 bibliography system tags (all lowercase Tag ID's in the table below), and 2)
 PDF DocInfo tags (capitalized Tag ID's).
 },

    #
    # BibTeX tags (ref http://en.wikipedia.org/wiki/BibTeX)
    #
    'address'      => { Groups => { 2 => 'Location' } },
    'annote'       => { Name => 'Annotation' },
    'author'       => { Groups => { 2 => 'Author' } },
    'booktitle'    => { Name => 'BookTitle' },
    'chapter'      => { },
    'crossref'     => { Name => 'CrossRef' },
    'edition'      => { },
    'eprint'       => { Name => 'EPrint' },
    'howpublished' => { Name => 'HowPublished' },
    'institution'  => { },
    'journal'      => { },
    'key'          => { },
    'month'        => { Groups => { 2 => 'Time' } },
    'note'         => { },
    'number'       => { },
    'organization' => { },
    'pages'        => { },
    'publisher'    => { },
    'school'       => { },
    'series'       => { },
    'title'        => { },
    'type'         => { },
    'url'          => { Name => 'URL' },
    'volume'       => { },
    'year'         => { Groups => { 2 => 'Time' } },

    #
    # PDF tags (same as Image::ExifTool::PDF::Info)
    #
    'Title'    => { },
    'Author'   => { Groups => { 2 => 'Author' } },
    'Subject'  => { },
    'Keywords' => { },
    'Creator'  => { },
    'Producer' => { },
    'CreationDate' => {
        Name      => 'CreateDate',
        Groups    => { 2 => 'Time' },
        # RFC 3339 date/time format
        ValueConv => 'require Image::ExifTool::XMP; Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    'ModDate' => {
        Name      => 'ModifyDate',
        Groups    => { 2 => 'Time' },
        ValueConv => 'require Image::ExifTool::XMP; Image::ExifTool::XMP::ConvertXMPDate($val)',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    'Trapped' => {
        # remove leading '/' from '/True' or '/False'
        ValueConv => '$val=~s{^/}{}; $val',
    },
);
194
#------------------------------------------------------------------------------
# Parse DjVu annotation "s-expression" syntax (recursively)
# Inputs:  0) data ref (with pos($$dataPt) set to start of annotation)
# Returns: reference to list of tokens/references, or undef if no tokens,
#          and the position in $$dataPt is set to end of last token
# Notes:   The DjVu annotation syntax is not well documented, so a number of
#          assumptions are made here.
#          Escape sequences in quoted strings are decoded with a lookup table.
#          The text comes straight from the file and must NEVER be passed to
#          a string eval (doing so allowed arbitrary code execution via a
#          crafted annotation).
sub ParseAnt($)
{
    my $dataPt = shift;
    my (@toks, $tok, $more);
    # C escape sequences that are converted in quoted text
    my %esc = (
        a => "\a", b => "\b", f => "\f", n => "\n", r => "\r", t => "\t",
        v => "\x0b", '"' => '"', '\\' => '\\',
    );
    # (every single token must be parsed in order to properly scan through
    # the items, because list boundaries can't be found any other way)
Tok: for (;;) {
        # find the next token
        last unless $$dataPt =~ /(\S)/sg;   # get next non-space character
        if ($1 eq '(') {            # start of list
            $tok = ParseAnt($dataPt);
        } elsif ($1 eq ')') {       # end of list
            $more = 1;
            last;
        } elsif ($1 eq '"') {       # quoted string
            $tok = '';
            for (;;) {
                # get string up to the next quotation mark
                # (done with substr because /(.*?)"/sg fails in perl 5.6.2)
                my $pos = pos($$dataPt);
                # unterminated string: discard it and stop parsing
                last Tok unless $$dataPt =~ /"/sg;
                $tok .= substr($$dataPt, $pos, pos($$dataPt)-1-$pos);
                # we're good unless quote was escaped by odd number of
                # backslashes (\z, not $, so a backslash followed by a
                # trailing newline isn't mistaken for an escaped quote)
                last unless $tok =~ /(\\+)\z/ and length($1) & 0x01;
                $tok .= '"';    # quote is part of the string
            }
            # convert C escape sequences (allowed in quoted text): octal
            # "\ooo" and the single-character escapes listed in %esc.  Any
            # other escape is left untouched, backslash included.
            $tok =~ s{\\(?:([0-7]{1,3})|(.))}
                     {defined $1 ? chr(oct($1)) :
                      (exists $esc{$2} ? $esc{$2} : "\\$2")}sge;
        } else {                    # key name
            # step back so the key match includes the character just read
            pos($$dataPt) = pos($$dataPt) - 1;
            # allow anything in key but whitespace, braces and double quotes
            # (this is one of the assumptions mentioned above)
            $tok = $$dataPt =~ /([^\s()"]+)/sg ? $1 : undef;
        }
        push @toks, $tok if defined $tok;
    }
    # prevent further parsing unless more after this
    pos($$dataPt) = length $$dataPt unless $more;
    return @toks ? \@toks : undef;
}
246
#------------------------------------------------------------------------------
# Process DjVu annotation chunk (ANTa or decoded ANTz)
# Inputs:  0) ExifTool object reference, 1) DirInfo reference, 2) tag table ref
# Returns: 1 on success (including when there is nothing of interest),
#          0 if the annotation yields no tokens
sub ProcessAnt($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $buffRef = $$dirInfo{DataPt};

    # cheap check first: skip the full parse unless a metadata or XMP entry
    # appears to be present
    if ($$buffRef !~ /\(\s*(metadata|xmp)[\s("]/s) {
        return 1;
    }

    # build the token tree for the whole annotation, starting from the top
    pos($$buffRef) = 0;
    my $tree = ParseAnt($buffRef);
    return 0 unless $tree;

    # handle each top-level annotation entry in turn
    foreach my $entry (@$tree) {
        # an entry of interest is a list holding a tag ID plus at least one value
        next unless ref $entry eq 'ARRAY';
        next if @$entry < 2;
        my $id = shift @$entry;
        next if ref $id;
        next unless defined $$tagTablePtr{$id};
        my $value;
        if ($id eq 'metadata') {
            # ProcessMeta() takes array reference (the remaining entries)
            $value = $entry;
        } else {
            # only process simple values
            next if ref $$entry[0];
            $value = $$entry[0];
        }
        $et->HandleTag($tagTablePtr, $id, $value);
    }
    return 1;
}
279
#------------------------------------------------------------------------------
# Process DjVu metadata
# Inputs:  0) ExifTool object reference, 1) DirInfo reference, 2) tag table ref
# Returns: 1 on success, 0 if the input is not a list of entries
# Notes:   input dirInfo DataPt is a reference to a (reference to a) list of
#          pre-parsed metadata entries, each expected to be a [tag, value] list
sub ProcessMeta($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $listPt = $$dirInfo{DataPt};
    return 0 unless ref $$listPt eq 'ARRAY';
    my $entries = $$listPt;
    $et->VerboseDir('Metadata', scalar @$entries);

    my $sawInvalid;
    foreach my $pair (@$entries) {
        # make sure entry is a simple tag/value pair
        my $isSimple = (ref($pair) eq 'ARRAY' && @$pair >= 2 &&
                        !ref($$pair[0]) && !ref($$pair[1]));
        unless ($isSimple) {
            $sawInvalid = 1;
            next;
        }
        my ($id, $value) = @$pair[0, 1];
        # add any new tags to the table
        if (not $$tagTablePtr{$id}) {
            # tag name is the ID with illegal characters removed
            (my $name = $id) =~ tr/-_a-zA-Z0-9//dc;
            unless (length $name) {
                $sawInvalid = 1;
                next;
            }
            AddTagToTable($tagTablePtr, $id, { Name => ucfirst($name) });
        }
        $et->HandleTag($tagTablePtr, $id, $value);
    }
    # a single warning covers all rejected entries
    $et->Warn('Ignored invalid metadata entry(s)') if $sawInvalid;
    return 1;
}
308
#------------------------------------------------------------------------------
# Process BZZ-compressed data (in DjVu images)
# Inputs:  0) ExifTool object reference, 1) DirInfo reference, 2) tag table ref
# Returns: 1 on success (result of the table's process proc), 0 on decoding
#          error or if the table has no PROCESS_PROC
# Notes:   the DirInfo hash is updated in place to point at the decoded data
sub ProcessBZZ($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;

    require Image::ExifTool::BZZ;
    my $decoded = Image::ExifTool::BZZ::Decode($$dirInfo{DataPt});
    if (not defined $decoded) {
        $et->Warn("Error decoding $$dirInfo{DirName}");
        return 0;
    }

    # dump the decoded data in very verbose mode
    if ($et->Options('Verbose') >= 3) {
        $et->VerboseDir("Decoded $$dirInfo{DirName}", 0, length $decoded);
        $et->VerboseDump(\$decoded);
    }

    # from here on the directory refers to the decoded buffer
    my $size = length $decoded;
    $$dirInfo{DataPt}  = \$decoded;
    $$dirInfo{DirLen}  = $size;
    $$dirInfo{DataLen} = $size;

    # process the data using the default process proc for this table
    my $proc = $$tagTablePtr{PROCESS_PROC};
    return 0 unless $proc;
    return $proc->($et, $dirInfo, $tagTablePtr);
}
334
3351; # end
336
337__END__
338
339=head1 NAME
340
341Image::ExifTool::DjVu - Read DjVu meta information
342
343=head1 SYNOPSIS
344
345This module is used by Image::ExifTool
346
347=head1 DESCRIPTION
348
349This module contains definitions required by Image::ExifTool to extract meta
350information from DjVu images. Parsing of the DjVu IFF structure is done by
351Image::ExifTool::AIFF.
352
353=head1 AUTHOR
354
355Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
356
357This library is free software; you can redistribute it and/or modify it
358under the same terms as Perl itself.
359
360=head1 REFERENCES
361
362=over 4
363
364=item L<http://djvu.sourceforge.net/>
365
366=item L<http://www.djvu.org/>
367
368=back
369
370=head1 SEE ALSO
371
372L<Image::ExifTool::TagNames/DjVu Tags>,
373L<Image::ExifTool::AIFF(3pm)|Image::ExifTool::AIFF>,
374L<Image::ExifTool(3pm)|Image::ExifTool>
375
376=cut
377
Note: See TracBrowser for help on using the repository browser.