source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/OOXML.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. Kathy and Dr Bainbridge also came up with a new option, which I added, called apply_join_before_split_to_metafields: a regex listing the metadata fields to which join_before_split is applied, where previously it was always applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, since it stores by word instead of by phrase. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

  • Property svn:executable set to *
File size: 13.6 KB
Line 
1#------------------------------------------------------------------------------
2# File: OOXML.pm
3#
4# Description: Read Office Open XML+ZIP files
5#
6# Revisions: 2009/10/31 - P. Harvey Created
7#------------------------------------------------------------------------------
8
9package Image::ExifTool::OOXML;
10
11use strict;
12use vars qw($VERSION);
13use Image::ExifTool qw(:DataAccess :Utils);
14use Image::ExifTool::XMP;
15use Image::ExifTool::ZIP;
16
17$VERSION = '1.08';
18
19# test for recognized OOXML document extensions
20my %isOOXML = (
21 DOCX => 1, DOCM => 1,
22 DOTX => 1, DOTM => 1,
23 POTX => 1, POTM => 1,
24 PPAX => 1, PPAM => 1,
25 PPSX => 1, PPSM => 1,
26 PPTX => 1, PPTM => 1, THMX => 1,
27 XLAM => 1,
28 XLSX => 1, XLSM => 1, XLSB => 1,
29 XLTX => 1, XLTM => 1,
30);
31
# generate reverse lookup for file type based on MIME
# (keyed by the %Image::ExifTool::mimeType entry of each recognized extension)
my %fileType;
$fileType{$Image::ExifTool::mimeType{$_}} = $_ foreach keys %isOOXML;
40
# XML attributes to queue: values of the attributes named in %queueAttrs are
# held in %queuedAttrs by FoundTag() until the next tag is saved
my %queuedAttrs;
my %queueAttrs = map { $_ => 1 } qw(fmtid pid name);

# keep track of items in a vector (to accumulate as a list):
# number of elements still expected, and the values collected so far
my ($vectorCount, @vectorVals);
52
# Office Open XML tags
# (tag IDs are the XML property names; an empty hash means default handling)
%Image::ExifTool::OOXML::Main = (
    GROUPS       => { 0 => 'XML', 1 => 'XML', 2 => 'Document' },
    PROCESS_PROC => \&Image::ExifTool::XMP::ProcessXMP,
    VARS         => { NO_ID => 1 },
    NOTES        => q{
        The Office Open XML (OOXML) format was introduced with Microsoft Office 2007
        and is used by file types such as DOCX, PPTX and XLSX.  These are
        essentially ZIP archives containing XML files.  The table below lists some
        tags which have been observed in OOXML documents, but ExifTool will extract
        any tags found from XML files of the OOXML document properties ("docProps")
        directory.

        B<Tips:>

        1) Structural ZIP tags may be ignored (if desired) with C<--ZIP:all> on the
        command line.

        2) Tags may be grouped by their document number in the ZIP archive with the
        C<-g3> or C<-G3> option.
    },
    # These tags all have 1:1 correspondence with FlashPix tags except for:
    #   OOXML               FlashPix
    #   ---------------     -------------
    #   DocSecurity         Security
    #   Application         Software
    #   dc:Description      Comments
    #   dc:Creator          Author
    Application          => { },
    AppVersion           => { },
    category             => { },
    Characters           => { },
    CharactersWithSpaces => { },
    CheckedBy            => { },
    Client               => { },
    Company              => { },
    created              => {
        Name      => 'CreateDate',
        Groups    => { 2 => 'Time' },
        Format    => 'date',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    # ignore this XML type name
    createdType          => { Hidden => 1, RawConv => 'undef' },
    DateCompleted        => {
        Groups    => { 2 => 'Time' },
        Format    => 'date',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    Department           => { },
    Destination          => { },
    Disposition          => { },
    Division             => { },
    DocSecurity          => {
        # (http://msdn.microsoft.com/en-us/library/documentformat.openxml.extendedproperties.documentsecurity.aspx)
        PrintConv => {
            0 => 'None',
            1 => 'Password protected',
            2 => 'Read-only recommended',
            4 => 'Read-only enforced',
            8 => 'Locked for annotations',
        },
    },
    DocumentNumber       => { },
    Editor               => { Groups => { 2 => 'Author' } },
    ForwardTo            => { },
    Group                => { },
    HeadingPairs         => { },
    HiddenSlides         => { },
    HyperlinkBase        => { },
    HyperlinksChanged    => { PrintConv => { false => 'No', true => 'Yes' } },
    keywords             => { },
    Language             => { },
    lastModifiedBy       => { Groups => { 2 => 'Author' } },
    lastPrinted          => {
        Groups    => { 2 => 'Time' },
        Format    => 'date',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    Lines                => { },
    LinksUpToDate        => { PrintConv => { false => 'No', true => 'Yes' } },
    Mailstop             => { },
    Manager              => { },
    Matter               => { },
    MMClips              => { },
    modified             => {
        Name      => 'ModifyDate',
        Groups    => { 2 => 'Time' },
        Format    => 'date',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    # ignore this XML type name
    modifiedType         => { Hidden => 1, RawConv => 'undef' },
    Notes                => { },
    Office               => { },
    Owner                => { Groups => { 2 => 'Author' } },
    Pages                => { },
    Paragraphs           => { },
    PresentationFormat   => { },
    Project              => { },
    Publisher            => { },
    Purpose              => { },
    ReceivedFrom         => { },
    RecordedBy           => { },
    RecordedDate         => {
        Groups    => { 2 => 'Time' },
        Format    => 'date',
        PrintConv => '$self->ConvertDateTime($val)',
    },
    Reference            => { },
    revision             => { Name => 'RevisionNumber' },
    ScaleCrop            => { PrintConv => { false => 'No', true => 'Yes' } },
    SharedDoc            => { PrintConv => { false => 'No', true => 'Yes' } },
    Slides               => { },
    Source               => { },
    Status               => { },
    TelephoneNumber      => { },
    Template             => { },
    TitlesOfParts        => { },
    TotalTime            => {
        Name      => 'TotalEditTime',
        PrintConv => 'ConvertTimeSpan($val, 60)',
    },
    Typist               => { },
    Words                => { },
);
177
#------------------------------------------------------------------------------
# Build the tag ID for a nested list of XML property names
# Inputs:  0) reference to list of property names
# Returns: tag ID (undef if no property contributed to it) and the namespace
#          of the first property used in the ID ('' if it had no namespace)
# Notes:   Properties in the 'vt' namespace are skipped, and the container
#          elements 'Properties', 'cp:coreProperties' and 'property' can not
#          start a tag ID.  Names of later properties are appended with their
#          first letter capitalized.
sub GetTagID($)
{
    my $props = shift;
    # container elements which are not allowed to begin the tag ID
    my %isContainer = map { $_ => 1 } ('Properties', 'cp:coreProperties', 'property');
    my ($tag, $namespace);
    foreach my $prop (@$props) {
        # separate the namespace prefix from the property name
        # (the namespace may be '' for property qualifiers)
        my ($ns, $nm) = ('', $prop);
        ($ns, $nm) = ($1, $2) if $prop =~ /(.*?):(.*)/;
        next if $ns eq 'vt';        # 'vt' properties never contribute
        if (defined $tag) {
            $tag .= ucfirst($nm);   # extend the existing tag ID
            next;
        }
        next if $isContainer{$prop};
        # this is the first property to contribute to the tag ID,
        # so remember its namespace too
        $tag = $nm;
        $namespace ||= $ns;
    }
    return ($tag, $namespace || '');
}
203
#------------------------------------------------------------------------------
# We found an XMP property name/value
# Inputs:  0) ExifTool object ref, 1) tag table ref
#          2) reference to array of XMP property names (last is current property)
#          3) property value, 4) attribute hash ref (not used here)
# Returns: 1 if a tag was saved, or 0 if the value was queued, ignored, or
#          accumulated as part of a vector
# Notes:   State is carried between calls in the file-level variables
#          %queuedAttrs, $vectorCount and @vectorVals.  %queuedAttrs and
#          $vectorCount are cleared each time a tag is saved.
sub FoundTag($$$$;$)
{
    my ($et, $tagTablePtr, $props, $val, $attrs) = @_;
    return 0 unless @$props;
    my $verbose = $et->Options('Verbose');

    my $tag = $$props[-1];
    $et->VPrint(0, " | - Tag '", join('/',@$props), "'\n") if $verbose > 1;

    # un-escape XML character entities
    $val = Image::ExifTool::XMP::UnescapeXML($val);
    # convert OOXML-escaped characters: "_xHHHH_" with exactly 4 hex digits
    # (eg. "_x000d_" is a carriage return)
    $val =~ s/_x([0-9a-f]{4})_/Image::ExifTool::PackUTF8(hex($1))/gie;
    # convert from UTF8 to ExifTool Charset
    $val = $et->Decode($val, 'UTF8');
    # queue this attribute for later if necessary
    if ($queueAttrs{$tag}) {
        $queuedAttrs{$tag} = $val;
        return 0;
    }
    my $ns;
    ($tag, $ns) = GetTagID($props);
    if (not $tag) {
        # all properties are in ignored namespaces
        # so use 'name' from our queued attributes for the tag
        my $name = $queuedAttrs{name} or return 0;
        $name =~ s/(^| )([a-z])/$1\U$2/g;     # start words with uppercase
        # tag name is the queued name with all illegal characters removed
        ($tag = $name) =~ tr/-_a-zA-Z0-9//dc;
        return 0 unless length $tag;
        unless ($$tagTablePtr{$tag}) {
            my %tagInfo = (
                Name => $tag,
                Description => $name,
            );
            # format as a date/time value if type is 'vt:filetime'
            # (separate statements; these were formerly chained with commas)
            if ($$props[-1] eq 'vt:filetime') {
                $tagInfo{Groups} = { 2 => 'Time' };
                $tagInfo{Format} = 'date';
                $tagInfo{PrintConv} = '$self->ConvertDateTime($val)';
            }
            $et->VPrint(0, " | [adding $tag]\n") if $verbose;
            AddTagToTable($tagTablePtr, $tag, \%tagInfo);
        }
    } elsif ($tag eq 'xmlns') {
        # ignore namespaces (for now)
        return 0;
    } elsif (ref $Image::ExifTool::XMP::Main{$ns} eq 'HASH' and
        $Image::ExifTool::XMP::Main{$ns}{SubDirectory})
    {
        # use standard XMP table if it exists
        my $table = $Image::ExifTool::XMP::Main{$ns}{SubDirectory}{TagTable};
        no strict 'refs';   # ($table is a table name, tested as a symbolic hash ref)
        if ($table and %$table) {
            $tagTablePtr = Image::ExifTool::GetTagTable($table);
        }
    } elsif (@$props > 2 and grep /^vt:vector$/, @$props) {
        # handle vector properties (accumulate as lists)
        if ($$props[-1] eq 'vt:size') {
            # start of a new vector: remember how many elements to expect
            $vectorCount = $val;
            undef @vectorVals;
            return 0;
        } elsif ($$props[-1] eq 'vt:baseType') {
            return 0;   # ignore baseType
        } elsif ($vectorCount) {
            --$vectorCount;
            if ($vectorCount) {
                # more elements to come, so just save this one for now
                push @vectorVals, $val;
                return 0;
            }
            # last element: the value becomes the list of all elements
            # (a single-element vector is left as a simple scalar)
            $val = [ @vectorVals, $val ] if @vectorVals;
            # Note: we will lose any improper-sized vector elements here
        }
    }
    # add any unknown tags to table
    if ($$tagTablePtr{$tag}) {
        my $tagInfo = $$tagTablePtr{$tag};
        if (ref $tagInfo eq 'HASH') {
            # reformat date/time values
            my $fmt = $$tagInfo{Format} || $$tagInfo{Writable} || '';
            $val = Image::ExifTool::XMP::ConvertXMPDate($val) if $fmt eq 'date';
        }
    } else {
        $et->VPrint(0, " [adding $tag]\n") if $verbose;
        AddTagToTable($tagTablePtr, $tag, { Name => ucfirst $tag });
    }
    # save the tag
    $et->HandleTag($tagTablePtr, $tag, $val);

    # start fresh for next tag
    undef $vectorCount;
    undef %queuedAttrs;

    return 1;
}
304
#------------------------------------------------------------------------------
# Extract information from an OOXML file
# Inputs:  0) ExifTool object reference, 1) dirInfo reference
# Returns: 1
# Notes:   Upon entry to this routine, the file type has already been verified
#          and the dirInfo hash contains 2 elements unique to this process proc:
#            MIME - mime type of main document from "[Content_Types].xml"
#            ZIP  - reference to Archive::Zip object for this file
sub ProcessDOCX($$)
{
    my ($et, $dirInfo) = @_;
    my $zip = $$dirInfo{ZIP};
    my $tagTablePtr = GetTagTable('Image::ExifTool::OOXML::Main');
    my $mime = $$dirInfo{MIME} || $Image::ExifTool::mimeType{DOCX};

    # determine the file type from the MIME type
    my $type = $fileType{$mime};
    if (not $type) {
        $et->VPrint(0, "Unrecognized MIME type: $mime\n");
        # fall back on the file extension if it is a known OOXML extension,
        # otherwise default to 'DOCX'
        my $ext = $$et{FILE_EXT};
        $type = ($ext and $isOOXML{$ext}) ? $ext : 'DOCX';
    } elsif ($type eq 'PPTX' and $$et{FILE_EXT} and $$et{FILE_EXT} eq 'THMX') {
        # THMX is a special case because the MIME type of its main document
        # is that of PPTX
        $type = 'THMX';
    }
    $et->SetFileType($type);

    # must catch all Archive::Zip warnings
    local $SIG{'__WARN__'} = \&Image::ExifTool::ZIP::WarnProc;

    # step through all members of the ZIP archive
    my $docNum = 0;
    foreach my $member ($zip->members()) {
        my $file = $member->fileName();
        next unless defined $file;
        $et->VPrint(0, "File: $file\n");
        # each member gets its own document number, and its ZIP tags are extracted
        $$et{DOC_NUM} = ++$docNum;
        Image::ExifTool::ZIP::HandleMember($et, $member);
        # only XML files and JPEG/WMF thumbnails in the "docProps" directory
        # are processed any further
        next unless $file =~ m{^docProps/(.*\.xml|(thumbnail\.(jpe?g|wmf)))$}i;
        # read the member contents
        # (CAREFUL! $buff MUST be local since we hand off a value ref)
        my ($buff, $status) = $zip->contents($member);
        if ($status) {
            $et->Warn("Error extracting $file");
            next;
        }
        # docProps/thumbnail.(jpg|wmf) is saved as PreviewImage or PreviewWMF
        if ($file =~ /\.(jpe?g|wmf)$/i) {
            my $isWMF = $file =~ /\.wmf$/i;
            $et->FoundTag(($isWMF ? 'PreviewWMF' : 'PreviewImage'), \$buff);
            next;
        }
        # what remains is XML (docProps/app.xml, docProps/core.xml,
        # docProps/custom.xml), parsed with FoundTag() receiving each property
        my $len = length $buff;
        my %dirInfo = (
            DataPt       => \$buff,
            DirLen       => $len,
            DataLen      => $len,
            XMPParseOpts => { FoundProc => \&FoundTag },
        );
        $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
        undef $buff;    # (free memory now)
    }
    delete $$et{DOC_NUM};
    return 1;
}
376
3771; # end
378
379__END__
380
381=head1 NAME
382
383Image::ExifTool::OOXML - Read Office Open XML+ZIP files
384
385=head1 SYNOPSIS
386
387This module is used by Image::ExifTool
388
389=head1 DESCRIPTION
390
391This module contains definitions required by Image::ExifTool to extract meta
392information from Office Open XML files. This is the format of Word, Excel
393and PowerPoint files written by Microsoft Office 2007 -- essentially ZIP
394archives of XML files.
395
396=head1 AUTHOR
397
398Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
399
400This library is free software; you can redistribute it and/or modify it
401under the same terms as Perl itself.
402
403=head1 SEE ALSO
404
405L<Image::ExifTool::TagNames/OOXML Tags>,
406L<Image::ExifTool::TagNames/FlashPix Tags>,
407L<Image::ExifTool(3pm)|Image::ExifTool>
408
409=cut
410
Note: See TracBrowser for help on using the repository browser.