source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/ZIP.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

  • Property svn:executable set to *
File size: 20.5 KB
Line 
1#------------------------------------------------------------------------------
2# File: ZIP.pm
3#
4# Description: Read ZIP archive meta information
5#
6# Revisions: 10/28/2007 - P. Harvey Created
7#
8# References: 1) http://www.pkware.com/documents/casestudies/APPNOTE.TXT
9# 2) http://www.cpanforum.com/threads/9046
10# 3) http://www.gzip.org/zlib/rfc-gzip.html
11# 4) http://DataCompression.info/ArchiveFormats/RAR202.txt
12#------------------------------------------------------------------------------
13
14package Image::ExifTool::ZIP;
15
16use strict;
17use vars qw($VERSION $warnString);
18use Image::ExifTool qw(:DataAccess :Utils);
19
20$VERSION = '1.07';
21
22sub WarnProc($) { $warnString = $_[0]; }
23
24# file types for recognized Open Document "mimetype" values
25my %openDocType = (
26 'application/vnd.oasis.opendocument.presentation' => 'ODP',
27 'application/vnd.oasis.opendocument.spreadsheet' => 'ODS',
28 'application/vnd.oasis.opendocument.text' => 'ODT',
29);
30
31# ZIP metadata blocks
32%Image::ExifTool::ZIP::Main = (
33 PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
34 GROUPS => { 2 => 'Other' },
35 FORMAT => 'int16u',
36 NOTES => q{
37 The following tags are extracted from ZIP archives. ExifTool also extracts
38 additional meta information from compressed documents inside some ZIP-based
39 files such Office Open XML (DOCX, PPTX and XLSX), Open Document (ODP, ODS
40 and ODT), iWork (KEY, PAGES, NUMBERS), and Capture One Enhanced Image
41 Package (EIP). The ExifTool family 3 groups may be used to organize the
42 output by embedded document number (ie. the exiftool C<-g3> option).
43 },
44 2 => 'ZipRequiredVersion',
45 3 => {
46 Name => 'ZipBitFlag',
47 PrintConv => '$val ? sprintf("0x%.4x",$val) : $val',
48 },
49 4 => {
50 Name => 'ZipCompression',
51 PrintConv => {
52 0 => 'None',
53 1 => 'Shrunk',
54 2 => 'Reduced with compression factor 1',
55 3 => 'Reduced with compression factor 2',
56 4 => 'Reduced with compression factor 3',
57 5 => 'Reduced with compression factor 4',
58 6 => 'Imploded',
59 7 => 'Tokenized',
60 8 => 'Deflated',
61 9 => 'Enhanced Deflate using Deflate64(tm)',
62 10 => 'Imploded (old IBM TERSE)',
63 12 => 'BZIP2',
64 14 => 'LZMA (EFS)',
65 18 => 'IBM TERSE (new)',
66 19 => 'IBM LZ77 z Architecture (PFS)',
67 96 => 'JPEG recompressed', #2
68 97 => 'WavPack compressed', #2
69 98 => 'PPMd version I, Rev 1',
70 },
71 },
72 5 => {
73 Name => 'ZipModifyDate',
74 Format => 'int32u',
75 Groups => { 2 => 'Time' },
76 ValueConv => sub {
77 my $val = shift;
78 return sprintf('%.4d:%.2d:%.2d %.2d:%.2d:%.2d',
79 ($val >> 25) + 1980, # year
80 ($val >> 21) & 0x0f, # month
81 ($val >> 16) & 0x1f, # day
82 ($val >> 11) & 0x1f, # hour
83 ($val >> 5) & 0x3f, # minute
84 $val & 0x1f # second
85 );
86 },
87 PrintConv => '$self->ConvertDateTime($val)',
88 },
89 7 => { Name => 'ZipCRC', Format => 'int32u', PrintConv => 'sprintf("0x%.8x",$val)' },
90 9 => { Name => 'ZipCompressedSize', Format => 'int32u' },
91 11 => { Name => 'ZipUncompressedSize', Format => 'int32u' },
92 13 => {
93 Name => 'ZipFileNameLength',
94 # don't store a tag -- just extract the value for use with ZipFileName
95 Hidden => 1,
96 RawConv => '$$self{ZipFileNameLength} = $val; undef',
97 },
98 # 14 => 'ZipExtraFieldLength',
99 15 => {
100 Name => 'ZipFileName',
101 Format => 'string[$$self{ZipFileNameLength}]',
102 },
103);
104
105# GNU ZIP tags (ref 3)
106%Image::ExifTool::ZIP::GZIP = (
107 PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
108 GROUPS => { 2 => 'Other' },
109 NOTES => q{
110 These tags are extracted from GZIP (GNU ZIP) archives, but currently only
111 for the first file in the archive.
112 },
113 2 => {
114 Name => 'Compression',
115 PrintConv => {
116 8 => 'Deflated',
117 },
118 },
119 3 => {
120 Name => 'Flags',
121 PrintConv => { BITMASK => {
122 0 => 'Text',
123 1 => 'CRC16',
124 2 => 'ExtraFields',
125 3 => 'FileName',
126 4 => 'Comment',
127 }},
128 },
129 4 => {
130 Name => 'ModifyDate',
131 Format => 'int32u',
132 Groups => { 2 => 'Time' },
133 ValueConv => 'ConvertUnixTime($val,1)',
134 PrintConv => '$self->ConvertDateTime($val)',
135 },
136 8 => {
137 Name => 'ExtraFlags',
138 PrintConv => {
139 0 => '(none)',
140 2 => 'Maximum Compression',
141 4 => 'Fastest Algorithm',
142 },
143 },
144 9 => {
145 Name => 'OperatingSystem',
146 PrintConv => {
147 0 => 'FAT filesystem (MS-DOS, OS/2, NT/Win32)',
148 1 => 'Amiga',
149 2 => 'VMS (or OpenVMS)',
150 3 => 'Unix',
151 4 => 'VM/CMS',
152 5 => 'Atari TOS',
153 6 => 'HPFS filesystem (OS/2, NT)',
154 7 => 'Macintosh',
155 8 => 'Z-System',
156 9 => 'CP/M',
157 10 => 'TOPS-20',
158 11 => 'NTFS filesystem (NT)',
159 12 => 'QDOS',
160 13 => 'Acorn RISCOS',
161 255 => 'unknown',
162 },
163 },
164 10 => 'ArchivedFileName',
165 11 => 'Comment',
166);
167
168# RAR tags (ref 4)
169%Image::ExifTool::ZIP::RAR = (
170 PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
171 GROUPS => { 2 => 'Other' },
172 NOTES => 'These tags are extracted from RAR archive files.',
173 0 => {
174 Name => 'CompressedSize',
175 Format => 'int32u',
176 },
177 4 => {
178 Name => 'UncompressedSize',
179 Format => 'int32u',
180 },
181 8 => {
182 Name => 'OperatingSystem',
183 PrintConv => {
184 0 => 'MS-DOS',
185 1 => 'OS/2',
186 2 => 'Win32',
187 3 => 'Unix',
188 },
189 },
190 13 => {
191 Name => 'ModifyDate',
192 Format => 'int32u',
193 Groups => { 2 => 'Time' },
194 ValueConv => sub {
195 my $val = shift;
196 return sprintf('%.4d:%.2d:%.2d %.2d:%.2d:%.2d',
197 ($val >> 25) + 1980, # year
198 ($val >> 21) & 0x0f, # month
199 ($val >> 16) & 0x1f, # day
200 ($val >> 11) & 0x1f, # hour
201 ($val >> 5) & 0x3f, # minute
202 $val & 0x1f # second
203 );
204 },
205 PrintConv => '$self->ConvertDateTime($val)',
206 },
207 18 => {
208 Name => 'PackingMethod',
209 PrintHex => 1,
210 PrintConv => {
211 0x30 => 'Stored',
212 0x31 => 'Fastest',
213 0x32 => 'Fast',
214 0x33 => 'Normal',
215 0x34 => 'Good Compression',
216 0x35 => 'Best Compression',
217 },
218 },
219 19 => {
220 Name => 'FileNameLength',
221 Format => 'int16u',
222 Hidden => 1,
223 RawConv => '$$self{FileNameLength} = $val; undef',
224 },
225 25 => {
226 Name => 'ArchivedFileName',
227 Format => 'string[$$self{FileNameLength}]',
228 },
229);
230
231#------------------------------------------------------------------------------
232# Extract information from a RAR file (ref 4)
233# Inputs: 0) ExifTool object reference, 1) dirInfo reference
234# Returns: 1 on success, 0 if this wasn't a valid RAR file
235sub ProcessRAR($$)
236{
237 my ($exifTool, $dirInfo) = @_;
238 my $raf = $$dirInfo{RAF};
239 my ($flags, $buff);
240
241 return 0 unless $raf->Read($buff, 7) and $buff eq "Rar!\x1a\x07\0";
242
243 $exifTool->SetFileType();
244 SetByteOrder('II');
245 my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::RAR');
246 my $docNum = 0;
247
248 for (;;) {
249 # read block header
250 $raf->Read($buff, 7) == 7 or last;
251 my ($type, $flags, $size) = unpack('xxCvv', $buff);
252 $size -= 7;
253 if ($flags & 0x8000) {
254 $raf->Read($buff, 4) == 4 or last;
255 $size += unpack('V',$buff) - 4;
256 }
257 last if $size < 0;
258 next unless $size; # ignore blocks with no data
259 # don't try to read very large blocks unless LargeFileSupport is enabled
260 if ($size > 0x80000000 and not $exifTool->Options('LargeFileSupport')) {
261 $exifTool->Warn('Large block encountered. Aborting.');
262 last;
263 }
264 # process the block
265 if ($type == 0x74) { # file block
266 # read maximum 4 KB from a file block
267 my $n = $size > 4096 ? 4096 : $size;
268 $raf->Read($buff, $n) == $n or last;
269 # add compressed size to start of data so we can extract it with the other tags
270 $buff = pack('V',$size) . $buff;
271 $$exifTool{DOC_NUM} = ++$docNum;
272 $exifTool->ProcessDirectory({ DataPt => \$buff }, $tagTablePtr);
273 $size -= $n;
274 } elsif ($type == 0x75 and $size > 6) { # comment block
275 $raf->Read($buff, $size) == $size or last;
276 # save comment, only if "Stored" (this is untested)
277 if (Get8u(\$buff, 3) == 0x30) {
278 $exifTool->FoundTag('Comment', substr($buff, 6));
279 }
280 next;
281 }
282 # seek to the start of the next block
283 $raf->Seek($size, 1) or last if $size;
284 }
285 $$exifTool{DOC_NUM} = 0;
286
287 return 1;
288}
289
290#------------------------------------------------------------------------------
291# Extract information from a GNU ZIP file (ref 3)
292# Inputs: 0) ExifTool object reference, 1) dirInfo reference
293# Returns: 1 on success, 0 if this wasn't a valid GZIP file
294sub ProcessGZIP($$)
295{
296 my ($exifTool, $dirInfo) = @_;
297 my $raf = $$dirInfo{RAF};
298 my ($flags, $buff);
299
300 return 0 unless $raf->Read($buff, 10) and $buff =~ /^\x1f\x8b\x08/;
301
302 $exifTool->SetFileType();
303 SetByteOrder('II');
304
305 my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::GZIP');
306 $exifTool->HandleTag($tagTablePtr, 2, Get8u(\$buff, 2));
307 $exifTool->HandleTag($tagTablePtr, 3, $flags = Get8u(\$buff, 3));
308 $exifTool->HandleTag($tagTablePtr, 4, Get32u(\$buff, 4));
309 $exifTool->HandleTag($tagTablePtr, 8, Get8u(\$buff, 8));
310 $exifTool->HandleTag($tagTablePtr, 9, Get8u(\$buff, 9));
311
312 # extract file name and comment if they exist
313 if ($flags & 0x18) {
314 if ($flags & 0x04) {
315 # skip extra field
316 $raf->Read($buff, 2) == 2 or return 1;
317 my $len = Get16u(\$buff, 0);
318 $raf->Read($buff, $len) == $len or return 1;
319 }
320 $raf->Read($buff, 4096) or return 1;
321 my $pos = 0;
322 my $tagID;
323 # loop for ArchivedFileName (10) and Comment (11) tags
324 foreach $tagID (10, 11) {
325 my $mask = $tagID == 10 ? 0x08 : 0x10;
326 next unless $flags & $mask;
327 my $end = $buff =~ /\0/g ? pos($buff) - 1 : length($buff);
328 # (the doc specifies the string should be ISO 8859-1,
329 # but in OS X it seems to be UTF-8, so don't translate
330 # it because I could just as easily screw it up)
331 my $str = substr($buff, $pos, $end - $pos);
332 $exifTool->HandleTag($tagTablePtr, $tagID, $str);
333 last if $end >= length $buff;
334 $pos = $end + 1;
335 }
336 }
337 return 1;
338}
339
340#------------------------------------------------------------------------------
341# Call HandleTags for attributes of an Archive::Zip member
342# Inputs: 0) ExifTool object ref, 1) member ref, 2) optional tag table ref
343sub HandleMember($$;$)
344{
345 my ($exifTool, $member, $tagTablePtr) = @_;
346 $tagTablePtr or $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::Main');
347 $exifTool->HandleTag($tagTablePtr, 2, $member->versionNeededToExtract());
348 $exifTool->HandleTag($tagTablePtr, 3, $member->bitFlag());
349 $exifTool->HandleTag($tagTablePtr, 4, $member->compressionMethod());
350 $exifTool->HandleTag($tagTablePtr, 5, $member->lastModFileDateTime());
351 $exifTool->HandleTag($tagTablePtr, 7, $member->crc32());
352 $exifTool->HandleTag($tagTablePtr, 9, $member->compressedSize());
353 $exifTool->HandleTag($tagTablePtr, 11, $member->uncompressedSize());
354 $exifTool->HandleTag($tagTablePtr, 15, $member->fileName());
355}
356
357#------------------------------------------------------------------------------
358# Extract information from an ZIP file
359# Inputs: 0) ExifTool object reference, 1) dirInfo reference
360# Returns: 1 on success, 0 if this wasn't a valid ZIP file
361sub ProcessZIP($$)
362{
363 my ($exifTool, $dirInfo) = @_;
364 my $raf = $$dirInfo{RAF};
365 my ($buff, $buf2, $zip, $docNum);
366
367 return 0 unless $raf->Read($buff, 30) and $buff =~ /^PK\x03\x04/;
368
369 my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::Main');
370
371 # use Archive::Zip if avilable
372 for (;;) {
373 unless (eval 'require Archive::Zip' and eval 'require IO::File') {
374 if ($$exifTool{FILE_EXT} and $$exifTool{FILE_EXT} ne 'ZIP') {
375 $exifTool->Warn("Install Archive::Zip to decode compressed ZIP information");
376 }
377 last;
378 }
379 # Archive::Zip requires a seekable IO::File object
380 my $fh = $raf->{FILE_PT};
381 if ($fh and seek($fh, 0, 0)) {
382 unless (eval 'require IO::File') {
383 # (this shouldn't happen because IO::File is a prerequisite of Archive::Zip)
384 $exifTool->Warn("Install IO::File to decode compressed ZIP information");
385 last;
386 }
387 bless $fh, 'IO::File'; # Archive::Zip expects an IO::File object
388 } elsif (eval 'require IO::String') {
389 # read the whole file into memory (what else can I do?)
390 $raf->Slurp();
391 $fh = new IO::String ${$raf->{BUFF_PT}};
392 } else {
393 my $type = $fh ? 'pipe or socket' : 'scalar reference';
394 $exifTool->Warn("Install IO::String to decode compressed ZIP information from a $type");
395 last;
396 }
397 $exifTool->VPrint(1, " --- using Archive::Zip ---\n");
398 $zip = new Archive::Zip;
399 # catch all warnings! (Archive::Zip is bad for this)
400 local $SIG{'__WARN__'} = \&WarnProc;
401 my $status = $zip->readFromFileHandle($fh);
402 if ($status) {
403 undef $zip;
404 my %err = ( 1=>'Stream end error', 3=>'Format error', 4=>'IO error' );
405 my $err = $err{$status} || "Error $status";
406 $exifTool->Warn("$err reading ZIP file");
407 last;
408 }
409 $$dirInfo{ZIP} = $zip;
410
411 # check for an Office Open file (DOCX, etc)
412 # --> read '[Content_Types].xml' to determine the file type
413 my ($mime, @members);
414 my $cType = $zip->memberNamed('[Content_Types].xml');
415 if ($cType) {
416 ($buff, $status) = $zip->contents($cType);
417 if (not $status and $buff =~ /ContentType\s*=\s*(['"])([^"']+)\.main(\+xml)?\1/) {
418 $mime = $2;
419 }
420 }
421 # check for docProps if we couldn't find a MIME type
422 $mime or @members = $zip->membersMatching('^docProps/.*\.(xml|XML)$');
423 if ($mime or @members) {
424 $$dirInfo{MIME} = $mime;
425 require Image::ExifTool::OOXML;
426 Image::ExifTool::OOXML::ProcessDOCX($exifTool, $dirInfo);
427 delete $$dirInfo{MIME};
428 last;
429 }
430
431 # check for an EIP file
432 @members = $zip->membersMatching('^CaptureOne/.*\.(cos|COS)$');
433 if (@members) {
434 require Image::ExifTool::CaptureOne;
435 Image::ExifTool::CaptureOne::ProcessEIP($exifTool, $dirInfo);
436 last;
437 }
438
439 # check for an iWork file
440 @members = $zip->membersMatching('^(index\.(xml|apxl)|QuickLook/Thumbnail\.jpg)$');
441 if (@members) {
442 require Image::ExifTool::iWork;
443 Image::ExifTool::iWork::Process_iWork($exifTool, $dirInfo);
444 last;
445 }
446
447 # check for an Open Document file
448 my $mType = $zip->memberNamed('mimetype');
449 if ($mType) {
450 ($mime, $status) = $zip->contents($mType);
451 unless ($status) {
452 chomp $mime;
453 if ($openDocType{$mime}) {
454 $exifTool->SetFileType($openDocType{$mime}, $mime);
455 # extract Open Document metadata from "meta.xml"
456 my $meta = $zip->memberNamed('meta.xml');
457 if ($meta) {
458 ($buff, $status) = $zip->contents($meta);
459 unless ($status) {
460 my %dirInfo = (
461 DataPt => \$buff,
462 DirLen => length $buff,
463 DataLen => length $buff,
464 );
465 my $xmpTable = GetTagTable('Image::ExifTool::XMP::Main');
466 $exifTool->ProcessDirectory(\%dirInfo, $xmpTable);
467 }
468 }
469 # extract preview image(s) from "Thumbnails" directory
470 my $type;
471 my %tag = ( jpg => 'PreviewImage', png => 'PreviewPNG' );
472 foreach $type ('jpg', 'png') {
473 my $thumb = $zip->memberNamed("Thumbnails/thumbnail.$type");
474 next unless $thumb;
475 ($buff, $status) = $zip->contents($thumb);
476 $exifTool->FoundTag($tag{$type}, $buff) unless $status;
477 }
478 last;
479 }
480 }
481 }
482
483 # otherwise just extract general ZIP information
484 $exifTool->SetFileType();
485 @members = $zip->members();
486 $docNum = 0;
487 my $member;
488 foreach $member (@members) {
489 $$exifTool{DOC_NUM} = ++$docNum;
490 HandleMember($exifTool, $member, $tagTablePtr);
491 }
492 last;
493 }
494 # all done if we processed this using Archive::Zip
495 if ($zip) {
496 delete $$dirInfo{ZIP};
497 delete $$exifTool{DOC_NUM};
498 return 1;
499 }
500#
501# process the ZIP file by hand (funny, but this seems easier than using Archive::Zip)
502#
503 $docNum = 0;
504 $exifTool->VPrint(1, " -- processing as binary data --\n");
505 $raf->Seek(30, 0);
506 $exifTool->SetFileType();
507 SetByteOrder('II');
508
509 # A. Local file header:
510 # local file header signature 0) 4 bytes (0x04034b50)
511 # version needed to extract 4) 2 bytes
512 # general purpose bit flag 6) 2 bytes
513 # compression method 8) 2 bytes
514 # last mod file time 10) 2 bytes
515 # last mod file date 12) 2 bytes
516 # crc-32 14) 4 bytes
517 # compressed size 18) 4 bytes
518 # uncompressed size 22) 4 bytes
519 # file name length 26) 2 bytes
520 # extra field length 28) 2 bytes
521 for (;;) {
522 my $len = Get16u(\$buff, 26) + Get16u(\$buff, 28);
523 $raf->Read($buf2, $len) == $len or last;
524
525 $$exifTool{DOC_NUM} = ++$docNum;
526 $buff .= $buf2;
527 my %dirInfo = (
528 DataPt => \$buff,
529 DataPos => $raf->Tell() - 30 - $len,
530 DataLen => 30 + $len,
531 DirStart => 0,
532 DirLen => 30 + $len,
533 );
534 $exifTool->ProcessDirectory(\%dirInfo, $tagTablePtr);
535 my $flags = Get16u(\$buff, 6);
536 if ($flags & 0x08) {
537 # we don't yet support skipping stream mode data
538 # (when this happens, the CRC, compressed size and uncompressed
539 # sizes are set to 0 in the header. Instead, they are stored
540 # after the compressed data with an optional header of 0x08074b50)
541 $exifTool->Warn('Stream mode data encountered, file list may be incomplete');
542 last;
543 }
544 $len = Get32u(\$buff, 18); # file data length
545 $raf->Seek($len, 1) or last; # skip file data
546 $raf->Read($buff, 30) == 30 and $buff =~ /^PK\x03\x04/ or last;
547 }
548 delete $$exifTool{DOC_NUM};
549 return 1;
550}
551
5521; # end
553
554__END__
555
556=head1 NAME
557
558Image::ExifTool::ZIP - Read ZIP archive meta information
559
560=head1 SYNOPSIS
561
562This module is used by Image::ExifTool
563
564=head1 DESCRIPTION
565
566This module contains definitions required by Image::ExifTool to extract meta
567information from ZIP, GZIP and RAR archives. This includes ZIP-based file
568types like DOCX, PPTX, XLSX, ODP, ODS, ODT and EIP.
569
570=head1 AUTHOR
571
572Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
573
574This library is free software; you can redistribute it and/or modify it
575under the same terms as Perl itself.
576
577=head1 REFERENCES
578
579=over 4
580
581=item L<http://www.pkware.com/documents/casestudies/APPNOTE.TXT>
582
583=item L<http://www.gzip.org/zlib/rfc-gzip.html>
584
585=item L<http://DataCompression.info/ArchiveFormats/RAR202.txt>
586
587=back
588
589=head1 SEE ALSO
590
591L<Image::ExifTool::TagNames/ZIP Tags>,
592L<Image::ExifTool::TagNames/OOXML Tags>,
593L<Image::ExifTool(3pm)|Image::ExifTool>
594
595=cut
596
Note: See TracBrowser for help on using the repository browser.