source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Palm.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and which previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 11.9 KB
Line 
1#------------------------------------------------------------------------------
2# File: Palm.pm
3#
4# Description: Read Palm Database files
5#
6# Revisions: 2014/05/28 - P. Harvey Created
7#
8# References: 1) http://wiki.mobileread.com/wiki/PDB
9# 2) http://wiki.mobileread.com/wiki/MOBI
10#------------------------------------------------------------------------------
11
12package Image::ExifTool::Palm;
13
14use strict;
15use vars qw($VERSION);
16use Image::ExifTool qw(:DataAccess :Utils);
17
18$VERSION = '1.00';
19
20sub ProcessEXTH($$$);
21
# Recognized Palm database files, keyed by the 8-byte type/creator ID found
# in the file header.  The value is the name reported for that kind of file;
# a file whose ID is not listed here is not treated as a Palm database.
my %palmTypes = (
    '.pdfADBE'  =>  'Adobe Reader',
    'TEXtREAd'  =>  'PalmDOC',
    'BVokBDIC'  =>  'BDicty',
    'DB99DBOS'  =>  'DB (Database program)',
    'PNRdPPrs'  =>  'eReader',
    'DataPPrs'  =>  'eReader',
    'vIMGView'  =>  'FireViewer (ImageViewer)',
    'PmDBPmDB'  =>  'HanDBase',
    'InfoINDB'  =>  'InfoView',
    'ToGoToGo'  =>  'iSilo',
    'SDocSilX'  =>  'iSilo 3',
    'JbDbJBas'  =>  'JFile',
    'JfDbJFil'  =>  'JFile Pro',
    'DATALSdb'  =>  'LIST',
    'Mdb1Mdb1'  =>  'MobileDB',
    'BOOKMOBI'  =>  'Mobipocket',
    'DataPlkr'  =>  'Plucker',
    'DataSprd'  =>  'QuickSheet',
    'SM01SMem'  =>  'SuperMemo',
    'TEXtTlDc'  =>  'TealDoc',
    'InfoTlIf'  =>  'TealInfo',
    'DataTlMl'  =>  'TealMeal',
    'DataTlPt'  =>  'TealPaint',
    'dataTDBP'  =>  'ThinkDB',
    'TdatTide'  =>  'Tides',
    'ToRaTRPW'  =>  'TomeRaider',
    'zTXTGPlm'  =>  'Weasel',
    'BDOCWrdS'  =>  'WordSmith',
);
53
# Conversion settings shared by every date/time tag of the Palm DB header
# (merged into each tag definition of the Main table)
my %dateTimeInfo = (
    # The time zero is supposed to be Jan 1, 1904 (as in QuickTime), but some
    # software writes times relative to Jan 1, 1970 instead.  So the 1904-1970
    # offset is subtracted only from values that are at least that large, and
    # smaller values are taken to be based on 1970 already.
    RawConv => q{
        my $offset = (66 * 365 + 17) * 24 * 3600;
        return $val - $offset if $val >= $offset;
        return $val;
    },
    # (UTC written by "EPUB Converter", ref PH)
    ValueConv => 'ConvertUnixTime($val, 1)',
    PrintConv => '$self->ConvertDateTime($val)',
);
65
# Tags of the Palm Database file header.
# The table is processed as binary data with a default format of int32u, so
# the numerical tag IDs below count 4-byte words from the start of the header
# (eg. ID 15 is byte offset 60, where the type/creator ID is stored).
%Image::ExifTool::Palm::Main = (
    GROUPS => { 0 => 'Palm', 1 => 'Palm', 2 => 'Document' },
    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
    FORMAT => 'int32u',
    NOTES => q{
        Information extracted from Palm database files (PDB and PRC extensions),
        Mobipocket electronic books (MOBI), and Amazon Kindle KF7 and KF8 books (AZW
        and AZW3).
    },
    0 => {
        Name   => 'DatabaseName',
        Format => 'string[32]',
    },
    # 8   - int16u: file attributes (not very useful)
    # 8.5 - int16u: version
    9 => {
        Name   => 'CreateDate',
        Groups => { 2 => 'Time' },
        %dateTimeInfo,
    },
    10 => {
        Name   => 'ModifyDate',
        Groups => { 2 => 'Time' },
        %dateTimeInfo,
    },
    11 => {
        Name   => 'LastBackupDate',
        Groups => { 2 => 'Time' },
        %dateTimeInfo,
    },
    12 => 'ModificationNumber',
    15 => {
        Name      => 'PalmFileType',
        Format    => 'undef[8]',
        PrintConv => \%palmTypes,   # translate type/creator ID to a file type name
    },
);
101
102
# Tags of the MOBI header.
# As with the Palm DB header, this is a binary-data table of int32u values, so
# the tag IDs are 4-byte word indices into the header block that ProcessPDB
# reads from the first record of the file.
%Image::ExifTool::Palm::MOBI = (
    GROUPS => { 0 => 'Palm', 1 => 'MOBI', 2 => 'Document' },
    NOTES => q{
        Information extracted from the MOBI header of Mobipocket and Amazon Kindle
        KF7 and KF8 files.
    },
    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
    FORMAT => 'int32u',
    0 => {
        Name      => 'Compression',
        Format    => 'int16u',
        PrintConv => {
            1     => 'None',
            2     => 'PalmDOC',
            17480 => 'HUFF/CDIC',
        },
    },
    1 => {
        Name      => 'UncompressedTextLength',
        PrintConv => \&Image::ExifTool::ConvertFileSize,
    },
    3 => {
        Name      => 'Encryption',
        PrintConv => {
            0 => 'None',
            1 => 'Old Mobipocket',
            2 => 'Mobipocket',
        },
    },
    6 => {
        Name      => 'MobiType',
        PrintConv => {
            2   => 'Mobipocket Book',
            3   => 'PalmDoc Book',
            4   => 'Audio',
            232 => 'mobipocket? generated by kindlegen1.2',
            248 => 'KF8: generated by kindlegen2',
            257 => 'News',
            258 => 'News_Feed',
            259 => 'News_Magazine',
            513 => 'PICS',
            514 => 'WORD',
            515 => 'XLS',
            516 => 'PPT',
            517 => 'TEXT',
            518 => 'HTML',
        },
    },
    7 => {
        Name      => 'CodePage',
        # remember the code page so the text of other tags can be decoded later
        RawConv   => '$$self{CodePage} = $val',
        PrintConv => {
            # only the commonly used code pages are defined here
            # (FlashPix.pm has a much more complete list)
            1252  => 'Windows Latin 1 (Western European)',
            65001 => 'Unicode (UTF-8)',
        },
    },
    9  => 'MobiVersion',
    # the stored value is really an offset; the string replaces it later
    21 => 'BookName',
    26 => 'MinimumVersion',
);
166
# Tags of the MOBI extended header (EXTH).
# The tag IDs are the record types stored in the EXTH records, which are
# walked by ProcessEXTH.  Values are strings unless a tag gives its own Format.
%Image::ExifTool::Palm::EXTH = (
    GROUPS => { 0 => 'Palm', 1 => 'MOBI', 2 => 'Document' },
    FORMAT => 'string',
    NOTES => 'Information extracted from the MOBI extended header.',
    PROCESS_PROC => \&ProcessEXTH,
    1   => 'DRMServerID',
    2   => 'DRMCommerceID',
    3   => 'DRM_E-BookBaseID',
    100 => {
        Name   => 'Author',
        Groups => { 2 => 'Author' },
    },
    101 => 'Publisher',
    102 => 'Imprint',
    103 => 'Description',
    104 => 'ISBN',
    105 => {
        Name => 'Subject',
        List => 1,
    },
    106 => {
        Name      => 'PublishDate',
        Groups    => { 2 => 'Time' },
        ValueConv => q{
            require Image::ExifTool::XMP;
            Image::ExifTool::XMP::ConvertXMPDate($val, 1);
        },
        PrintConv => '$self->ConvertDateTime($val)',
    },
    107 => 'Review',
    108 => 'Contributor',
    109 => {
        Name   => 'Rights',
        Groups => { 2 => 'Author' },
    },
    110 => 'SubjectCode',
    111 => 'BookType',
    112 => 'Source',
    113 => 'ASIN',
    114 => 'BookVersion',
    115 => { Name => 'SampleFlag',   Format => 'int32u' },
    116 => { Name => 'StartReading', Format => 'int32u' },
    117 => 'Adult',
    118 => 'RetailPrice',
    119 => 'RetailPriceCurrency',
  # 121 => 'KF8BoundaryOffset',
    125 => { Name => 'ResourceCount', Format => 'int32u' },
    129 => 'KF8CoverURI',
    200 => 'DictionaryShortName',
  # 201 => { Name => 'CoverOffset', Format => 'int32u' },
  # 202 => { Name => 'ThumbOffset', Format => 'int32u' },
  # 203 => 'HasFakeCover',
    204 => {
        Name      => 'CreatorSoftware',
        Format    => 'int32u',
        PrintConv => {
            1   => 'Mobigen',
            2   => 'Mobipocket',
            200 => 'Kindlegen (Windows)',
            201 => 'Kindlegen (Linux)',
            202 => 'Kindlegen (Mac)',
        },
    },
    205 => { Name => 'CreatorMajorVersion', Format => 'int32u' },
    206 => { Name => 'CreatorMinorVersion', Format => 'int32u' },
    207 => { Name => 'CreatorBuildNumber',  Format => 'int32u' },
    208 => 'Watermark',
    209 => 'Tamper-proofKeys',
  # 300 => 'FontSignature',
    401 => { Name => 'ClippingLimit', Format => 'int8u' },
    402 => 'PublisherLimit',
    404 => {
        Name      => 'TextToSpeech',
        Format    => 'int8u',
        PrintConv => {
            0 => 'Enabled',
            1 => 'Disabled',
        },
    },
    405 => { Name => 'RentalFlag', Format => 'int8u' }, #?
    406 => 'RentalExpirationDate',
    501 => { Name => 'CDEType', Format => 'int32u' },
    502 => 'LastUpdateTime',
    503 => 'UpdatedTitle',
    504 => 'ASIN2',
    524 => 'Language',
    525 => 'Alignment',
    535 => 'CreatorBuildNumber2',
);
245
#------------------------------------------------------------------------------
# Walk the records of a MOBI extended header and extract a tag from each one
# Inputs: 0) ExifTool ref, 1) dirInfo ref, 2) tag table ref
#         (dirInfo entries used: DataPt, DataPos, NumEntries, Encoding)
# Returns: 1 always (the caller is expected to have validated the EXTH header)
# Notes: each record is a 4-byte tag ID and a 4-byte record length (which
#        includes these 8 bytes), followed by the value.  The walk ends quietly
#        at the first record that is too short or that overruns the data.
sub ProcessEXTH($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $dataPt  = $$dirInfo{DataPt};
    my $dataPos = $$dirInfo{DataPos};
    my $charset = $$dirInfo{Encoding} || 'UTF8';    # default to UTF8 text
    my $total   = length $$dataPt;

    $et->VerboseDir('EXTH', $$dirInfo{NumEntries}, $total);

    my $recNum = 0;     # index of the current record
    my $start  = 0;     # offset of the current record in the data

    # continue for as long as there is room for another 8-byte record header
    while ($start + 8 <= $total) {
        my $tagID  = Get32u($dataPt, $start);
        my $recLen = Get32u($dataPt, $start + 4);
        # a record can't be shorter than its own header...
        last if $recLen < 8;
        # ...and must lie entirely inside the data
        my $next = $start + $recLen;
        last if $next > $total;
        my $key = $et->HandleTag($tagTablePtr, $tagID, undef,
            DataPt  => $dataPt,
            DataPos => $dataPos,
            Start   => $start + 8,
            Size    => $recLen - 8,
            Index   => $recNum,
        );
        if ($key) {
            # recode the stored value from the book's character set
            $$et{VALUE}{$key} = $et->Decode($$et{VALUE}{$key}, $charset);
        }
        $start = $next;
        ++$recNum;
    }
    return 1;
}
280
#------------------------------------------------------------------------------
# Read the metadata of a Palm database file, including the MOBI header and the
# MOBI extended header of Mobipocket files
# Inputs: 0) ExifTool ref, 1) dirInfo reference (the RAF entry is used)
# Returns: 1 if this was a recognized PDB file, 0 otherwise
# Notes: once the file type is recognized the return value is always 1; later
#        problems with the MOBI data are reported as warnings only
sub ProcessPDB($$)
{
    my ($et, $dirInfo) = @_;
    my $raf = $$dirInfo{RAF};
    my ($pdbHdr, $mobiHdr, $bookName, $exthHdr, $exthData);
    my $verbose = $et->Options('Verbose');  # (not referenced below)

    # the file is recognized by the type/creator ID at offset 60 of the header
    return 0 unless $raf->Read($pdbHdr, 86) == 86;
    my $type = $palmTypes{substr($pdbHdr, 60, 8)};
    return 0 unless $type;
#
# Palm DB file header
#
    my $isMobi = ($type eq 'Mobipocket');
    $et->SetFileType($isMobi ? 'MOBI' : 'PDB');
    SetByteOrder('MM');

    my $tagTablePtr = GetTagTable('Image::ExifTool::Palm::Main');
    $et->ProcessDirectory({ DataPt => \$pdbHdr }, $tagTablePtr);

    # only Mobipocket files with a non-zero int16u at offset 76 go any further
    # (presumably this is the record count -- confirm against the PDB format)
    return 1 unless $isMobi and Get16u(\$pdbHdr, 76);
#
# MOBI header (should be the first record)
#
    my $recStart = Get32u(\$pdbHdr, 78);    # offset to first record
    unless ($raf->Seek($recStart, 0) and $raf->Read($mobiHdr, 274) == 274) {
        $et->Warn('Truncated MOBI header');
        return 1;
    }
    unless (substr($mobiHdr, 16, 4) eq 'MOBI') {
        $et->Warn('Invalid MOBI header');
        return 1;
    }
    $tagTablePtr = GetTagTable('Image::ExifTool::Palm::MOBI');
    $et->ProcessDirectory({ DataPt => \$mobiHdr }, $tagTablePtr);

    # determine the text encoding from the code page saved by the CodePage tag
    # (UTF8 is assumed if there is no code page or it isn't a known one)
    my $charset;
    if ($$et{CodePage}) {
        $charset = $Image::ExifTool::charsetName{"cp$$et{CodePage}"};
    }
    $charset ||= 'UTF8';

    # the BookName tag was extracted as an offset: replace it with the string,
    # which is located relative to the start of the record
    my $nameOff = Get32u(\$mobiHdr, 84);
    my $nameLen = Get32u(\$mobiHdr, 88);
    unless ($raf->Seek($recStart + $nameOff, 0) and
            $raf->Read($bookName, $nameLen) == $nameLen)
    {
        $bookName = '<err>';
    }
    $$et{VALUE}{BookName} = $et->Decode($bookName, $charset);
#
# MOBI extended header (only if the header flags say there is one)
#
    my $flags = Get32u(\$mobiHdr, 128);
    return 1 unless $flags & 0x40;  # extended header flag

    # MOBI header length (including PalmDOC header)
    my $hdrLen = Get32u(\$mobiHdr, 20) + 16;

    # the EXTH block follows the MOBI header: 'EXTH', total size, entry count
    my $found = ($raf->Seek($recStart + $hdrLen, 0) and
                 $raf->Read($exthHdr, 12) == 12 and
                 substr($exthHdr, 0, 4) eq 'EXTH');
    my $exthSize = $found ? Get32u(\$exthHdr, 4) : 0;
    unless ($exthSize > 12) {
        $et->Warn('Invalid MOBI extended header');
        return 1;
    }

    # read the records that follow the 12 bytes just read
    my $dataLen = $exthSize - 12;
    unless ($raf->Read($exthData, $dataLen) == $dataLen) {
        $et->Warn('Truncated MOBI extended header');
        return 1;
    }
    my %exthInfo = (
        DataPt     => \$exthData,
        DataPos    => $recStart + $hdrLen + 12,
        NumEntries => Get32u(\$exthHdr, 8),
        Encoding   => $charset,
    );
    $tagTablePtr = GetTagTable('Image::ExifTool::Palm::EXTH');
    $et->ProcessDirectory(\%exthInfo, $tagTablePtr);

    return 1;
}
361
3621; # end
363
364__END__
365
366=head1 NAME
367
368Image::ExifTool::Palm - Read Palm Database files
369
370=head1 SYNOPSIS
371
372This module is used by Image::ExifTool
373
374=head1 DESCRIPTION
375
376This module contains code to extract metadata from Palm database files (PDB
377and PRC extensions), Mobipocket electronic books (MOBI), and Amazon Kindle
378KF7 and KF8 books (AZW and AZW3).
379
380=head1 AUTHOR
381
382Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
383
384This library is free software; you can redistribute it and/or modify it
385under the same terms as Perl itself.
386
387=head1 REFERENCES
388
389=over 4
390
391=item L<http://wiki.mobileread.com/wiki/PDB>
392
393=item L<http://wiki.mobileread.com/wiki/MOBI>
394
395=back
396
397=head1 SEE ALSO
398
399L<Image::ExifTool::TagNames/Palm Tags>,
400L<Image::ExifTool(3pm)|Image::ExifTool>
401
402=cut
403
Note: See TracBrowser for help on using the repository browser.