source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/Audible.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields, which is a regex that lists the metadata fields to apply join_before_split to; previously join_before_split was always applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked that it works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it, and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 11.5 KB
Line 
#------------------------------------------------------------------------------
# File:         Audible.pm
#
# Description:  Read metadata from Audible audio books
#
# Revisions:    2015/04/05 - P. Harvey Created
#
# References:   1) https://github.com/jteeuwen/audible
#               2) https://code.google.com/p/pyaudibletags/
#               3) http://wiki.multimedia.cx/index.php?title=Audible_Audio
#------------------------------------------------------------------------------

package Image::ExifTool::Audible;

use strict;
use vars qw($VERSION);
# the unqualified helpers called in this file (Get8u/Get16u/Get32u,
# SetByteOrder, GetTagTable, AddTagToTable) are brought in by this import
use Image::ExifTool qw(:DataAccess :Utils);

$VERSION = '1.02';

# forward declarations: the 'meta' and 'cvrx' tag tables below store
# references to these routines (PROCESS_PROC) ahead of their definitions
sub ProcessAudible_meta($$$);
sub ProcessAudible_cvrx($$$);

# tags extracted from Audible .AA files by ProcessAA()
# - keys without a leading underscore are tag strings read from the metadata
#   dictionary (chunk 2); ProcessAA adds any unlisted ones at run time
# - keys with a leading underscore are synthetic IDs used by ProcessAA for
#   values taken from other chunks
%Image::ExifTool::Audible::Main = (
    GROUPS => { 2 => 'Audio' },
    NOTES => q{
        ExifTool will extract any information found in the metadata dictionary of
        Audible .AA files, even if not listed in the table below.
    },
    # tags found in the metadata dictionary (chunk 2)
    pubdate        => { Name => 'PublishDate',      Groups => { 2 => 'Time' } },
    pub_date_start => { Name => 'PublishDateStart', Groups => { 2 => 'Time' } },
    author         => { Name => 'Author',           Groups => { 2 => 'Author' } },
    copyright      => { Name => 'Copyright',        Groups => { 2 => 'Author' } },
    # also seen (ref PH):
    # product_id, parent_id, title, provider, narrator, price, description,
    # long_description, short_title, is_aggregation, title_id, codec, HeaderSeed,
    # EncryptedBlocks, HeaderKey, license_list, CPUType, license_count, <12 hex digits>,
    # parent_short_title, parent_title, aggregation_id, short_description, user_alias

    # information extracted from other chunks
    _chapter_count => { Name => 'ChapterCount' }, # from chunk 6
    _cover_art => { # from chunk 11
        Name => 'CoverArt',
        Groups => { 2 => 'Preview' },
        Binary => 1,
    },
);

# 'tags' atoms observed in Audible .m4b audio books (ref PH)
# - each entry is a sub-directory handled by another table in this module
%Image::ExifTool::Audible::tags = (
    GROUPS => { 0 => 'QuickTime', 2 => 'Audio' },
    NOTES => 'Information found in "tags" atom of Audible M4B audio books.',
    meta => {
        Name => 'Audible_meta',
        SubDirectory => { TagTable => 'Image::ExifTool::Audible::meta' },
    },
    cvrx => {
        Name => 'Audible_cvrx',
        SubDirectory => { TagTable => 'Image::ExifTool::Audible::cvrx' },
    },
    tseg => {
        Name => 'Audible_tseg',
        SubDirectory => { TagTable => 'Image::ExifTool::Audible::tseg' },
    },
);

# 'meta' information observed in Audible .m4b audio books (ref PH)
# - keys are the tag ID strings exactly as stored in the file (case varies)
# - parsed by ProcessAudible_meta(), which adds unlisted tag IDs at run time
%Image::ExifTool::Audible::meta = (
    PROCESS_PROC => \&ProcessAudible_meta,
    GROUPS => { 0 => 'QuickTime', 2 => 'Audio' },
    NOTES => 'Information found in Audible M4B "meta" atom.',
    Album           => 'Album',
    ALBUMARTIST     => { Name => 'AlbumArtist', Groups => { 2 => 'Author' } },
    Artist          => { Name => 'Artist',      Groups => { 2 => 'Author' } },
    Comment         => 'Comment',
    Genre           => 'Genre',
    itunesmediatype => { Name => 'iTunesMediaType', Description => 'iTunes Media Type' },
    SUBTITLE        => 'Subtitle',
    Title           => 'Title',
    TOOL            => 'CreatorTool',
    Year            => { Name => 'Year', Groups => { 2 => 'Time' } },
    track           => 'ChapterName', # (found in 'meta' of 'tseg' atom)
);

# 'cvrx' information observed in Audible .m4b audio books (ref PH)
# - the two tag IDs here are the names ProcessAudible_cvrx() passes to
#   HandleTag; they are not identifiers read from the file
%Image::ExifTool::Audible::cvrx = (
    PROCESS_PROC => \&ProcessAudible_cvrx,
    GROUPS => { 0 => 'QuickTime', 2 => 'Audio' },
    NOTES => 'Audible cover art information in M4B audio books.',
    VARS => { NO_ID => 1 },
    CoverArtType => 'CoverArtType',
    CoverArt => {
        Name => 'CoverArt',
        Groups => { 2 => 'Preview' },
        Binary => 1,
    },
);

# 'tseg' information observed in Audible .m4b audio books (ref PH)
# - the nested 'meta' entry reuses the Audible::meta table (which is where
#   the 'track' => ChapterName tag is defined)
%Image::ExifTool::Audible::tseg = (
    GROUPS => { 0 => 'QuickTime', 2 => 'Audio' },
    tshd => {
        Name => 'ChapterNumber',
        Format => 'int32u',
        ValueConv => '$val + 1',    # start counting from 1
    },
    meta => {
        Name => 'Audible_meta2',
        SubDirectory => { TagTable => 'Image::ExifTool::Audible::meta' },
    },
);

#------------------------------------------------------------------------------
# Process Audible 'meta' tags from M4B files (ref PH)
# Inputs: 0) ExifTool object ref, 1) dirInfo ref, 2) tag table ref
# Returns: 1 on success, 0 if the data is too short to hold the entry count
# Notes: - layout as parsed here: int32u entry count, then for each entry:
#          1 flag byte (0x80 or 0x00), int16u tag length, tag ID string,
#          int16u version (must be 1), int32u value size, value bytes
#        - parsing stops silently (still returning 1) at the first entry that
#          doesn't match this layout or would run past the end of the data
#        - the byte order is not set here (presumably already set to big-endian
#          by the calling QuickTime code -- not visible in this file)
sub ProcessAudible_meta($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $dataPt = $$dirInfo{DataPt};
    my $dataPos = $$dirInfo{DataPos};
    my $dirLen = length $$dataPt;
    return 0 if $dirLen < 4;
    my $num = Get32u($dataPt, 0);                   # number of entries
    $et->VerboseDir('Audible_meta', $num);
    my $pos = 4;                                    # first entry follows the count
    # (C-style loop kept on purpose: $num comes from the file and may be
    #  too large for a range expression on 32-bit perls)
    for (my $index=0; $index<$num; ++$index) {
        last if $pos + 3 > $dirLen;                 # need flag byte + tag length
        my $unk = Get8u($dataPt, $pos);             # ? (0x80 or 0x00)
        last unless $unk == 0x80 or $unk == 0x00;
        my $len = Get16u($dataPt, $pos + 1);        # tag length
        $pos += 3;
        # need tag ID + version (2 bytes) + data size (4 bytes); empty ID is invalid
        last if $pos + $len + 6 > $dirLen or not $len;
        my $tag = substr($$dataPt, $pos, $len);     # tag ID
        my $ver = Get16u($dataPt, $pos + $len);     # version?
        last unless $ver == 0x0001;
        my $size = Get32u($dataPt, $pos + $len + 2);# data size
        $pos += $len + 6;                           # now at start of value
        last if $pos + $size > $dirLen;
        my $val = $et->Decode(substr($$dataPt, $pos, $size), 'UTF8');
        unless ($$tagTablePtr{$tag}) {
            # unknown tag: generate a name from the ID (all-caps IDs are
            # lower-cased first) and add it to the table
            my $name = Image::ExifTool::MakeTagName(($tag =~ /[a-z]/) ? $tag : lc($tag));
            AddTagToTable($tagTablePtr, $tag, { Name => $name });
        }
        $et->HandleTag($tagTablePtr, $tag, $val,
            DataPt  => $dataPt,
            DataPos => $dataPos,
            Start   => $pos,
            Size    => $size,
            Index   => $index,
        );
        $pos += $size;                              # step to next entry
    }
    return 1;
}

#------------------------------------------------------------------------------
# Process Audible 'cvrx' cover art atom from M4B files (ref PH)
# Inputs: 0) ExifTool object ref, 1) dirInfo ref, 2) tag table ref
# Returns: 1 on success, 0 if the data is too short for the layout below
# Notes: - layout as parsed here: 8 bytes (not interpreted), int16u length of
#          the cover-art type string at offset 0x08, the type string at 0x0a,
#          then 2 bytes (not interpreted), int32u image size, image data
#        - both tags are passed to HandleTag with an undef value plus
#          DataPt/Start/Size (HandleTag is then expected to take the value
#          from the data -- its implementation is not visible in this file)
sub ProcessAudible_cvrx($$$)
{
    my ($et, $dirInfo, $tagTablePtr) = @_;
    my $dataPt = $$dirInfo{DataPt};
    my $dataPos = $$dirInfo{DataPos};
    my $dirLen = length $$dataPt;
    my $typePos = 0x0a;                         # start of type string
    return 0 if $typePos > $dirLen;             # need the type-length word at 0x08
    my $len = Get16u($dataPt, 0x08);            # length of type string
    my $artPos = $typePos + $len + 6;           # start of image data
    return 0 if $artPos > $dirLen;
    my $size = Get32u($dataPt, $typePos + $len + 2);    # image size
    return 0 if $artPos + $size > $dirLen;
    $et->VerboseDir('Audible_cvrx', undef, $dirLen);
    $et->HandleTag($tagTablePtr, 'CoverArtType', undef,
        DataPt  => $dataPt,
        DataPos => $dataPos,
        Start   => $typePos,
        Size    => $len,
    );
    $et->HandleTag($tagTablePtr, 'CoverArt', undef,
        DataPt  => $dataPt,
        DataPos => $dataPos,
        Start   => $artPos,
        Size    => $size,
    );
    return 1;
}

#------------------------------------------------------------------------------
# Read information from an Audible .AA file
# Inputs: 0) ExifTool ref, 1) dirInfo ref
# Returns: 1 on success, 0 if this wasn't a valid AA file
# Notes: - 16-byte header as parsed here: int32u file size, 4-byte magic number,
#          int32u number of table-of-contents (TOC) entries, 4 bytes not used
#        - each 12-byte TOC entry is: int32u chunk type, int32u file offset,
#          int32u chunk length; only chunk types 2 (metadata dictionary),
#          6 (offset table -> chapter count) and 11 (cover art) are read
#        - once the file is recognized, problems are reported via Warn() and
#          the routine still returns 1
sub ProcessAA($$)
{
    my ($et, $dirInfo) = @_;
    my $raf = $$dirInfo{RAF};
    my ($buff, $toc);

    # check magic number
    return 0 unless $raf->Read($buff, 16) == 16 and $buff=~/^.{4}\x57\x90\x75\x36/s;
    # check file size
    if (defined $$et{VALUE}{FileSize}) {
        # first 4 bytes of the file should be the filesize
        unpack('N', $buff) == $$et{VALUE}{FileSize} or return 0;
    }
    $et->SetFileType();
    SetByteOrder('MM');
    my $bytes = 12 * Get32u(\$buff, 8); # table of contents size in bytes
    if ($bytes > 0xc00) {
        $et->Warn('Invalid TOC');
        return 1;
    }
    # read the table of contents
    unless ($raf->Read($toc, $bytes) == $bytes) {
        $et->Warn('Truncated TOC');
        return 1;
    }
    my $tagTablePtr = GetTagTable('Image::ExifTool::Audible::Main');
    # parse table of contents (in $toc)
    for (my $entry=0; $entry<$bytes; $entry+=12) {
        my $type = Get32u(\$toc, $entry);
        next unless $type == 2 or $type == 6 or $type == 11;
        my $offset = Get32u(\$toc, $entry + 4);
        my $length = Get32u(\$toc, $entry + 8) or next; # (skip empty chunks)
        unless ($raf->Seek($offset, 0)) {
            $et->Warn("Chunk $type seek error");
            last;
        }
        if ($type == 6) { # offset table
            next if $length < 4 or $raf->Read($buff, 4) != 4; # only read the chapter count
            $et->HandleTag($tagTablePtr, '_chapter_count', Get32u(\$buff, 0));
            next;
        }
        # read the chunk
        if ($length > 100000000) {
            $et->Warn("Chunk $type too big");
            next;
        }
        unless ($raf->Read($buff, $length) == $length) {
            $et->Warn("Chunk $type read error");
            last;
        }
        if ($type == 11) { # cover art
            next if $length < 8;
            my $len = Get32u(\$buff, 0);    # image length
            my $off = Get32u(\$buff, 4);    # image offset (relative to start of file)
            # image must start after this 8-byte header and end inside the chunk
            next if $off < $offset + 8 or $off - $offset + $len > $length;
            $et->HandleTag($tagTablePtr, '_cover_art', substr($buff, $off-$offset, $len));
            next;
        }
        # parse metadata dictionary (in $buff)
        if ($length < 4) {
            $et->Warn('Bad dictionary');
            next;
        }
        my $num = Get32u(\$buff, 0);
        if ($num > 0x200) {
            $et->Warn('Bad dictionary count');
            next;
        }
        my $pos = 4;    # dictionary starts immediately after count
        require Image::ExifTool::HTML;  # (for UnescapeHTML)
        $et->VerboseDir('Audible Metadata', $num);
        for (my $i=0; $i<$num; ++$i) {
            my $tagPos = $pos + 9;  # position of tag string
            if ($tagPos > $length) {
                $et->Warn('Truncated dictionary');
                last;
            }
            # (1 unknown byte ignored at start of each dictionary entry)
            my $tagLen = Get32u(\$buff, $pos + 1);  # tag string length
            my $valLen = Get32u(\$buff, $pos + 5);  # value string length
            my $valPos = $tagPos + $tagLen;         # position of value string
            my $nxtPos = $valPos + $valLen;         # position of next entry
            if ($nxtPos > $length) {
                $et->Warn('Bad dictionary entry');
                last;
            }
            my $tag = substr($buff, $tagPos, $tagLen);
            my $val = substr($buff, $valPos, $valLen);
            unless ($$tagTablePtr{$tag}) {
                my $name = Image::ExifTool::MakeTagName($tag);
                $name =~ s/_(.)/\U$1/g; # change from underscore-separated to mixed case
                AddTagToTable($tagTablePtr, $tag, { Name => $name });
            }
            # unescape HTML character references and convert from UTF-8
            $val = $et->Decode(Image::ExifTool::HTML::UnescapeHTML($val), 'UTF8');
            $et->HandleTag($tagTablePtr, $tag, $val,
                DataPos => $offset,
                DataPt  => \$buff,
                Start   => $valPos,
                Size    => $valLen,
                Index   => $i,
            );
            $pos = $nxtPos; # step to next dictionary entry
        }
    }
    return 1;
}

1;  # end

__END__

=head1 NAME

Image::ExifTool::Audible - Read meta information from Audible audio books

=head1 SYNOPSIS

This module is used by Image::ExifTool

=head1 DESCRIPTION

This module contains definitions required by Image::ExifTool to read meta
information from Audible audio books.

=head1 AUTHOR

Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 REFERENCES

=over 4

=item L<https://github.com/jteeuwen/audible>

=item L<https://code.google.com/p/pyaudibletags/>

=item L<http://wiki.multimedia.cx/index.php?title=Audible_Audio>

=back

=head1 SEE ALSO

L<Image::ExifTool::TagNames/Audible Tags>,
L<Image::ExifTool(3pm)|Image::ExifTool>

=cut

Note: See TracBrowser for help on using the repository browser.