source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/VCard.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

File size: 17.3 KB
Line 
1#------------------------------------------------------------------------------
2# File: VCard.pm
3#
4# Description: Read vCard and iCalendar meta information
5#
6# Revisions: 2015/04/05 - P. Harvey Created
7# 2015/05/02 - PH Added iCalendar support
8#
9# References: 1) http://en.m.wikipedia.org/wiki/VCard
10# 2) http://tools.ietf.org/html/rfc6350
11# 3) http://tools.ietf.org/html/rfc5545
12#------------------------------------------------------------------------------
13
14package Image::ExifTool::VCard;
15
16use strict;
17use vars qw($VERSION);
18use Image::ExifTool qw(:DataAccess :Utils);
19
20$VERSION = '1.06';
21
22my %unescapeVCard = ( '\\'=>'\\', ','=>',', 'n'=>"\n", 'N'=>"\n" );
23
24# lookup for iCalendar components (used to generate family 1 group names if top level)
25my %isComponent = ( Event=>1, Todo=>1, Journal=>1, Freebusy=>1, Timezone=>1, Alarm=>1 );
26
27my %timeInfo = (
28 # convert common date/time formats to EXIF style
29 ValueConv => q{
30 $val =~ s/(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})(Z?)/$1:$2:$3 $4:$5:$6$7/g;
31 $val =~ s/(\d{4})(\d{2})(\d{2})/$1:$2:$3/g;
32 $val =~ s/(\d{4})-(\d{2})-(\d{2})/$1:$2:$3/g;
33 return $val;
34 },
35 PrintConv => '$self->ConvertDateTime($val)',
36);
37
38# vCard tags (ref 1/2/PH)
39# Note: The case of all tag ID's is normalized to lowercase with uppercase first letter
40%Image::ExifTool::VCard::Main = (
41 GROUPS => { 2 => 'Document' },
42 VARS => { NO_LOOKUP => 1 }, # omit tags from lookup
43 NOTES => q{
44 This table lists common vCard tags, but ExifTool will also extract any other
45 vCard tags found. Tag names may have "Pref" added to indicate the preferred
46 instance of a vCard property, and other "TYPE" parameters may also added to
47 the tag name. VCF files may contain multiple vCard entries which are
48 distinguished by the ExifTool family 3 group name (document number). See
49 L<http://tools.ietf.org/html/rfc6350> for the vCard 4.0 specification.
50 },
51 Version => { Name => 'VCardVersion', Description => 'VCard Version' },
52 Fn => { Name => 'FormattedName', Groups => { 2 => 'Author' } },
53 N => { Name => 'Name', Groups => { 2 => 'Author' } },
54 Bday => { Name => 'Birthday', Groups => { 2 => 'Time' }, %timeInfo },
55 Tz => { Name => 'TimeZone', Groups => { 2 => 'Time' } },
56 Adr => { Name => 'Address', Groups => { 2 => 'Location' } },
57 Geo => {
58 Name => 'Geolocation',
59 Groups => { 2 => 'Location' },
60 # when used as a parameter, VCard 4.0 adds a "geo:" prefix that we need to remove
61 ValueConv => '$val =~ s/^geo://; $val',
62 },
63 Anniversary => { },
64 Email => { },
65 Gender => { },
66 Impp => 'IMPP',
67 Lang => 'Language',
68 Logo => { },
69 Nickname => { },
70 Note => { },
71 Org => 'Organization',
72 Photo => { Groups => { 2 => 'Preview' } },
73 Prodid => 'Software',
74 Rev => 'Revision',
75 Sound => { },
76 Tel => 'Telephone',
77 Title => 'JobTitle',
78 Uid => 'UID',
79 Url => 'URL',
80 'X-ablabel' => { Name => 'ABLabel', PrintConv => '$val =~ s/^_\$!<(.*)>!\$_$/$1/; $val' },
81 'X-abdate' => { Name => 'ABDate', Groups => { 2 => 'Time' }, %timeInfo },
82 'X-aim' => 'AIM',
83 'X-icq' => 'ICQ',
84 'X-abuid' => 'AB_UID',
85 'X-abrelatednames' => 'ABRelatedNames',
86 'X-socialprofile' => 'SocialProfile',
87);
88
89%Image::ExifTool::VCard::VCalendar = (
90 GROUPS => { 1 => 'VCalendar', 2 => 'Document' },
91 VARS => {
92 NO_LOOKUP => 1, # omit tags from lookup
93 LONG_TAGS => 6, # some X-microsoft tags have unavoidably long ID's
94 },
95 NOTES => q{
96 The VCard module is also used to process iCalendar ICS files since they use
97 a format similar to vCard. The following table lists standard iCalendar
98 tags, but any existing tags will be extracted. Top-level iCalendar
99 components (eg. Event, Todo, Timezone, etc.) are used for the family 1 group
100 names, and embedded components (eg. Alarm) are added as a prefix to the tag
101 name. See L<http://tools.ietf.org/html/rfc5545> for the official iCalendar
102 2.0 specification.
103 },
104 Version => { Name => 'VCalendarVersion', Description => 'VCalendar Version' },
105 Calscale => 'CalendarScale',
106 Method => { },
107 Prodid => 'Software',
108 Attach => 'Attachment',
109 Categories => { },
110 Class => 'Classification',
111 Comment => { },
112 Description => { },
113 Geo => {
114 Name => 'Geolocation',
115 Groups => { 2 => 'Location' },
116 ValueConv => '$val =~ s/^geo://; $val',
117 },
118 Location => { Name => 'Location', Groups => { 2 => 'Location' } },
119 'Percent-complete' => 'PercentComplete',
120 Priority => { },
121 Resources => { },
122 Status => { },
123 Summary => { },
124 Completed => { Name => 'DateTimeCompleted', Groups => { 2 => 'Time' }, %timeInfo },
125 Dtend => { Name => 'DateTimeEnd', Groups => { 2 => 'Time' }, %timeInfo },
126 Due => { Name => 'DateTimeDue', Groups => { 2 => 'Time' }, %timeInfo },
127 Dtstart => { Name => 'DateTimeStart', Groups => { 2 => 'Time' }, %timeInfo },
128 Duration => { },
129 Freebusy => 'FreeBusyTime',
130 Transp => 'TimeTransparency',
131 Tzid => { Name => 'TimezoneID', Groups => { 2 => 'Time' } },
132 Tzname => { Name => 'TimezoneName', Groups => { 2 => 'Time' } },
133 Tzoffsetfrom=> { Name => 'TimezoneOffsetFrom', Groups => { 2 => 'Time' } },
134 Tzoffsetto => { Name => 'TimezoneOffsetTo', Groups => { 2 => 'Time' } },
135 Tzurl => { Name => 'TimeZoneURL', Groups => { 2 => 'Time' } },
136 Attendee => { },
137 Contact => { },
138 Organizer => { },
139 'Recurrence-id' => 'RecurrenceID',
140 'Related-to' => 'RelatedTo',
141 Url => 'URL',
142 Uid => 'UID',
143 Exdate => { Name => 'ExceptionDateTimes', Groups => { 2 => 'Time' }, %timeInfo },
144 Rdate => { Name => 'RecurrenceDateTimes', Groups => { 2 => 'Time' }, %timeInfo },
145 Rrule => { Name => 'RecurrenceRule', Groups => { 2 => 'Time' } },
146 Action => { },
147 Repeat => { },
148 Trigger => { },
149 Created => { Name => 'DateCreated', Groups => { 2 => 'Time' }, %timeInfo },
150 Dtstamp => { Name => 'DateTimeStamp', Groups => { 2 => 'Time' }, %timeInfo },
151 'Last-modified' => { Name => 'ModifyDate', Groups => { 2 => 'Time' }, %timeInfo },
152 Sequence => 'SequenceNumber',
153 'Request-status' => 'RequestStatus',
154 Acknowledged=> { Name => 'Acknowledged', Groups => { 2 => 'Time' }, %timeInfo },
155#
156# Observed X-tags (not a comprehensive list):
157#
158 'X-apple-calendar-color'=> 'CalendarColor',
159 'X-apple-default-alarm' => 'DefaultAlarm',
160 'X-apple-local-default-alarm' => 'LocalDefaultAlarm',
161 'X-microsoft-cdo-appt-sequence' => 'AppointmentSequence',
162 'X-microsoft-cdo-ownerapptid' => 'OwnerAppointmentID',
163 'X-microsoft-cdo-busystatus' => 'BusyStatus',
164 'X-microsoft-cdo-intendedstatus' => 'IntendedBusyStatus',
165 'X-microsoft-cdo-alldayevent' => 'AllDayEvent',
166 'X-microsoft-cdo-importance' => {
167 Name => 'Importance',
168 PrintConv => {
169 0 => 'Low',
170 1 => 'Normal',
171 2 => 'High',
172 },
173 },
174 'X-microsoft-cdo-insttype' => {
175 Name => 'InstanceType',
176 PrintConv => {
177 0 => 'Non-recurring Appointment',
178 1 => 'Recurring Appointment',
179 2 => 'Single Instance of Recurring Appointment',
180 3 => 'Exception to Recurring Appointment',
181 },
182 },
183 'X-microsoft-donotforwardmeeting' => 'DoNotForwardMeeting',
184 'X-microsoft-disallow-counter' => 'DisallowCounterProposal',
185 'X-microsoft-locations' => { Name => 'MeetingLocations', Groups => { 2 => 'Location' } },
186 'X-wr-caldesc' => 'CalendarDescription',
187 'X-wr-calname' => 'CalendarName',
188 'X-wr-relcalid' => 'CalendarID',
189 'X-wr-timezone' => { Name => 'TimeZone2', Groups => { 2 => 'Time' } },
190 'X-wr-alarmuid' => 'AlarmUID',
191);
192
193#------------------------------------------------------------------------------
194# Get vCard tag, creating if necessary
195# Inputs: 0) ExifTool ref, 1) tag table ref, 2) tag ID, 3) tag Name,
196# 4) source tagInfo ref, 5) lang code
197# Returns: tagInfo ref
198sub GetVCardTag($$$$;$$)
199{
200 my ($et, $tagTablePtr, $tag, $name, $srcInfo, $langCode) = @_;
201 my $tagInfo = $$tagTablePtr{$tag};
202 unless ($tagInfo) {
203 if ($srcInfo) {
204 $tagInfo = { %$srcInfo };
205 } else {
206 $tagInfo = { };
207 $et->VPrint(0, $$et{INDENT}, "[adding $tag]\n");
208 }
209 $$tagInfo{Name} = $name;
210 delete $$tagInfo{Description}; # create new description
211 AddTagToTable($tagTablePtr, $tag, $tagInfo);
212 }
213 # handle alternate languages (the "language" parameter)
214 $tagInfo = Image::ExifTool::GetLangInfo($tagInfo, $langCode) if $langCode;
215 return $tagInfo;
216}
217
218#------------------------------------------------------------------------------
219# Decode vCard text
220# Inputs: 0) ExifTool ref, 1) vCard text, 2) encoding
221# Returns: decoded text (or array ref for a list of values)
222sub DecodeVCardText($$;$)
223{
224 my ($et, $val, $enc) = @_;
225 $enc = defined($enc) ? lc $enc : '';
226 if ($enc eq 'b' or $enc eq 'base64') {
227 require Image::ExifTool::XMP;
228 $val = Image::ExifTool::XMP::DecodeBase64($val);
229 } else {
230 if ($enc eq 'quoted-printable') {
231 # convert "=HH" hex codes to characters
232 $val =~ s/=([0-9a-f]{2})/chr(hex($1))/ige;
233 }
234 $val = $et->Decode($val, 'UTF8'); # convert from UTF-8
235 # convert unescaped commas to nulls to separate list items
236 $val =~ s/(\\.)|(,)/$1 || "\0"/sge;
237 # unescape necessary characters in value
238 $val =~ s/\\(.)/$unescapeVCard{$1}||$1/sge;
239 # split into list if necessary
240 my @vals = split /\0/, $val;
241 $val = \@vals if @vals > 1;
242 }
243 return $val;
244}
245
246#------------------------------------------------------------------------------
247# Read information in a vCard file
248# Inputs: 0) ExifTool ref, 1) dirInfo ref
249# Returns: 1 on success, 0 if this wasn't a valid vCard file
250sub ProcessVCard($$)
251{
252 local $_;
253 my ($et, $dirInfo) = @_;
254 my $raf = $$dirInfo{RAF};
255 my ($buff, $val, $ok, $component, %compNum, @count);
256
257 return 0 unless $raf->Read($buff, 24) and $raf->Seek(0,0) and $buff=~/^BEGIN:(VCARD|VCALENDAR)\r\n/i;
258 my ($type, $lbl, $tbl, $ext) = uc($1) eq 'VCARD' ? qw(VCard vCard Main VCF) : qw(ICS iCalendar VCalendar ICS);
259 $et->SetFileType($type, undef, $ext);
260 return 1 if $$et{OPTIONS}{FastScan} and $$et{OPTIONS}{FastScan} == 3;
261 local $/ = "\r\n";
262 my $tagTablePtr = GetTagTable("Image::ExifTool::VCard::$tbl");
263 my $more = $raf->ReadLine($buff); # read first line
264 chomp $buff if $more;
265 while ($more) {
266 # retrieve previous line from $buff
267 $val = $buff if defined $buff;
268 # read ahead to next line to see if is a continuation
269 $more = $raf->ReadLine($buff);
270 if ($more) {
271 chomp $buff;
272 # add continuation line if necessary
273 $buff =~ s/^[ \t]// and $val .= $buff, undef($buff), next;
274 }
275 if ($val =~ /^(BEGIN|END):(V?)(\w+)$/i) {
276 my ($begin, $v, $what) = ((lc($1) eq 'begin' ? 1 : 0), $2, ucfirst lc $3);
277 if ($what eq 'Card' or $what eq 'Calendar') {
278 if ($begin) {
279 @count = ( { } ); # reset group counters
280 } else {
281 $ok = 1; # ok if we read at least on full VCARD or VCALENDAR
282 }
283 next;
284 }
285 # absorb top-level component into family 1 group name
286 if ($isComponent{$what}) {
287 if ($begin) {
288 unless ($component) {
289 # begin a new top-level component
290 @count = ( { } );
291 $component = $what;
292 $compNum{$component} = ($compNum{$component} || 0) + 1;
293 next;
294 }
295 } elsif ($component and $component eq $what) {
296 # this top-level component has ended
297 undef $component;
298 next;
299 }
300 }
301 # keep count of each component at this level
302 if ($begin) {
303 $count[-1]{$what} = ($count[-1]{$what} || 0) + 1 if $v;
304 push @count, { obj => $what };
305 } elsif (@count > 1) {
306 pop @count;
307 }
308 next;
309 } elsif ($ok) {
310 $ok = 0;
311 $$et{DOC_NUM} = ++$$et{DOC_COUNT}; # read next card as a new document
312 }
313 unless ($val =~ s/^([-A-Za-z0-9.]+)//) {
314 $et->WarnOnce("Unrecognized line in $lbl file");
315 next;
316 }
317 my $tag = $1;
318 # set group if it exists
319 if ($tag =~ s/^([-A-Za-z0-9]+)\.//) {
320 $$et{SET_GROUP1} = ucfirst lc $1;
321 } elsif ($component) {
322 $$et{SET_GROUP1} = $component . $compNum{$component};
323 } else {
324 delete $$et{SET_GROUP1};
325 }
326 my ($name, %param, $p);
327 # vCard tag ID's are case-insensitive, so normalize to lowercase with
328 # an uppercase first letter for use as a tag name
329 $name = ucfirst $tag if $tag =~ /[a-z]/; # preserve mixed case in name if it exists
330 $tag = ucfirst lc $tag;
331 # get source tagInfo reference
332 my $srcInfo = $et->GetTagInfo($tagTablePtr, $tag);
333 if ($srcInfo) {
334 $name = $$srcInfo{Name}; # use our name
335 } else {
336 $name or $name = $tag;
337 # remove leading "X-" from name if it exists
338 $name =~ s/^X-// and $name = ucfirst $name;
339 }
340 # add object name(s) to tag if necessary
341 if (@count > 1) {
342 my $i;
343 for ($i=$#count-1; $i>=0; --$i) {
344 my $pre = $count[$i-1]{obj}; # use containing object name as tag prefix
345 my $c = $count[$i]{$pre}; # add index for object number
346 $c = '' unless defined $c;
347 $tag = $pre . $c . $tag;
348 $name = $pre . $c . $name;
349 }
350 }
351 # parse parameters
352 while ($val =~ s/^;([-A-Za-z0-9]*)(=?)//) {
353 $p = ucfirst lc $1;
354 # convert old vCard 2.x parameters to the new "TYPE=" format
355 $2 or $val = $1 . $val, $p = 'Type';
356 # read parameter value
357 for (;;) {
358 last unless $val =~ s/^"([^"]*)",?// or $val =~ s/^([^";:,]+,?)//;
359 my $v = $p eq 'Type' ? ucfirst lc $1 : $1;
360 $param{$p} = defined($param{$p}) ? $param{$p} . $v : $v;
361 }
362 if (defined $param{$p}) {
363 $param{$p} =~ s/\\(.)/$unescapeVCard{$1}||$1/sge;
364 } else {
365 $param{$p} = '';
366 }
367 }
368 $val =~ s/^:// or $et->WarnOnce("Invalid line in $lbl file"), next;
369 # add 'Type' parameter to id and name if it exists
370 $param{Type} and $tag .= $param{Type}, $name .= $param{Type};
371 # convert base64-encoded data
372 if ($val =~ s{^data:(\w+)/(\w+);base64,}{}) {
373 my $xtra = ucfirst(lc $1) . ucfirst(lc $2);
374 $tag .= $xtra;
375 $name .= $xtra;
376 $param{Encoding} = 'base64';
377 }
378 $val = DecodeVCardText($et, $val, $param{Encoding});
379 my $tagInfo = GetVCardTag($et, $tagTablePtr, $tag, $name, $srcInfo, $param{Language});
380 $et->HandleTag($tagTablePtr, $tag, $val, TagInfo => $tagInfo);
381 # handle some other parameters that we care about (ignore the rest for now)
382 foreach $p (qw(Geo Label Tzid)) {
383 next unless defined $param{$p};
384 # use tag attributes from our table if it exists
385 my $srcTag2 = $et->GetTagInfo($tagTablePtr, $p);
386 my $pn = $srcTag2 ? $$srcTag2{Name} : $p;
387 $val = DecodeVCardText($et, $param{$p});
388 # add parameter to tag ID and name
389 my ($tg, $nm) = ($tag . $p, $name . $pn);
390 $tagInfo = GetVCardTag($et, $tagTablePtr, $tg, $nm, $srcTag2, $param{Language});
391 $et->HandleTag($tagTablePtr, $tg, $val, TagInfo => $tagInfo);
392 }
393 }
394 delete $$et{SET_GROUP1};
395 delete $$et{DOC_NUM};
396 $ok or $et->Warn("Missing $lbl end");
397 return 1;
398}
399
4001; # end
401
402__END__
403
404=head1 NAME
405
406Image::ExifTool::VCard - Read vCard and iCalendar meta information
407
408=head1 SYNOPSIS
409
410This module is used by Image::ExifTool
411
412=head1 DESCRIPTION
413
414This module contains definitions required by Image::ExifTool to read meta
415information from vCard VCF and iCalendar ICS files.
416
417=head1 AUTHOR
418
419Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
420
421This library is free software; you can redistribute it and/or modify it
422under the same terms as Perl itself.
423
424=head1 REFERENCES
425
426=over 4
427
428=item L<http://en.m.wikipedia.org/wiki/VCard>
429
430=item L<http://tools.ietf.org/html/rfc6350>
431
432=item L<http://tools.ietf.org/html/rfc5545>
433
434=back
435
436=head1 SEE ALSO
437
438L<Image::ExifTool::TagNames/VCard Tags>,
439L<Image::ExifTool::TagNames/VCard VCalendar Tags>,
440L<Image::ExifTool(3pm)|Image::ExifTool>
441
442=cut
443
Note: See TracBrowser for help on using the repository browser.