1 | #------------------------------------------------------------------------------
|
---|
2 | # File: RTF.pm
|
---|
3 | #
|
---|
4 | # Description: Read Rich Text Format meta information
|
---|
5 | #
|
---|
6 | # Revisions: 2010/06/17 - P. Harvey Created
|
---|
7 | #
|
---|
8 | # References: 1) http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc
|
---|
9 | # 2) http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod
|
---|
10 | #------------------------------------------------------------------------------
|
---|
11 |
|
---|
12 | package Image::ExifTool::RTF;
|
---|
13 |
|
---|
14 | use strict;
|
---|
15 | use vars qw($VERSION);
|
---|
16 | use Image::ExifTool qw(:DataAccess :Utils);
|
---|
17 |
|
---|
18 | $VERSION = '1.01';
|
---|
19 |
|
---|
20 | sub ProcessUserProps($$$);
|
---|
21 |
|
---|
22 | # supported RTF character entities
|
---|
23 | my %rtfEntity = (
|
---|
24 | par => 0x0a,
|
---|
25 | tab => 0x09,
|
---|
26 | endash => 0x2013,
|
---|
27 | emdash => 0x2014,
|
---|
28 | lquote => 0x2018,
|
---|
29 | rquote => 0x2019,
|
---|
30 | ldblquote => 0x201c,
|
---|
31 | rdblquote => 0x201d,
|
---|
32 | bullet => 0x2022,
|
---|
33 | );
|
---|
34 |
|
---|
35 | # RTF tags (ref 1)
|
---|
36 | %Image::ExifTool::RTF::Main = (
|
---|
37 | GROUPS => { 2 => 'Document' },
|
---|
38 | NOTES => q{
|
---|
39 | This table lists standard tags of the RTF information group, but ExifTool
|
---|
40 | will also extract any non-standard tags found in this group. As well,
|
---|
41 | ExifTool will extract any custom properties that are found. See
|
---|
42 | L<http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc>
|
---|
43 | for the specification.
|
---|
44 | },
|
---|
45 | title => { },
|
---|
46 | subject => { },
|
---|
47 | author => { Groups => { 2 => 'Author' } },
|
---|
48 | manager => { },
|
---|
49 | company => { },
|
---|
50 | copyright=> { Groups => { 2 => 'Author' } }, # (written by Apple TextEdit)
|
---|
51 | operator => { Name => 'LastModifiedBy' },
|
---|
52 | category => { },
|
---|
53 | keywords => { },
|
---|
54 | comment => { },
|
---|
55 | doccomm => { Name => 'Comments' },
|
---|
56 | hlinkbase=> { Name => 'HyperlinkBase' },
|
---|
57 | creatim => {
|
---|
58 | Name => 'CreateDate',
|
---|
59 | Format => 'date',
|
---|
60 | Groups => { 2 => 'Time' },
|
---|
61 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
62 | },
|
---|
63 | revtim => {
|
---|
64 | Name => 'ModifyDate',
|
---|
65 | Format => 'date',
|
---|
66 | Groups => { 2 => 'Time' },
|
---|
67 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
68 | },
|
---|
69 | printim => {
|
---|
70 | Name => 'LastPrinted',
|
---|
71 | Format => 'date',
|
---|
72 | Groups => { 2 => 'Time' },
|
---|
73 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
74 | },
|
---|
75 | buptim => {
|
---|
76 | Name => 'BackupTime',
|
---|
77 | Format => 'date',
|
---|
78 | Groups => { 2 => 'Time' },
|
---|
79 | PrintConv => '$self->ConvertDateTime($val)',
|
---|
80 | },
|
---|
81 | edmins => {
|
---|
82 | Name => 'TotalEditTime', # in minutes
|
---|
83 | PrintConv => 'ConvertTimeSpan($val, 60)',
|
---|
84 | },
|
---|
85 | nofpages => { Name => 'Pages' },
|
---|
86 | nofwords => { Name => 'Words' },
|
---|
87 | nofchars => { Name => 'Characters' },
|
---|
88 | nofcharsws=>{
|
---|
89 | Name => 'CharactersWithSpaces',
|
---|
90 | Notes => q{
|
---|
91 | according to the 2007 Microsoft RTF specification this is clearly the number
|
---|
92 | of characters NOT including spaces, but Microsoft Word writes this as the
|
---|
93 | number WITH spaces, so ExifTool names this tag according to the de facto
|
---|
94 | standard
|
---|
95 | },
|
---|
96 | },
|
---|
97 | id => { Name => 'InternalIDNumber' },
|
---|
98 | version => { Name => 'RevisionNumber' },
|
---|
99 | vern => { Name => 'InternalVersionNumber' },
|
---|
100 | );
|
---|
101 |
|
---|
102 | # lookup for user-defined properties
|
---|
103 | # (none are pre-defined and this table doesn't appear in the docs)
|
---|
104 | %Image::ExifTool::RTF::UserProps = (
|
---|
105 | GROUPS => { 2 => 'Document' },
|
---|
106 | );
|
---|
107 |
|
---|
108 | #------------------------------------------------------------------------------
|
---|
109 | # Read to nested closing curly bracket "}"
|
---|
110 | # Inputs: 0) data ref, 1) optional RAF ref to read more data if available
|
---|
111 | # Returns: text inside brackets, or undef on error
|
---|
112 | # Notes: On entry the current position in the data must be set to immediately
|
---|
113 | # after the command that opens the bracket. On return the current
|
---|
114 | # position is immediately following the closing brace if the return
|
---|
115 | # value is defined.
|
---|
116 | sub ReadToNested($;$)
|
---|
117 | {
|
---|
118 | my ($dataPt, $raf) = @_;
|
---|
119 | my $pos = pos $$dataPt;
|
---|
120 | my $level = 1;
|
---|
121 | for (;;) {
|
---|
122 | # look for the next bracket
|
---|
123 | unless ($$dataPt =~ /(\\*)([{}])/g) {
|
---|
124 | # must read some more data
|
---|
125 | my $p = length $$dataPt;
|
---|
126 | my $buff;
|
---|
127 | last unless $raf and $raf->Read($buff, 65536);
|
---|
128 | $$dataPt .= $buff;
|
---|
129 | # rewind position to include any leading backslashes
|
---|
130 | --$p while $p and substr($$dataPt, $p - 1, 1) eq '\\';
|
---|
131 | pos($$dataPt) = $p; # set position to continue search
|
---|
132 | next;
|
---|
133 | }
|
---|
134 | # bracket is escaped if preceded by an odd number of backslashes
|
---|
135 | next if $1 and length($1) & 0x01;
|
---|
136 | $2 eq '{' and ++$level, next;
|
---|
137 | next unless --$level <= 0;
|
---|
138 | return substr($$dataPt, $pos, pos($$dataPt) - $pos - 1);
|
---|
139 | }
|
---|
140 | return undef;
|
---|
141 | }
|
---|
142 |
|
---|
143 | #------------------------------------------------------------------------------
|
---|
144 | # Unescape RTF escape sequences
|
---|
145 | # Inputs: 0) ExifTool ref, 1) RTF text, 2) RTF character set (for hex characters)
|
---|
146 | # Returns: Unescaped text (in current ExifTool Charset)
|
---|
147 | sub UnescapeRTF($$$)
|
---|
148 | {
|
---|
149 | my ($exifTool, $val, $charset) = @_;
|
---|
150 |
|
---|
151 | # return now unless we have a control sequence
|
---|
152 | unless ($val =~ /\\/) {
|
---|
153 | $val =~ tr/\n\r//d; # ignore CR's and LF's
|
---|
154 | return $val;
|
---|
155 | }
|
---|
156 | # CR/LF is signficant if it terminates a control sequence (so change these to a space)
|
---|
157 | $val =~ s/(^|[^\\])((?:\\\\)*)(\\[a-zA-Z]+(?:-?\d+)?)[\n\r]/$1$2$3 /g;
|
---|
158 | # protect the newline control sequence by converting to a \par command
|
---|
159 | $val =~ s/(^|[^\\])((?:\\\\)*)(\\[\n\r])/$1$2\\par /g;
|
---|
160 | # all other CR/LF's are ignored (so delete them)
|
---|
161 | $val =~ tr/\n\r//d;
|
---|
162 |
|
---|
163 | my $rtnVal = '';
|
---|
164 | my $len = length $val;
|
---|
165 | my $skip = 1; # default Unicode skip count
|
---|
166 | my $p0 = 0;
|
---|
167 |
|
---|
168 | for (;;) {
|
---|
169 | # find next backslash
|
---|
170 | my $p1 = ($val =~ /\\/g) ? pos($val) : $len + 1;
|
---|
171 | # add text up to start of this control sequence (or up to end)
|
---|
172 | my $n = $p1 - $p0 - 1;
|
---|
173 | $rtnVal .= substr($val, $p0, $n) if $n > 0;
|
---|
174 | # all done if at the end or if control sequence is empty
|
---|
175 | last if $p1 >= $len;
|
---|
176 | # look for an ASCII-letter control word or Unicode control
|
---|
177 | if ($val =~ /\G([a-zA-Z]+)(-?\d+)? ?/g) {
|
---|
178 | # interpret command if recognized
|
---|
179 | if ($1 eq 'uc') { # \ucN
|
---|
180 | $skip = $2;
|
---|
181 | } elsif ($1 eq 'u') { # \uN
|
---|
182 | require Image::ExifTool::Charset;
|
---|
183 | $rtnVal .= Image::ExifTool::Charset::Recompose($exifTool, [$2]);
|
---|
184 | if ($skip) {
|
---|
185 | # must skip the specified number of characters
|
---|
186 | # (not simple because RTF control words count as a single character)
|
---|
187 | last unless $val =~ /\G([^\\]|\\([a-zA-Z]+)(-?\d+)? ?|\\'.{2}|\\.){$skip}/g;
|
---|
188 | }
|
---|
189 | } elsif ($rtfEntity{$1}) {
|
---|
190 | require Image::ExifTool::Charset;
|
---|
191 | $rtnVal .= Image::ExifTool::Charset::Recompose($exifTool, [$rtfEntity{$1}]);
|
---|
192 | } # (else ignore the command)
|
---|
193 | } else {
|
---|
194 | my $ch = substr($val, $p1, 1);
|
---|
195 | if ($ch eq "'") {
|
---|
196 | # hex character code
|
---|
197 | last if $p1 + 3 > $len;
|
---|
198 | my $hex = substr($val, $p1 + 1, 2);
|
---|
199 | if ($hex =~ /^[0-9a-fA-F]{2}$/) {
|
---|
200 | require Image::ExifTool::Charset;
|
---|
201 | $rtnVal .= $exifTool->Decode(chr(hex($hex)), $charset);
|
---|
202 | }
|
---|
203 | pos($val) = $p1 + 3; # skip to after the hex code
|
---|
204 | } else {
|
---|
205 | # assume a standard control symbol (\, {, }, etc)
|
---|
206 | # (note, this may not be valid for some uncommon
|
---|
207 | # control symbols like \~ for non-breaking space)
|
---|
208 | $rtnVal .= $ch;
|
---|
209 | pos($val) = $p1 + 1; # skip to after this character
|
---|
210 | }
|
---|
211 | }
|
---|
212 | $p0 = pos($val);
|
---|
213 | }
|
---|
214 | return $rtnVal;
|
---|
215 | }
|
---|
216 |
|
---|
217 | #------------------------------------------------------------------------------
|
---|
218 | # Read information in a RTF document
|
---|
219 | # Inputs: 0) ExifTool ref, 1) dirInfo ref
|
---|
220 | # Returns: 1 on success, 0 if this wasn't a valid RTF file
|
---|
221 | sub ProcessRTF($$)
|
---|
222 | {
|
---|
223 | my ($exifTool, $dirInfo) = @_;
|
---|
224 | my $raf = $$dirInfo{RAF};
|
---|
225 | my ($buff, $buf2, $cs);
|
---|
226 |
|
---|
227 | return 0 unless $raf->Read($buff, 64) and $raf->Seek(0,0);
|
---|
228 | return 0 unless $buff =~ /^[\n\r]*\{[\n\r]*\\rtf[^a-zA-Z]/;
|
---|
229 | $exifTool->SetFileType();
|
---|
230 | #
|
---|
231 | # determine the RTF character set
|
---|
232 | #
|
---|
233 | if ($buff=~ /\\ansicpg(\d*)/) {
|
---|
234 | $cs = "cp$1";
|
---|
235 | } elsif ($buff=~ /\\(ansi|mac|pc|pca)[^a-zA-Z]/) {
|
---|
236 | my %trans = (
|
---|
237 | ansi => 'Latin',
|
---|
238 | mac => 'MacRoman',
|
---|
239 | pc => 'cp437',
|
---|
240 | pca => 'cp850',
|
---|
241 | );
|
---|
242 | $cs = $trans{$1};
|
---|
243 | } else {
|
---|
244 | $exifTool->Warn('Unspecified RTF encoding. Will assume Latin');
|
---|
245 | $cs = 'Latin';
|
---|
246 | }
|
---|
247 | my $charset = $Image::ExifTool::charsetName{lc $cs};
|
---|
248 | unless ($charset) {
|
---|
249 | $exifTool->Warn("Unsupported RTF encoding $cs. Will assume Latin.");
|
---|
250 | $charset = 'Latin';
|
---|
251 | }
|
---|
252 | my $tagTablePtr = GetTagTable('Image::ExifTool::RTF::Main');
|
---|
253 | undef $buff;
|
---|
254 | #
|
---|
255 | # scan for \info group
|
---|
256 | #
|
---|
257 | for (;;) {
|
---|
258 | $raf->Read($buf2, 65536) or last;
|
---|
259 | if (defined $buff) {
|
---|
260 | # read more but leave some overlap for the match
|
---|
261 | $buff = substr($buff, -16) . $buf2;
|
---|
262 | } else {
|
---|
263 | $buff = $buf2;
|
---|
264 | }
|
---|
265 | next unless $buff =~ /[^\\]\{[\n\r]*\\info([^a-zA-Z])/g;
|
---|
266 | # anything but a space is included in the contents
|
---|
267 | pos($buff) = pos($buff) - 1 if $1 ne ' ';
|
---|
268 | my $info = ReadToNested(\$buff, $raf);
|
---|
269 | unless (defined $info) {
|
---|
270 | $exifTool->Warn('Unterminated information group');
|
---|
271 | last;
|
---|
272 | }
|
---|
273 | # process info commands (ie. "\author", "\*\copyright");
|
---|
274 | while ($info =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
|
---|
275 | pos($info) = pos($info) - 1 if $3 ne ' ';
|
---|
276 | my $tag = $2;
|
---|
277 | my $val = ReadToNested(\$info);
|
---|
278 | last unless defined $val;
|
---|
279 | my $tagInfo = $$tagTablePtr{$tag};
|
---|
280 | if ($tagInfo and $$tagInfo{Format} and $$tagInfo{Format} eq 'date') {
|
---|
281 | # parse RTF date commands
|
---|
282 | my %idx = (yr=>0,mo=>1,dy=>2,hr=>3,min=>4,sec=>5);
|
---|
283 | my @t = (0) x 6;
|
---|
284 | while ($val =~ /\\([a-z]+)(\d+)/g) {
|
---|
285 | next unless defined $idx{$1};
|
---|
286 | $t[$idx{$1}] = $2;
|
---|
287 | }
|
---|
288 | $val = sprintf("%.4d:%.2d:%.2d %.2d:%.2d:%.2d", @t);
|
---|
289 | } else {
|
---|
290 | # unescape RTF string value
|
---|
291 | $val = UnescapeRTF($exifTool, $val, $charset);
|
---|
292 | }
|
---|
293 | # create tagInfo for unknown tags
|
---|
294 | if (not $tagInfo) {
|
---|
295 | Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => ucfirst($tag) });
|
---|
296 | }
|
---|
297 | $exifTool->HandleTag($tagTablePtr, $tag, $val);
|
---|
298 | }
|
---|
299 | }
|
---|
300 | return 1 unless defined $buff;
|
---|
301 | #
|
---|
302 | # scan for \userprops (but don't read more from file to find the start of this command)
|
---|
303 | #
|
---|
304 | pos($buff) = 0;
|
---|
305 | while ($buff =~ /[^\\]\{[\n\r]*\\\*[\n\r]*\\userprops([^a-zA-Z])/g) {
|
---|
306 | # Note: The RTF spec places brackets around each propinfo structure,
|
---|
307 | # but Microsoft Word doesn't write it this way, so tolerate either.
|
---|
308 | pos($buff) = pos($buff) - 1 if $1 ne ' ';
|
---|
309 | my $props = ReadToNested(\$buff, $raf);
|
---|
310 | $tagTablePtr = Image::ExifTool::GetTagTable('Image::ExifTool::RTF::UserProps');
|
---|
311 | unless (defined $props) {
|
---|
312 | $exifTool->Warn('Unterminated user properties');
|
---|
313 | last;
|
---|
314 | }
|
---|
315 | # process user properties
|
---|
316 | my $tag;
|
---|
317 | while ($props =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
|
---|
318 | pos($props) = pos($props) - 1 if $3 ne ' ';
|
---|
319 | my $t = $2;
|
---|
320 | my $val = ReadToNested(\$props);
|
---|
321 | last unless defined $val;
|
---|
322 | $val = UnescapeRTF($exifTool, $val, $charset);
|
---|
323 | if ($t eq 'propname') {
|
---|
324 | $tag = $val;
|
---|
325 | next;
|
---|
326 | } elsif ($t ne 'staticval' or not defined $tag) {
|
---|
327 | next; # ignore \linkval and \proptype for now
|
---|
328 | }
|
---|
329 | $tag =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
|
---|
330 | $tag =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
|
---|
331 | next unless $tag;
|
---|
332 | # create tagInfo for unknown tags
|
---|
333 | unless ($$tagTablePtr{$tag}) {
|
---|
334 | Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => $tag });
|
---|
335 | }
|
---|
336 | $exifTool->HandleTag($tagTablePtr, $tag, $val);
|
---|
337 | }
|
---|
338 | last; # (didn't really want to loop)
|
---|
339 | }
|
---|
340 | return 1;
|
---|
341 | }
|
---|
342 |
|
---|
343 | 1; # end
|
---|
344 |
|
---|
345 | __END__
|
---|
346 |
|
---|
347 | =head1 NAME
|
---|
348 |
|
---|
349 | Image::ExifTool::RTF - Read Rich Text Format meta information
|
---|
350 |
|
---|
351 | =head1 SYNOPSIS
|
---|
352 |
|
---|
353 | This module is used by Image::ExifTool
|
---|
354 |
|
---|
355 | =head1 DESCRIPTION
|
---|
356 |
|
---|
357 | This module contains definitions required by Image::ExifTool to read meta
|
---|
358 | information from RTF (Rich Text Format) documents.
|
---|
359 |
|
---|
360 | =head1 AUTHOR
|
---|
361 |
|
---|
362 | Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
|
---|
363 |
|
---|
364 | This library is free software; you can redistribute it and/or modify it
|
---|
365 | under the same terms as Perl itself.
|
---|
366 |
|
---|
367 | =head1 REFERENCES
|
---|
368 |
|
---|
369 | =over 4
|
---|
370 |
|
---|
371 | =item L<http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc>
|
---|
372 |
|
---|
373 | =item L<http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod>
|
---|
374 |
|
---|
375 | =back
|
---|
376 |
|
---|
377 | =head1 SEE ALSO
|
---|
378 |
|
---|
379 | L<Image::ExifTool::TagNames/RTF Tags>,
|
---|
380 | L<Image::ExifTool(3pm)|Image::ExifTool>
|
---|
381 |
|
---|
382 | =cut
|
---|
383 |
|
---|