source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/Image/ExifTool/RTF.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

  • Property svn:executable set to *
File size: 13.3 KB
Line 
1#------------------------------------------------------------------------------
2# File: RTF.pm
3#
4# Description: Read Rich Text Format meta information
5#
6# Revisions: 2010/06/17 - P. Harvey Created
7#
8# References: 1) http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc
9# 2) http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod
10#------------------------------------------------------------------------------
11
12package Image::ExifTool::RTF;
13
14use strict;
15use vars qw($VERSION);
16use Image::ExifTool qw(:DataAccess :Utils);
17
18$VERSION = '1.01';
19
20sub ProcessUserProps($$$);
21
22# supported RTF character entities
23my %rtfEntity = (
24 par => 0x0a,
25 tab => 0x09,
26 endash => 0x2013,
27 emdash => 0x2014,
28 lquote => 0x2018,
29 rquote => 0x2019,
30 ldblquote => 0x201c,
31 rdblquote => 0x201d,
32 bullet => 0x2022,
33);
34
35# RTF tags (ref 1)
36%Image::ExifTool::RTF::Main = (
37 GROUPS => { 2 => 'Document' },
38 NOTES => q{
39 This table lists standard tags of the RTF information group, but ExifTool
40 will also extract any non-standard tags found in this group. As well,
41 ExifTool will extract any custom properties that are found. See
42 L<http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc>
43 for the specification.
44 },
45 title => { },
46 subject => { },
47 author => { Groups => { 2 => 'Author' } },
48 manager => { },
49 company => { },
50 copyright=> { Groups => { 2 => 'Author' } }, # (written by Apple TextEdit)
51 operator => { Name => 'LastModifiedBy' },
52 category => { },
53 keywords => { },
54 comment => { },
55 doccomm => { Name => 'Comments' },
56 hlinkbase=> { Name => 'HyperlinkBase' },
57 creatim => {
58 Name => 'CreateDate',
59 Format => 'date',
60 Groups => { 2 => 'Time' },
61 PrintConv => '$self->ConvertDateTime($val)',
62 },
63 revtim => {
64 Name => 'ModifyDate',
65 Format => 'date',
66 Groups => { 2 => 'Time' },
67 PrintConv => '$self->ConvertDateTime($val)',
68 },
69 printim => {
70 Name => 'LastPrinted',
71 Format => 'date',
72 Groups => { 2 => 'Time' },
73 PrintConv => '$self->ConvertDateTime($val)',
74 },
75 buptim => {
76 Name => 'BackupTime',
77 Format => 'date',
78 Groups => { 2 => 'Time' },
79 PrintConv => '$self->ConvertDateTime($val)',
80 },
81 edmins => {
82 Name => 'TotalEditTime', # in minutes
83 PrintConv => 'ConvertTimeSpan($val, 60)',
84 },
85 nofpages => { Name => 'Pages' },
86 nofwords => { Name => 'Words' },
87 nofchars => { Name => 'Characters' },
88 nofcharsws=>{
89 Name => 'CharactersWithSpaces',
90 Notes => q{
91 according to the 2007 Microsoft RTF specification this is clearly the number
92 of characters NOT including spaces, but Microsoft Word writes this as the
93 number WITH spaces, so ExifTool names this tag according to the de facto
94 standard
95 },
96 },
97 id => { Name => 'InternalIDNumber' },
98 version => { Name => 'RevisionNumber' },
99 vern => { Name => 'InternalVersionNumber' },
100);
101
102# lookup for user-defined properties
103# (none are pre-defined and this table doesn't appear in the docs)
104%Image::ExifTool::RTF::UserProps = (
105 GROUPS => { 2 => 'Document' },
106);
107
108#------------------------------------------------------------------------------
109# Read to nested closing curly bracket "}"
110# Inputs: 0) data ref, 1) optional RAF ref to read more data if available
111# Returns: text inside brackets, or undef on error
112# Notes: On entry the current position in the data must be set to immediately
113# after the command that opens the bracket. On return the current
114# position is immediately following the closing brace if the return
115# value is defined.
116sub ReadToNested($;$)
117{
118 my ($dataPt, $raf) = @_;
119 my $pos = pos $$dataPt;
120 my $level = 1;
121 for (;;) {
122 # look for the next bracket
123 unless ($$dataPt =~ /(\\*)([{}])/g) {
124 # must read some more data
125 my $p = length $$dataPt;
126 my $buff;
127 last unless $raf and $raf->Read($buff, 65536);
128 $$dataPt .= $buff;
129 # rewind position to include any leading backslashes
130 --$p while $p and substr($$dataPt, $p - 1, 1) eq '\\';
131 pos($$dataPt) = $p; # set position to continue search
132 next;
133 }
134 # bracket is escaped if preceded by an odd number of backslashes
135 next if $1 and length($1) & 0x01;
136 $2 eq '{' and ++$level, next;
137 next unless --$level <= 0;
138 return substr($$dataPt, $pos, pos($$dataPt) - $pos - 1);
139 }
140 return undef;
141}
142
143#------------------------------------------------------------------------------
144# Unescape RTF escape sequences
145# Inputs: 0) ExifTool ref, 1) RTF text, 2) RTF character set (for hex characters)
146# Returns: Unescaped text (in current ExifTool Charset)
147sub UnescapeRTF($$$)
148{
149 my ($exifTool, $val, $charset) = @_;
150
151 # return now unless we have a control sequence
152 unless ($val =~ /\\/) {
153 $val =~ tr/\n\r//d; # ignore CR's and LF's
154 return $val;
155 }
156 # CR/LF is signficant if it terminates a control sequence (so change these to a space)
157 $val =~ s/(^|[^\\])((?:\\\\)*)(\\[a-zA-Z]+(?:-?\d+)?)[\n\r]/$1$2$3 /g;
158 # protect the newline control sequence by converting to a \par command
159 $val =~ s/(^|[^\\])((?:\\\\)*)(\\[\n\r])/$1$2\\par /g;
160 # all other CR/LF's are ignored (so delete them)
161 $val =~ tr/\n\r//d;
162
163 my $rtnVal = '';
164 my $len = length $val;
165 my $skip = 1; # default Unicode skip count
166 my $p0 = 0;
167
168 for (;;) {
169 # find next backslash
170 my $p1 = ($val =~ /\\/g) ? pos($val) : $len + 1;
171 # add text up to start of this control sequence (or up to end)
172 my $n = $p1 - $p0 - 1;
173 $rtnVal .= substr($val, $p0, $n) if $n > 0;
174 # all done if at the end or if control sequence is empty
175 last if $p1 >= $len;
176 # look for an ASCII-letter control word or Unicode control
177 if ($val =~ /\G([a-zA-Z]+)(-?\d+)? ?/g) {
178 # interpret command if recognized
179 if ($1 eq 'uc') { # \ucN
180 $skip = $2;
181 } elsif ($1 eq 'u') { # \uN
182 require Image::ExifTool::Charset;
183 $rtnVal .= Image::ExifTool::Charset::Recompose($exifTool, [$2]);
184 if ($skip) {
185 # must skip the specified number of characters
186 # (not simple because RTF control words count as a single character)
187 last unless $val =~ /\G([^\\]|\\([a-zA-Z]+)(-?\d+)? ?|\\'.{2}|\\.){$skip}/g;
188 }
189 } elsif ($rtfEntity{$1}) {
190 require Image::ExifTool::Charset;
191 $rtnVal .= Image::ExifTool::Charset::Recompose($exifTool, [$rtfEntity{$1}]);
192 } # (else ignore the command)
193 } else {
194 my $ch = substr($val, $p1, 1);
195 if ($ch eq "'") {
196 # hex character code
197 last if $p1 + 3 > $len;
198 my $hex = substr($val, $p1 + 1, 2);
199 if ($hex =~ /^[0-9a-fA-F]{2}$/) {
200 require Image::ExifTool::Charset;
201 $rtnVal .= $exifTool->Decode(chr(hex($hex)), $charset);
202 }
203 pos($val) = $p1 + 3; # skip to after the hex code
204 } else {
205 # assume a standard control symbol (\, {, }, etc)
206 # (note, this may not be valid for some uncommon
207 # control symbols like \~ for non-breaking space)
208 $rtnVal .= $ch;
209 pos($val) = $p1 + 1; # skip to after this character
210 }
211 }
212 $p0 = pos($val);
213 }
214 return $rtnVal;
215}
216
217#------------------------------------------------------------------------------
218# Read information in a RTF document
219# Inputs: 0) ExifTool ref, 1) dirInfo ref
220# Returns: 1 on success, 0 if this wasn't a valid RTF file
221sub ProcessRTF($$)
222{
223 my ($exifTool, $dirInfo) = @_;
224 my $raf = $$dirInfo{RAF};
225 my ($buff, $buf2, $cs);
226
227 return 0 unless $raf->Read($buff, 64) and $raf->Seek(0,0);
228 return 0 unless $buff =~ /^[\n\r]*\{[\n\r]*\\rtf[^a-zA-Z]/;
229 $exifTool->SetFileType();
230#
231# determine the RTF character set
232#
233 if ($buff=~ /\\ansicpg(\d*)/) {
234 $cs = "cp$1";
235 } elsif ($buff=~ /\\(ansi|mac|pc|pca)[^a-zA-Z]/) {
236 my %trans = (
237 ansi => 'Latin',
238 mac => 'MacRoman',
239 pc => 'cp437',
240 pca => 'cp850',
241 );
242 $cs = $trans{$1};
243 } else {
244 $exifTool->Warn('Unspecified RTF encoding. Will assume Latin');
245 $cs = 'Latin';
246 }
247 my $charset = $Image::ExifTool::charsetName{lc $cs};
248 unless ($charset) {
249 $exifTool->Warn("Unsupported RTF encoding $cs. Will assume Latin.");
250 $charset = 'Latin';
251 }
252 my $tagTablePtr = GetTagTable('Image::ExifTool::RTF::Main');
253 undef $buff;
254#
255# scan for \info group
256#
257 for (;;) {
258 $raf->Read($buf2, 65536) or last;
259 if (defined $buff) {
260 # read more but leave some overlap for the match
261 $buff = substr($buff, -16) . $buf2;
262 } else {
263 $buff = $buf2;
264 }
265 next unless $buff =~ /[^\\]\{[\n\r]*\\info([^a-zA-Z])/g;
266 # anything but a space is included in the contents
267 pos($buff) = pos($buff) - 1 if $1 ne ' ';
268 my $info = ReadToNested(\$buff, $raf);
269 unless (defined $info) {
270 $exifTool->Warn('Unterminated information group');
271 last;
272 }
273 # process info commands (ie. "\author", "\*\copyright");
274 while ($info =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
275 pos($info) = pos($info) - 1 if $3 ne ' ';
276 my $tag = $2;
277 my $val = ReadToNested(\$info);
278 last unless defined $val;
279 my $tagInfo = $$tagTablePtr{$tag};
280 if ($tagInfo and $$tagInfo{Format} and $$tagInfo{Format} eq 'date') {
281 # parse RTF date commands
282 my %idx = (yr=>0,mo=>1,dy=>2,hr=>3,min=>4,sec=>5);
283 my @t = (0) x 6;
284 while ($val =~ /\\([a-z]+)(\d+)/g) {
285 next unless defined $idx{$1};
286 $t[$idx{$1}] = $2;
287 }
288 $val = sprintf("%.4d:%.2d:%.2d %.2d:%.2d:%.2d", @t);
289 } else {
290 # unescape RTF string value
291 $val = UnescapeRTF($exifTool, $val, $charset);
292 }
293 # create tagInfo for unknown tags
294 if (not $tagInfo) {
295 Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => ucfirst($tag) });
296 }
297 $exifTool->HandleTag($tagTablePtr, $tag, $val);
298 }
299 }
300 return 1 unless defined $buff;
301#
302# scan for \userprops (but don't read more from file to find the start of this command)
303#
304 pos($buff) = 0;
305 while ($buff =~ /[^\\]\{[\n\r]*\\\*[\n\r]*\\userprops([^a-zA-Z])/g) {
306 # Note: The RTF spec places brackets around each propinfo structure,
307 # but Microsoft Word doesn't write it this way, so tolerate either.
308 pos($buff) = pos($buff) - 1 if $1 ne ' ';
309 my $props = ReadToNested(\$buff, $raf);
310 $tagTablePtr = Image::ExifTool::GetTagTable('Image::ExifTool::RTF::UserProps');
311 unless (defined $props) {
312 $exifTool->Warn('Unterminated user properties');
313 last;
314 }
315 # process user properties
316 my $tag;
317 while ($props =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
318 pos($props) = pos($props) - 1 if $3 ne ' ';
319 my $t = $2;
320 my $val = ReadToNested(\$props);
321 last unless defined $val;
322 $val = UnescapeRTF($exifTool, $val, $charset);
323 if ($t eq 'propname') {
324 $tag = $val;
325 next;
326 } elsif ($t ne 'staticval' or not defined $tag) {
327 next; # ignore \linkval and \proptype for now
328 }
329 $tag =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
330 $tag =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
331 next unless $tag;
332 # create tagInfo for unknown tags
333 unless ($$tagTablePtr{$tag}) {
334 Image::ExifTool::AddTagToTable($tagTablePtr, $tag, { Name => $tag });
335 }
336 $exifTool->HandleTag($tagTablePtr, $tag, $val);
337 }
338 last; # (didn't really want to loop)
339 }
340 return 1;
341}
342
3431; # end
344
345__END__
346
347=head1 NAME
348
349Image::ExifTool::RTF - Read Rich Text Format meta information
350
351=head1 SYNOPSIS
352
353This module is used by Image::ExifTool
354
355=head1 DESCRIPTION
356
357This module contains definitions required by Image::ExifTool to read meta
358information from RTF (Rich Text Format) documents.
359
360=head1 AUTHOR
361
362Copyright 2003-2011, Phil Harvey (phil at owl.phy.queensu.ca)
363
364This library is free software; you can redistribute it and/or modify it
365under the same terms as Perl itself.
366
367=head1 REFERENCES
368
369=over 4
370
371=item L<http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc>
372
373=item L<http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod>
374
375=back
376
377=head1 SEE ALSO
378
379L<Image::ExifTool::TagNames/RTF Tags>,
380L<Image::ExifTool(3pm)|Image::ExifTool>
381
382=cut
383
Note: See TracBrowser for help on using the repository browser.