source: main/trunk/greenstone2/perllib/cpan/Image/ExifTool/RTF.pm@ 34921

Last change on this file since 34921 was 34921, checked in by anupama, 3 years ago

Committing the improvements to EmbeddedMetaPlugin's processing of Keywords vs other metadata fields. Keywords were literally stored as arrays of words rather than phrases in PDFs (at least in Diego's sample PDF), whereas other meta fields like Subjects and Creators stored them as arrays of phrases. To get both to work, Kathy updated EXIF to a newer version, to retrieve the actual EXIF values stored in the PDF. And Kathy and Dr Bainbridge came up with a new option that I added called apply_join_before_split_to_metafields that's a regex which can list the metadata fields to apply the join_before_split to and whcih previously always got applied to all metadata fields. Now it's applied to any *Keywords metafields by default, as that's the metafield we have experience of that behaves differently to the others, as it stores by word instead of phrases. Tested on Diego's sample PDF. Diego has double-checked it to works on his sample PDF too, setting the split char to ; and turning on the join_before_split and leaving apply_join_before_split_to_metafields at its default of .*Keywords. File changes are strings.properties for the tooltip, the plugin introducing the option and working with it and Kathy's EXIF updates affecting cpan/File and cpan/Image.

  • Property svn:executable set to *
File size: 13.5 KB
Line 
1#------------------------------------------------------------------------------
2# File: RTF.pm
3#
4# Description: Read Rich Text Format meta information
5#
6# Revisions: 2010/06/17 - P. Harvey Created
7#
8# References: 1) http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc
9# 2) http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod
10#------------------------------------------------------------------------------
11
12package Image::ExifTool::RTF;
13
14use strict;
15use vars qw($VERSION);
16use Image::ExifTool qw(:DataAccess :Utils);
17
18$VERSION = '1.04';
19
20sub ProcessUserProps($$$);
21
22# supported RTF character entities
23my %rtfEntity = (
24 par => 0x0a,
25 tab => 0x09,
26 endash => 0x2013,
27 emdash => 0x2014,
28 lquote => 0x2018,
29 rquote => 0x2019,
30 ldblquote => 0x201c,
31 rdblquote => 0x201d,
32 bullet => 0x2022,
33);
34
35# RTF tags (ref 1)
36%Image::ExifTool::RTF::Main = (
37 GROUPS => { 2 => 'Document' },
38 NOTES => q{
39 This table lists standard tags of the RTF information group, but ExifTool
40 will also extract any non-standard tags found in this group. As well,
41 ExifTool will extract any custom properties that are found. See
42 L<http://www.microsoft.com/en-ca/download/details.aspx?id=10725> for the
43 specification.
44 },
45 title => { },
46 subject => { },
47 author => { Groups => { 2 => 'Author' } },
48 manager => { },
49 company => { },
50 copyright=> { Groups => { 2 => 'Author' } }, # (written by Apple TextEdit)
51 operator => { Name => 'LastModifiedBy' },
52 category => { },
53 keywords => { },
54 comment => { },
55 doccomm => { Name => 'Comments' },
56 hlinkbase=> { Name => 'HyperlinkBase' },
57 creatim => {
58 Name => 'CreateDate',
59 Format => 'date',
60 Groups => { 2 => 'Time' },
61 PrintConv => '$self->ConvertDateTime($val)',
62 },
63 revtim => {
64 Name => 'ModifyDate',
65 Format => 'date',
66 Groups => { 2 => 'Time' },
67 PrintConv => '$self->ConvertDateTime($val)',
68 },
69 printim => {
70 Name => 'LastPrinted',
71 Format => 'date',
72 Groups => { 2 => 'Time' },
73 PrintConv => '$self->ConvertDateTime($val)',
74 },
75 buptim => {
76 Name => 'BackupTime',
77 Format => 'date',
78 Groups => { 2 => 'Time' },
79 PrintConv => '$self->ConvertDateTime($val)',
80 },
81 edmins => {
82 Name => 'TotalEditTime', # in minutes
83 PrintConv => 'ConvertTimeSpan($val, 60)',
84 },
85 nofpages => { Name => 'Pages' },
86 nofwords => { Name => 'Words' },
87 nofchars => { Name => 'Characters' },
88 nofcharsws=>{
89 Name => 'CharactersWithSpaces',
90 Notes => q{
91 according to the 2007 Microsoft RTF specification this is clearly the number
92 of characters NOT including spaces, but Microsoft Word writes this as the
93 number WITH spaces, so ExifTool names this tag according to the de facto
94 standard
95 },
96 },
97 id => { Name => 'InternalIDNumber' },
98 version => { Name => 'RevisionNumber' },
99 vern => { Name => 'InternalVersionNumber' },
100);
101
102# lookup for user-defined properties
103# (none are pre-defined and this table doesn't appear in the docs)
104%Image::ExifTool::RTF::UserProps = (
105 GROUPS => { 2 => 'Document' },
106);
107
108#------------------------------------------------------------------------------
109# Read to nested closing curly bracket "}"
110# Inputs: 0) data ref, 1) optional RAF ref to read more data if available
111# Returns: text inside brackets, or undef on error
112# Notes: On entry the current position in the data must be set to immediately
113# after the command that opens the bracket. On return the current
114# position is immediately following the closing brace if the return
115# value is defined.
116sub ReadToNested($;$)
117{
118 my ($dataPt, $raf) = @_;
119 my $pos = pos $$dataPt;
120 my $level = 1;
121 for (;;) {
122 # look for the next bracket
123 unless ($$dataPt =~ /(\\*)([{}])/g) {
124 # must read some more data
125 my $p = length $$dataPt;
126 my $buff;
127 last unless $raf and $raf->Read($buff, 65536);
128 $$dataPt .= $buff;
129 # rewind position to include any leading backslashes
130 --$p while $p and substr($$dataPt, $p - 1, 1) eq '\\';
131 pos($$dataPt) = $p; # set position to continue search
132 next;
133 }
134 # bracket is escaped if preceded by an odd number of backslashes
135 next if $1 and length($1) & 0x01;
136 $2 eq '{' and ++$level, next;
137 next unless --$level <= 0;
138 return substr($$dataPt, $pos, pos($$dataPt) - $pos - 1);
139 }
140 return undef;
141}
142
143#------------------------------------------------------------------------------
144# Unescape RTF escape sequences
145# Inputs: 0) ExifTool ref, 1) RTF text, 2) RTF character set (for hex characters)
146# Returns: Unescaped text (in current ExifTool Charset)
147sub UnescapeRTF($$$)
148{
149 my ($et, $val, $charset) = @_;
150
151 # return now unless we have a control sequence
152 unless ($val =~ /\\/) {
153 $val =~ tr/\n\r//d; # ignore CR's and LF's
154 return $val;
155 }
156 # CR/LF is significant if it terminates a control sequence (so change these to a space)
157 # (was $val =~ s/(^|[^\\])((?:\\\\)*)(\\[a-zA-Z]+(?:-?\d+)?)[\n\r]/$1$2$3 /g;)
158 $val =~ s/\\(?:([a-zA-Z]+(?:-?\d+)?)[\n\r]|(.))/'\\'.($1 ? "$1 " : $2)/sge;
159 # protect the newline control sequence by converting to a \par command
160 # (was $val =~ s/(^|[^\\])((?:\\\\)*)(\\[\n\r])/$1$2\\par /g;)
161 $val =~ s/(\\[\n\r])|(\\.)/$2 || '\\par '/sge;
162 # all other CR/LF's are ignored (so delete them)
163 $val =~ tr/\n\r//d;
164
165 my $rtnVal = '';
166 my $len = length $val;
167 my $skip = 1; # default Unicode skip count
168 my $p0 = 0;
169
170 for (;;) {
171 # find next backslash
172 my $p1 = ($val =~ /\\/g) ? pos($val) : $len + 1;
173 # add text up to start of this control sequence (or up to end)
174 my $n = $p1 - $p0 - 1;
175 $rtnVal .= substr($val, $p0, $n) if $n > 0;
176 # all done if at the end or if control sequence is empty
177 last if $p1 >= $len;
178 # look for an ASCII-letter control word or Unicode control
179 if ($val =~ /\G([a-zA-Z]+)(-?\d+)? ?/g) {
180 # interpret command if recognized
181 if ($1 eq 'uc') { # \ucN
182 $skip = $2;
183 } elsif ($1 eq 'u') { # \uN
184 if ($2 < 0) {
185 $et->WarnOnce('Invalid Unicode character(s) in text');
186 $rtnVal .= '?';
187 } else {
188 require Image::ExifTool::Charset;
189 $rtnVal .= Image::ExifTool::Charset::Recompose($et, [$2]);
190 if ($skip) {
191 # must skip the specified number of characters
192 # (not simple because RTF control words count as a single character)
193 last unless $val =~ /\G([^\\]|\\([a-zA-Z]+)(-?\d+)? ?|\\'.{2}|\\.){$skip}/g;
194 }
195 }
196 } elsif ($rtfEntity{$1}) {
197 require Image::ExifTool::Charset;
198 $rtnVal .= Image::ExifTool::Charset::Recompose($et, [$rtfEntity{$1}]);
199 } # (else ignore the command)
200 } else {
201 my $ch = substr($val, $p1, 1);
202 if ($ch eq "'") {
203 # hex character code
204 last if $p1 + 3 > $len;
205 my $hex = substr($val, $p1 + 1, 2);
206 if ($hex =~ /^[0-9a-fA-F]{2}$/) {
207 require Image::ExifTool::Charset;
208 $rtnVal .= $et->Decode(chr(hex($hex)), $charset);
209 }
210 pos($val) = $p1 + 3; # skip to after the hex code
211 } else {
212 # assume a standard control symbol (\, {, }, etc)
213 # (note, this may not be valid for some uncommon
214 # control symbols like \~ for non-breaking space)
215 $rtnVal .= $ch;
216 pos($val) = $p1 + 1; # skip to after this character
217 }
218 }
219 $p0 = pos($val);
220 }
221 return $rtnVal;
222}
223
224#------------------------------------------------------------------------------
225# Read information in a RTF document
226# Inputs: 0) ExifTool ref, 1) dirInfo ref
227# Returns: 1 on success, 0 if this wasn't a valid RTF file
228sub ProcessRTF($$)
229{
230 my ($et, $dirInfo) = @_;
231 my $raf = $$dirInfo{RAF};
232 my ($buff, $buf2, $cs);
233
234 return 0 unless $raf->Read($buff, 64) and $raf->Seek(0,0);
235 return 0 unless $buff =~ /^[\n\r]*\{[\n\r]*\\rtf[^a-zA-Z]/;
236 $et->SetFileType();
237#
238# determine the RTF character set
239#
240 if ($buff=~ /\\ansicpg(\d*)/) {
241 $cs = "cp$1";
242 } elsif ($buff=~ /\\(ansi|mac|pc|pca)[^a-zA-Z]/) {
243 my %trans = (
244 ansi => 'Latin',
245 mac => 'MacRoman',
246 pc => 'cp437',
247 pca => 'cp850',
248 );
249 $cs = $trans{$1};
250 } else {
251 $et->Warn('Unspecified RTF encoding. Will assume Latin');
252 $cs = 'Latin';
253 }
254 my $charset = $Image::ExifTool::charsetName{lc $cs};
255 unless ($charset) {
256 $et->Warn("Unsupported RTF encoding $cs. Will assume Latin.");
257 $charset = 'Latin';
258 }
259 my $tagTablePtr = GetTagTable('Image::ExifTool::RTF::Main');
260 undef $buff;
261#
262# scan for \info group
263#
264 for (;;) {
265 $raf->Read($buf2, 65536) or last;
266 if (defined $buff) {
267 # read more but leave some overlap for the match
268 $buff = substr($buff, -16) . $buf2;
269 } else {
270 $buff = $buf2;
271 }
272 next unless $buff =~ /[^\\]\{[\n\r]*\\info([^a-zA-Z])/g;
273 # anything but a space is included in the contents
274 pos($buff) = pos($buff) - 1 if $1 ne ' ';
275 my $info = ReadToNested(\$buff, $raf);
276 unless (defined $info) {
277 $et->Warn('Unterminated information group');
278 last;
279 }
280 # process info commands (eg. "\author", "\*\copyright");
281 while ($info =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
282 pos($info) = pos($info) - 1 if $3 ne ' ';
283 my $tag = $2;
284 my $val = ReadToNested(\$info);
285 last unless defined $val;
286 my $tagInfo = $$tagTablePtr{$tag};
287 if ($tagInfo and $$tagInfo{Format} and $$tagInfo{Format} eq 'date') {
288 # parse RTF date commands
289 my %idx = (yr=>0,mo=>1,dy=>2,hr=>3,min=>4,sec=>5);
290 my @t = (0) x 6;
291 while ($val =~ /\\([a-z]+)(\d+)/g) {
292 next unless defined $idx{$1};
293 $t[$idx{$1}] = $2;
294 }
295 $val = sprintf("%.4d:%.2d:%.2d %.2d:%.2d:%.2d", @t);
296 } else {
297 # unescape RTF string value
298 $val = UnescapeRTF($et, $val, $charset);
299 }
300 # create tagInfo for unknown tags
301 if (not $tagInfo) {
302 AddTagToTable($tagTablePtr, $tag, { Name => ucfirst($tag) });
303 }
304 $et->HandleTag($tagTablePtr, $tag, $val);
305 }
306 }
307 return 1 unless defined $buff;
308#
309# scan for \userprops (but don't read more from file to find the start of this command)
310#
311 pos($buff) = 0;
312 while ($buff =~ /[^\\]\{[\n\r]*\\\*[\n\r]*\\userprops([^a-zA-Z])/g) {
313 # Note: The RTF spec places brackets around each propinfo structure,
314 # but Microsoft Word doesn't write it this way, so tolerate either.
315 pos($buff) = pos($buff) - 1 if $1 ne ' ';
316 my $props = ReadToNested(\$buff, $raf);
317 $tagTablePtr = Image::ExifTool::GetTagTable('Image::ExifTool::RTF::UserProps');
318 unless (defined $props) {
319 $et->Warn('Unterminated user properties');
320 last;
321 }
322 # process user properties
323 my $tag;
324 while ($props =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
325 pos($props) = pos($props) - 1 if $3 ne ' ';
326 my $t = $2;
327 my $val = ReadToNested(\$props);
328 last unless defined $val;
329 $val = UnescapeRTF($et, $val, $charset);
330 if ($t eq 'propname') {
331 $tag = $val;
332 next;
333 } elsif ($t ne 'staticval' or not defined $tag) {
334 next; # ignore \linkval and \proptype for now
335 }
336 $tag =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
337 $tag =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
338 next unless $tag;
339 # create tagInfo for unknown tags
340 unless ($$tagTablePtr{$tag}) {
341 AddTagToTable($tagTablePtr, $tag, { Name => $tag });
342 }
343 $et->HandleTag($tagTablePtr, $tag, $val);
344 }
345 last; # (didn't really want to loop)
346 }
347 return 1;
348}
349
3501; # end
351
352__END__
353
354=head1 NAME
355
356Image::ExifTool::RTF - Read Rich Text Format meta information
357
358=head1 SYNOPSIS
359
360This module is used by Image::ExifTool
361
362=head1 DESCRIPTION
363
364This module contains definitions required by Image::ExifTool to read meta
365information from RTF (Rich Text Format) documents.
366
367=head1 AUTHOR
368
369Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
370
371This library is free software; you can redistribute it and/or modify it
372under the same terms as Perl itself.
373
374=head1 REFERENCES
375
376=over 4
377
378=item L<http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc>
379
380=item L<http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod>
381
382=back
383
384=head1 SEE ALSO
385
386L<Image::ExifTool::TagNames/RTF Tags>,
387L<Image::ExifTool(3pm)|Image::ExifTool>
388
389=cut
390
Note: See TracBrowser for help on using the repository browser.