source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 2680

Last change on this file since 2680 was 2680, checked in by jrm21, 23 years ago
  1. we escape 'and' chars in headers so greenstone doesn't try to expand it

as metadata.

  1. fixed up FromName and FromAddr if the From: field isn't in the "Name" <addr>

format.

  1. MIME boundaries are allowed spaces in them (didn't read RFC properly...)
  2. Body is now correctly interpreted if in quoted-printable or base64 and

content-type is text/* (not multipart).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata:
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header - this will be stored as Creator
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51#
52# John McPherson - June/July 2001
53# added (basic) MIME support and quoted-printable and base64 decodings.
54# Minor fixes for names that are actually email addresses (ie <...> was lost)
55#
56# See: * RFC 822 - ARPA Internet Text Messages
57# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
58# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
59# * RFC 1806 - Content Dispositions (ie inline/attachment)
60package EMAILPlug;
61
62use SplitPlug;
63
64use sorttools;
65use util;
66
67
68# EMAILPlug is a sub-class of SplitPlug.
69
70sub BEGIN {
71 @ISA = ('SplitPlug');
72}
73
74# Create a new EMAILPlug object with which to parse a file.
75# Accomplished by creating a new BasPlug and using bless to
76# turn it into an EMAILPlug.
77
78sub new {
79 my ($class) = @_;
80 my $self = new BasPlug ("EMAILPlug", @_);
81 # make sure we don't run textcat (defaults to "auto");
82 $self->{'input_encoding'}="ascii"; # this might not be good enough...
83 return bless $self, $class;
84}
85
86sub get_default_process_exp {
87 my $self = shift (@_);
88 # mbx/email for mailbox file format, \d+ for maildir (each message is
89 # in a separate file, with a unique number for filename)
90 return q@([\\/]\d+|\.(mbx|email))$@;
91}
92
93# This plugin splits the mbox mail files at lines starting with From<sp>
94sub get_default_split_exp {
95 return q^\nFrom .*\n^;
96}
97
98
99# do plugin specific processing of doc_obj
100sub process {
101
102 my $self = shift (@_);
103 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
104 my $outhandle = $self->{'outhandle'};
105
106 # Check that we're dealing with a valid mail file
107 return undef unless (($$textref =~ /From:/m) || ($$textref =~ /To:/m));
108
109 # slightly more strict validity check, to prevent us from matching
110 # .so.x files ...
111 return undef unless (($$textref =~ /^From /) ||
112 ($$textref =~ /^[-A-Za-z]{2,100}:/m));
113
114 print $outhandle "EMAILPlug: processing $file\n"
115 if $self->{'verbosity'} > 1;
116
117 my $cursection = $doc_obj->get_top_section();
118
119 #
120 # Parse the document's text and extract metadata
121 #
122
123 # Protect backslashes
124 $$textref =~ s@\\@\\\\@g;
125
126 # Separate header from body of message
127 my $Headers = $$textref;
128 $Headers =~ s/\r?\n\r?\n(.*)$//s;
129 $$textref = $1;
130 # escape [] so it isn't re-interpreted as metadata
131 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
132
133
134 # Unfold headers - see rfc822
135 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
136 # Extract basic metadata from header
137 my @headers = ("From", "To", "Subject", "Date");
138 my %raw;
139 foreach my $name (@headers) {
140 $raw{$name} = "No $name value";
141 }
142
143 # Examine each line of the headers
144 my ($line, $name, $value);
145 my @parts;
146 foreach $line (split(/\n/, $Headers)) {
147
148 # Ignore lines with no content or which begin with whitespace
149 next unless ($line =~ /:/);
150 next if ($line =~ /^\s/);
151
152 # Find out what metadata is on this line
153 @parts = split(/:/, $line);
154 $name = shift @parts;
155# uppercase the first character according to the current locale
156 $name=~s/(.+)/\u$1/;
157 next unless $name;
158 next unless ($raw{$name});
159
160 # Find the value of that metadata
161 $value = join(":", @parts);
162 $value =~ s/^\s+//;
163 $value =~ s/\s+$//;
164
165 # Store the metadata
166 $raw{$name} = $value;
167 }
168
169 # Extract the name and e-mail address from the From metadata
170 $frommeta = $raw{"From"};
171 my $fromnamemeta;
172 my $fromaddrmeta;
173
174 $frommeta =~ s/\s*$//; # Remove trailing space, if any
175
176 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
177 $fromnamemeta=$1;
178 $fromaddrmeta=$2;
179 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
180 $fromnamemeta=$2;
181 $fromaddrmeta=$1;
182 }
183 if (!defined($fromaddrmeta)) {
184 $fromaddrmeta=$frommeta;
185 }
186 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
187 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
188
189 if (defined($fromnamemeta)) {
190 $fromnamemeta =~ s/\"//g;
191 }
192 else {
193 $fromnamemeta = $fromaddrmeta;
194 }
195 # if name is an address
196 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
197 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
198
199 # Escape < and > in the whole From field;
200 $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
201 $raw{"From"}=$frommeta;
202
203 # Process Date information
204 if ($raw{"Date"} !~ /No Date/) {
205 $raw{"DateText"} = $raw{"Date"};
206
207 # Convert the date text to internal date format
208 $value = $raw{"Date"};
209 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
210 if ($year < 100) { $year += 1900; }
211 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
212
213 } else {
214 # We have not extracted a date
215 $raw{"DateText"} = "Unknown.";
216 $raw{"Date"} = "19000000";
217 }
218
219
220 # Add extracted metadata to document object
221 foreach my $name (keys %raw) {
222 $value = $raw{$name};
223 if ($value) {
224 $value = &text_into_html($value);
225 } else {
226 $value = "No $name field";
227 }
228 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
229 }
230
231 my $mimetype="text/plain";
232 my $mimeinfo="";
233 # Do MIME and encoding stuff
234 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
235 {
236 $mimetype=$1;
237 $mimetype =~ tr/[A-Z]/[a-z]/;
238 $mimeinfo=$2;
239 }
240
241 my $transfer_encoding="7bit";
242 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
243 $transfer_encoding=$1;
244 }
245 if ($mimetype ne "text/plain") {
246 $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref,
247 $outhandle);
248 } elsif ($transfer_encoding =~ /quoted\-printable/) {
249 $$textref=qp_decode($$textref);
250 } elsif ($transfer_encoding =~ /base64/) {
251 $$textref=base64_decode($$textref);
252 }
253
254
255 # Add "All headers" metadata
256 $Headers = &text_into_html($Headers);
257 $Headers = "No headers" unless ($Headers =~ /\w/);
258 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
259
260 # Add text to document object
261 if ($mimetype eq "text/plain") {
262 $$textref = &text_into_html($$textref);
263 }
264 $$textref = "No message" unless ($$textref =~ /\w/);
265 $doc_obj->add_utf8_text($cursection, $$textref);
266
267 return 1;
268}
269
270
271# Convert a text string into HTML.
272#
273# The HTML is going to be inserted into a GML file, so
274# we have to be careful not to use symbols like ">",
275# which ocurs frequently in email messages (and use
276# &gt instead.
277#
278# This function also turns links and email addresses into hyperlinks,
279# and replaces carriage returns with <BR> tags (and multiple carriage
280# returns with <P> tags).
281
282
283sub text_into_html {
284 my ($text) = @_;
285
286 # Convert problem characters into HTML symbols
287 $text =~ s/&/&amp;/go;
288 $text =~ s/</&lt;/go;
289 $text =~ s/>/&gt;/go;
290 $text =~ s/\"/&quot;/go;
291
292 # convert email addresses and URIs into links
293# don't markup email addresses for now
294# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
295
296 # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
297 # URI can't finish with a '.'
298 $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
299
300
301 # Clean up whitespace and convert \n charaters to <BR> or <P>
302 $text =~ s/ +/ /go;
303 $text =~ s/\s*$//o;
304 $text =~ s/^\s*//o;
305 $text =~ s/\n/\n<BR>/go;
306 $text =~ s/<BR>\s*<BR>/<P>/go;
307
308 return $text;
309}
310
311
312
313
314#Process a MIME message.
315# the textref we are given DOES NOT include the header.
316sub text_from_mime_message {
317 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
318
319 # Check for multiparts - $mimeinfo will be a boundary
320 if ($mimetype =~ /multipart/) {
321 $boundary="";
322 if ($mimeinfo =~ /boundary=\"?([^"]+)\"?\s*$/im) {
323 $boundary=$1;
324 }
325 # parts start with "--$boundary"
326 # message ends with "--$boundary--"
327 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
328 # that perl might want to interpolate. Also allows spaces...
329
330 $boundary=~s/\\/\\\\/g;
331 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
332 my @message_parts = split("\r?\n\-\-$boundary", $text);
333 # remove first "part" and last "part" (final --)
334 shift @message_parts;
335 my $last=pop @message_parts;
336 # if our boundaries are a bit dodgy and we only found 1 part...
337 if (!defined($last)) {$last="";}
338 # make sure it is only -- and whitespace
339 if ($last !~ /^\-\-\s*$/ms) {
340 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
341 }
342 foreach my $message_part (@message_parts) {
343 # remove the leading newline left from split.
344 $message_part=~s/^\r?\n//;
345 }
346 if ($mimetype eq "multipart/alternative") {
347 # check for an HTML version first, then TEXT, otherwise use first.
348 my $part_text="";
349 foreach my $message_part (@message_parts) {
350 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
351 {
352 # Use the HTML version
353 $part_text=text_from_part($message_part);
354 $mimetype="text/html";
355 last;
356 }
357 }
358 if ($part_text eq "") { # try getting a text part instead
359 foreach my $message_part (@message_parts) {
360 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
361 {
362 # Use the plain version
363 $part_text=text_from_part($message_part);
364 $mimetype="text/plain";
365 last;
366 }
367 }
368 }
369 if ($part_text eq "") { # use first part
370 $part_text=text_from_part(shift @message_parts);
371 }
372 if ($part_text eq "") { # we couldn't get anything!!!
373 # or it was an empty message...
374 # do nothing...
375 print $outhandle "EMAILPlug: no text - empty body?\n";
376 } else {
377 $text=$part_text;
378 }
379 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
380 $text="";
381 foreach my $message_part (@message_parts) {
382 my $part_header=$message_part;
383 my $part_body;
384 if ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
385 $part_body=$1;
386 } else {
387 # no header... use defaults
388 $part_body=$message_part;
389 $part_header="Content-type: text/plain; charset=us-ascii";
390 }
391 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
392 my $part_content_type="";
393 my $part_content_info="";
394 if ($mimetype eq "multipart/digest") {
395 # default type - RTFRFC!!
396 $part_content_type="message/rfc822";
397 }
398 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*([^\s]*)@mi) {
399 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
400 $part_content_info=$2;
401 }
402 my $filename="";
403 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
404 $filename=$1;
405 }
406
407 # disposition - either inline or attachment.
408 # NOT CURRENTLY USED - we display all text types instead...
409 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
410
411 # add <<attachment>> to each part except the first...
412 if ($text ne "") {
413 $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
414 # add part info header
415 $text.="<br>Type: $part_content_type<br>\n";
416 if ($filename ne "") {
417 $text.="Filename: $filename\n";
418 }
419 $text.="</strong></p>\n";
420 }
421
422 if ($part_content_type =~ m@text/@)
423 {
424 my $part_text=text_from_part($message_part);
425 if ($part_content_type !~ m@text/(ht|x)ml@) {
426 $part_text=text_into_html($part_text);
427 }
428 if ($part_text eq "") {
429 $part_text='&lt;&lt;empty message&gt;&gt;';
430 }
431 $text.=$part_text;
432 } elsif ($part_content_type =~ m@message/rfc822@) {
433 # This is a forwarded message
434 my $message_part_headers=$part_body;
435 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
436 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
437 my $message_part_body=$1;
438
439 my $rfc822_formatted_body=""; # put result in here
440 if ($message_part_headers =~
441 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*?([^\s]+)?\s*$/ims)
442 {
443 # The message header uses MIME flags
444 my $message_content_type=$1;
445 my $message_content_info=$2;
446 if (!defined($message_content_info)) {
447 $message_content_info="";
448 }
449 $message_content_type =~ tr/A-Z/a-z/;
450 if ($message_content_type =~ /multipart/) {
451 $rfc822_formatted_body=
452 text_from_mime_message($message_content_type,
453 $message_content_info,
454 $message_part_body,
455 $outhandle);
456 } else {
457 $message_part_body=text_from_part($part_body);
458 $rfc822_formatted_body=text_into_html($message_part_body);
459 }
460 } else {
461 # message doesn't use MIME flags
462 $rfc822_formatted_body=text_into_html($message_part_body);
463 }
464 # Add the returned text to the output
465 # don't put all the headers...
466 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
467 $text.=text_into_html($message_part_headers);
468 $text.="<p>\n";
469 $text.=$rfc822_formatted_body;
470 # end of message/rfc822
471 } elsif ($part_content_type =~ /multipart/) {
472 # recurse again
473 $tmptext=text_from_mime_message($part_content_type,
474 $part_content_info,
475 $part_body,
476 $outhandle);
477 $text.=$tmptext;
478 } elsif ($text eq "") {
479 # we can't do anything with this part, but if it's the first
480 # part then make sure it is mentioned..
481
482 $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
483 # add part info header
484 $text.="<br>Type: $part_content_type<br>\n";
485 if ($filename ne "") {
486 $text.="Filename: $filename\n";
487 }
488 $text.="</strong></p>\n";
489 }
490 } # foreach message part.
491 } else {
492 # we can't handle this multipart type (not mixed or alternative)
493 # the RFC also mentions "parallel".
494 }
495 } # end of multipart
496 return $text;
497}
498
499
500
501
502
503
504# Process a MIME part. Return "" if we can't decode it.
505sub text_from_part {
506 my $text=shift;
507 my $part_header=$text;
508 $part_header =~ s/\r?\n\r?\n(.*)$//s;
509 $text=$1; if (!defined($text)) {$text="";}
510 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
511 $part_header =~ /content\-type:\s*([\w\/]+)/is;
512 my $type=$1; if (!defined($type)) {$type="";}
513 my $encoding="";
514 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
515 $encoding=$1; $encoding=~tr/A-Z/a-z/;
516 }
517 # Content-Transfer-Encoding is per-part
518 if ($encoding ne "") {
519 if ($encoding =~ /quoted\-printable/) {
520 $text=qp_decode($text);
521 } elsif ($encoding =~ /base64/) {
522 $text=base64_decode($text);
523 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
524 # rfc2045 also allows binary, which we ignore (for now).
525 # maybe this shouldn't go to stderr, but anyway...
526 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
527 return "";
528 }
529 }
530
531 if ($type eq "text/html") {
532 # only get stuff between <body> tags, or <html> tags.
533 $text =~ s/^.*?<(html|HTML)[^>]*>//s;
534 $text =~ s/<\/(html|HTML)>.*$//s;
535
536 $text =~ s/^.*?<(body|BODY)[^>]*>//s;
537 $text =~ s/<\/(body|BODY)>.*$//s;
538 }
539 elsif ($type eq "text/xml") {
540 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
541 $text="<pre>\n$text\n</pre>\n";
542 }
543 return $text;
544}
545
546
547# decode quoted-printable text
548sub qp_decode {
549 my $text=shift;
550
551 my @lines=split('\n', $text);
552
553 # if a line ends with "=\s*", it is a soft line break, otherwise
554 # keep in any newline characters.
555 foreach my $line (@lines) {
556 if ($line =~ s/=\s*$//) {}
557 else {$line.="\n";}
558
559 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
560 my @hexcode_segments=split('=',$line);
561 shift @hexcode_segments;
562 my @hexcodes;
563 foreach my $hexcode (@hexcode_segments) {
564 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
565 chomp($hexcode); # just in case...
566 my $char=chr (hex "0x$hexcode");
567 $line =~ s/=$hexcode/$char/g;
568 }
569 }
570 }
571 $text= join('', @lines);
572 return $text;
573}
574
575# decode base64 text. This is fairly slow (since it's interpreted perl rather
576# than compiled XS stuff like in the ::MIME modules, but this is more portable
577# for us at least).
578# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
579# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
580# from each encoded byte.
581
582
583sub base64_decode {
584 my $enc_text = shift;
585# A=>0, B=>1, ..., '+'=>62, '/'=>63
586# also '=' is used for padding at the end, but we remove it anyway.
587 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
588# map each MIME char into it's value, for more efficient lookup.
589 my %index;
590 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
591# remove all non-base64 chars. eval to get variable in transliteration...
592# also remove '=' - we'll assume (!!) that there are no errors in the encoding
593 eval "\$enc_text =~ tr|$mimechars||cd";
594 my $decoded="";
595 while (length ($enc_text)>3)
596 {
597 my $fourchars=substr($enc_text,0,4,"");
598 my @chars=(split '',$fourchars);
599 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
600 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
601 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
602 }
603# if there are any input chars left, there are either
604# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
605 my @chars=(split '',$enc_text);
606 if (length($enc_text)) {
607 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
608 }
609 if (length($enc_text)==3) {
610 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
611 }
612 return $decoded;
613}
614
615
616# Perl packages have to return true if they are run.
6171;
Note: See TracBrowser for help on using the repository browser.