source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3721

Last change on this file since 3721 was 3721, checked in by jrm21, 21 years ago

bug where some text/plain messages weren't having < > & properly escaped.
Also some minor code formatting changes for consistency.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.0 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2002 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63
64# 12/05/02 Added usage datastructure - John Thompson
65
66package EMAILPlug;
67
68use SplitPlug;
69
70use unicode;
71
72use sorttools;
73use util;
74
75
76# EMAILPlug is a sub-class of SplitPlug.
77
78sub BEGIN {
79 @ISA = ('SplitPlug');
80}
81
82sub print_usage {
83 print STDERR "\n usage: plugin EMAILPlug [options]\n\n";
84 print STDERR " options:\n";
85 print STDERR " -split_exp A perl regular expression used to split files\n";
86 print STDERR " containing many messages into individual documents.\n\n";
87 print STDERR " -no_attachments Do not save message attachments.\n\n";
88
89}
90
91my $arguments =
92[ { 'name' => "process_exp",
93 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
94 'type' => "string",
95 'reqd' => "no",
96 'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@
97 },
98 { 'name' => "no_attachments",
99 'desc' => "Do not save message attachments.",
100 'type' => "flag",
101 'reqd' => "no"
102 },
103 { 'name' => "block_exp",
104 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
105 'type' => "string",
106 'reqd' => "no",
107 'deft' => q^^}
108];
109
110my $options =
111{ 'name' => "EMAILPlug",
112 'desc' => "Email plug reads email files. These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)",
113 'inherits' => "Yes",
114 'args' => $arguments };
115
116# Create a new EMAILPlug object with which to parse a file.
117# Accomplished by creating a new BasPlug and using bless to
118# turn it into an EMAILPlug.
119
120sub new {
121 my ($class) = @_;
122 my $self = new BasPlug ($class, @_);
123
124 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
125 my $option_list = $self->{'option_list'};
126 push( @{$option_list}, $options );
127
128 if (!parsargv::parse(\@_,
129 q^split_exp/.*/^, \$self->{'split_exp'},
130 q^no_attachments^, \$self->{'ignore_attachments'},
131 "allow_extra_options")) {
132 print STDERR "\nIncorrect options passed to $class.";
133 print STDERR "\nCheck your collect.cfg configuration file\n";
134 die "\n";
135 }
136 $self->{'assoc_filenames'} = {}; # to save attach names so we don't clobber
137
138 # this might not actually be true at read-time, but after processing
139 # it should all be utf8.
140 $self->{'input_encoding'}="utf8";
141 return bless $self, $class;
142}
143
144sub get_default_process_exp {
145 my $self = shift (@_);
146 # mbx/email for mailbox file format, \d+ for maildir (each message is
147 # in a separate file, with a unique number for filename)
148 # mozilla and IE will save individual mbx format files with a ".eml" ext.
149 return q@([\\/]\d+|\.(mbx|email|eml))$@;
150}
151
152# This plugin splits the mbox mail files at lines starting with From<sp>
153# It is supposed to be "\n\nFrom ", but this isn't always used.
154sub get_default_split_exp {
155 return q^\nFrom .*\n^;
156
157}
158
159
160# do plugin specific processing of doc_obj
161sub process {
162
163 my $self = shift (@_);
164 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
165 my $outhandle = $self->{'outhandle'};
166
167 # Check that we're dealing with a valid mail file
168 # mbox message files start with "From "
169 # maildir messages usually start with Return-Path and Delivered-To
170 # mh is very similar to maildir
171 my $startoffile=substr($$textref,0,256);
172 if (($startoffile !~ /^(From )/) &&
173 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
174 return undef;
175 }
176
177
178 print $outhandle "EMAILPlug: processing $file\n"
179 if $self->{'verbosity'} > 1;
180
181 my $cursection = $doc_obj->get_top_section();
182
183 #
184 # Parse the document's text and extract metadata
185 #
186
187 # Protect backslashes
188 $$textref =~ s@\\@\\\\@g;
189
190 # Separate header from body of message
191 my $Headers = $$textref;
192 $Headers =~ s/\r?\n\r?\n(.*)$//s;
193 $$textref = $1;
194
195 # Unfold headers - see rfc822
196 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
197 # Extract basic metadata from header
198 my @headers = ("From", "To", "Subject", "Date");
199 my %raw;
200 foreach my $name (@headers) {
201 $raw{$name} = "No $name value";
202 }
203
204 # Get a default encoding for the header - RFC says should be ascii...
205 my $default_heading_encoding="iso_8859_1";
206
207 # We don't know what character set is the user's default...
208 # We could use textcat to guess... for now we'll look at mime content-type
209# if ($Headers =~ /([[:^ascii:]])/) {
210# }
211 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
212 $default_header_encoding=$1;
213 $default_header_encoding =~ s@\-@_@g;
214 $default_header_encoding =~ tr/A-Z/a-z/;
215 }
216
217
218 # Examine each line of the headers
219 my ($line, $name, $value);
220 my @parts;
221 foreach $line (split(/\n/, $Headers)) {
222
223 # Ignore lines with no content or which begin with whitespace
224 next unless ($line =~ /:/);
225 next if ($line =~ /^\s/);
226
227 # Find out what metadata is on this line
228 @parts = split(/:/, $line);
229 $name = shift @parts;
230 # get fieldname in canonical form - first cap, then lower case.
231 $name =~ tr/A-Z/a-z/;
232 # uppercase the first character according to the current locale
233 $name=~s/(.+)/\u$1/;
234 next unless $name;
235 next unless ($raw{$name});
236
237 # Find the value of that metadata
238 $value = join(":", @parts);
239 $value =~ s/^\s+//;
240 $value =~ s/\s+$//;
241 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
242 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
243 my $original_value=$value;
244 my $encoded=$value;
245 $value="";
246 # we should ignore spaces between consecutive encoded-texts
247 $encoded =~ s@\?=\s+=\?@\?==\?@g;
248 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=//i) {
249 my ($charset, $encoding, $data)=($2,$3,$4);
250 my ($decoded_data);
251 $value.="$1"; # any leading chars
252 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
253 chomp $data;
254 $encoding =~ tr/BQ/bq/;
255 if ($encoding eq "q") { # quoted printable
256 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
257 $decoded_data=qp_decode($data);
258 # qp_decode adds \n, which is default for body text
259 chomp($decoded_data);
260 } else { # base 64
261 $decoded_data=base64_decode($data);
262 }
263 $self->convert2unicode($charset, \$decoded_data);
264 $value .= $decoded_data;
265 } # end of while loop
266
267 # get any trailing characters
268 $self->convert2unicode($default_header_encoding, \$encoded);
269 $value.=$encoded;
270
271 if ($value =~ /^\s*$/) { # we couldn't extract anything...
272 $self->convert2unicode($default_header_encoding,
273 \$original_value);
274 $value=original_value;
275 }
276 } # end of if =?...?=
277
278 # In the absense of other charset information, assume the
279 # header is the default (usually "iso_8859_1") and convert it to unicode.
280 else {
281 $self->convert2unicode($default_header_encoding, \$value);
282 }
283
284 # Store the metadata
285 $raw{$name} = $value;
286 }
287
288 # Extract the name and e-mail address from the From metadata
289 $frommeta = $raw{"From"};
290 my $fromnamemeta;
291 my $fromaddrmeta;
292
293 $frommeta =~ s/\s*$//; # Remove trailing space, if any
294
295 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
296 $fromnamemeta=$1;
297 $fromaddrmeta=$2;
298 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
299 $fromnamemeta=$2;
300 $fromaddrmeta=$1;
301 }
302 if (!defined($fromaddrmeta)) {
303 $fromaddrmeta=$frommeta;
304 }
305 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
306 # minor attempt to prevent spam-bots from harvesting addresses...
307 $fromaddrmeta=~s/@/&#64;/;
308 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
309
310 if (defined($fromnamemeta) && $fromnamemeta) { # must be > 0 long
311 $fromnamemeta =~ s/\"//g; # remove quotes
312 $fromnamemeta =~ s/\s+$//; # remove trailing whitespace
313 }
314 else {
315 $fromnamemeta = $fromaddrmeta;
316 }
317 # if name is an address
318 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
319 $fromnamemeta=~s/@/&#64\;/;
320 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
321
322 $raw{"From"}=$frommeta;
323
324 # Process Date information
325 if ($raw{"Date"} !~ /No Date/) {
326 $raw{"DateText"} = $raw{"Date"};
327
328 # Convert the date text to internal date format
329 $value = $raw{"Date"};
330 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
331 # make some assumptions about the year formatting...
332 # some (old) software thinks 2001 is 101, some think 2001 is 01
333 if ($year < 20) { $year += 2000; } # assume not really 1920...
334 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
335 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
336
337 } else {
338 # We have not extracted a date
339 $raw{"DateText"} = "Unknown.";
340 $raw{"Date"} = "19000000";
341 }
342
343 # Add extracted metadata to document object
344 foreach my $name (keys %raw) {
345 $value = $raw{$name};
346 if ($value) {
347 # assume subject, etc headers have no special HTML meaning.
348 $value = &text_into_html($value);
349 # escape [] so it isn't re-interpreted as metadata
350 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
351 } else {
352 $value = "No $name field";
353 }
354 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
355 }
356
357 my $mimetype="text/plain";
358 my $mimeinfo="";
359 my $charset = $default_header_encoding;
360 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
361 # more than one parameter given to Content-type.
362 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
363 if ($Headers =~ m@^content\-type:\s*([\w\.\-/]+)\s*(\;\s*.+)?\s*$@mi)
364 {
365 $mimetype=$1;
366 $mimetype =~ tr/[A-Z]/[a-z]/;
367
368 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
369 $mimetype = "text/plain";
370 }
371
372 $mimeinfo=$2;
373 if (!defined $mimeinfo) {
374 $mimeinfo="";
375 } else { # strip leading and trailing stuff
376 $mimeinfo =~ s/^\;\s*//;
377 $mimeinfo =~ s/\s*$//;
378 }
379 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
380 $charset = $1;
381 }
382 }
383
384 my $transfer_encoding="7bit";
385 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
386 $transfer_encoding=$1;
387 }
388 if ($mimetype eq "text/html") {
389 $$textref= $self->text_from_part("$Headers\n$$textref");
390 } elsif ($mimetype ne "text/plain") {
391 $self->{'doc_obj'} = $doc_obj; # in case we need to associate files...
392 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
393 $outhandle);
394 } elsif ($transfer_encoding =~ /quoted\-printable/) {
395 $$textref=qp_decode($$textref);
396 $self->convert2unicode($charset, $textref);
397 } elsif ($transfer_encoding =~ /base64/) {
398 $$textref=base64_decode($$textref);
399 $self->convert2unicode($charset, $textref);
400 } else {
401 $self->convert2unicode($charset, $textref);
402 }
403
404
405 # Add "All headers" metadata
406 $Headers = &text_into_html($Headers);
407
408 $Headers = "No headers" unless ($Headers =~ /\w/);
409 $Headers =~ s/@/&#64\;/g;
410 # escape [] so it isn't re-interpreted as metadata
411 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
412 $self->convert2unicode($charset, \$Headers);
413 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
414
415
416 # Add Title metadata
417 my $Title = text_into_html($raw{'Subject'});
418 $Title .= "<br>From: " . text_into_html($raw{'From'});
419 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
420 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
421
422 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
423
424
425 # Add text to document object
426 if ($mimetype eq "text/plain") {
427 $$textref = &text_into_html($$textref);
428 }
429 $$textref = "No message" unless ($$textref =~ /\w/);
430 $doc_obj->add_utf8_text($cursection, $$textref);
431
432 return 1;
433}
434
435
436# Convert a text string into HTML.
437#
438# The HTML is going to be inserted into a GML file, so
439# we have to be careful not to use symbols like ">",
440# which ocurs frequently in email messages (and use
441# &gt instead.
442#
443# This function also turns links and email addresses into hyperlinks,
444# and replaces carriage returns with <BR> tags (and multiple carriage
445# returns with <P> tags).
446
447
448sub text_into_html {
449 my ($text) = @_;
450
451 # Convert problem characters into HTML symbols
452 $text =~ s/&/&amp;/g;
453 $text =~ s/</&lt;/g;
454 $text =~ s/>/&gt;/g;
455 $text =~ s/\"/&quot;/g;
456
457 $text =~ s@_@\\_@g; # protect against greenstone macros...
458
459 # convert email addresses and URIs into links
460# don't markup email addresses for now
461# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
462
463 # try to munge email addresses a little bit...
464 $text =~ s/@/&#64;/;
465 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
466 # assume URI doesn't finish with a '.'
467 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*(\#[\w\.\-_]*)?)@<a href=\"$1\">$1<\/a>@g;
468
469
470 # Clean up whitespace and convert \n charaters to <BR> or <P>
471 $text =~ s/ +/ /g;
472 $text =~ s/\s*$//g;
473 $text =~ s/^\s*//g;
474 $text =~ s/\n/\n<br>/g;
475 $text =~ s/<br>\s*<br>/<p>/gi;
476
477 return $text;
478}
479
480
481
482
483#Process a MIME message.
484# the textref we are given DOES NOT include the header.
485sub text_from_mime_message {
486 my $self = shift(@_);
487 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
488
489 # Check for multiparts - $mimeinfo will be a boundary
490 if ($mimetype =~ /multipart/) {
491 $boundary="";
492 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
493 $boundary=$1;
494 if ($boundary =~ m@^\"@) {
495 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
496 }
497 } else {
498 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
499 }
500 # parts start with "--$boundary"
501 # message ends with "--$boundary--"
502 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
503 # that perl might want to interpolate. Also allows spaces...
504 $boundary=~s/\\/\\\\/g;
505 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
506 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
507 # remove first "part" and last "part" (final --)
508 shift @message_parts;
509 my $last=pop @message_parts;
510 # if our boundaries are a bit dodgy and we only found 1 part...
511 if (!defined($last)) {$last="";}
512 # make sure it is only -- and whitespace
513 if ($last !~ /^\-\-\s*$/ms) {
514 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
515 }
516 foreach my $message_part (@message_parts) {
517 # remove the leading newline left from split.
518 $message_part=~s/^\r?\n//;
519 }
520 if ($mimetype eq "multipart/alternative") {
521 # check for an HTML version first, then TEXT, otherwise use first.
522 my $part_text="";
523 foreach my $message_part (@message_parts) {
524 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
525 {
526 # Use the HTML version
527 $part_text= $self->text_from_part($message_part);
528 $mimetype="text/html";
529 last;
530 }
531 }
532 if ($part_text eq "") { # try getting a text part instead
533 foreach my $message_part (@message_parts) {
534 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
535 {
536 # Use the plain version
537 $part_text= $self->text_from_part($message_part);
538 if ($part_text =~/[^\s]/) {
539 $part_text = text_into_html($part_text);
540 }
541 $mimetype="text/plain";
542 last;
543 }
544 }
545 }
546 if ($part_text eq "") { #use first part (no html/text part found)
547 $part_text = $self->text_from_part(shift @message_parts);
548 $part_text = text_into_html($part_text);
549 }
550 if ($part_text eq "") { # we couldn't get anything!!!
551 # or it was an empty message...
552 # do nothing...
553 print $outhandle "EMAILPlug: no text - empty body?\n";
554 } else {
555 $text = $part_text;
556 }
557 } elsif ($mimetype =~ m@multipart/(mixed|digest|related|signed)@) {
558 $text = "";
559 # signed is for PGP/GPG messages... the last part is a hash
560 if ($mimetype =~ m@multipart/signed@) {
561 pop @message_parts;
562 }
563 foreach my $message_part (@message_parts) {
564 my $part_header=$message_part;
565 my $part_body;
566 if ($message_part=~ /^\s*\n/) {
567 # no header... use defaults
568 $part_body=$message_part;
569 $part_header="Content-type: text/plain; charset=us-ascii";
570 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//s) {
571 $part_body=$1;
572 } else {
573 # something's gone wrong...
574 $part_header="";
575 $part_body=$message_part;
576 }
577
578 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
579 my $part_content_type="";
580 my $part_content_info="";
581 if ($mimetype eq "multipart/digest") {
582 # default type - RTFRFC!!
583 $part_content_type="message/rfc822";
584 }
585 if ($part_header =~ m@^content\-type:\s*([\w\.\-/]+)\s*\;?\s*(.*?)\s*$@mi) {
586 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
587 $part_content_info=$2;
588 }
589 my $filename="";
590 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
591 $filename=$1;
592 }
593
594 # disposition - either inline or attachment.
595 # NOT CURRENTLY USED - we display all text types instead...
596 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
597
598 # add <<attachment>> to each part except the first...
599 if ($text ne "") {
600 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
601 # add part info header
602 $text.="<br>Type: $part_content_type<br>\n";
603 if ($filename ne "") {
604 $text.="Filename: $filename\n";
605 }
606 $text.="</strong></p>\n";
607 }
608 if ($part_content_type =~ m@text/@)
609 {
610 my $part_text= $self->text_from_part($message_part);
611 if ($part_content_type !~ m@text/(ht|x)ml@) {
612 $part_text = text_into_html($part_text);
613 }
614 if ($part_text eq "") {
615 $part_text = '&lt;&lt;empty message&gt;&gt;';
616 }
617 $text .= $part_text;
618 } elsif ($part_content_type =~ m@message/rfc822@) {
619 # This is a forwarded message
620 my $message_part_headers=$part_body;
621 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
622 my $message_part_body=$1;
623 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
624
625 my $rfc822_formatted_body=""; # put result in here
626 if ($message_part_headers =~
627 /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.*?)\s*$/ims)
628 {
629 # The message header uses MIME flags
630 my $message_content_type=$1;
631 my $message_content_info=$2;
632 if (!defined($message_content_info)) {
633 $message_content_info="";
634 }
635 $message_content_type =~ tr/A-Z/a-z/;
636 if ($message_content_type =~ /multipart/) {
637 $rfc822_formatted_body=
638 $self->text_from_mime_message($message_content_type,
639 $message_content_info,
640 $message_part_body,
641 $outhandle);
642 } else {
643 $message_part_body= $self->text_from_part($part_body);
644 $rfc822_formatted_body=text_into_html($message_part_body);
645 }
646 } else {
647 # message doesn't use MIME flags
648 $rfc822_formatted_body=text_into_html($message_part_body);
649 }
650 # Add the returned text to the output
651 # don't put all the headers...
652# $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
653 my $brief_headers=get_brief_headers($message_part_headers);
654 $text.=text_into_html($brief_headers);
655 $text.="<p>\n";
656 $text.=$rfc822_formatted_body;
657 $text.="</p>\n";
658 # end of message/rfc822
659 } elsif ($part_content_type =~ /multipart/) {
660 # recurse again
661
662 $tmptext= $self->text_from_mime_message($part_content_type,
663 $part_content_info,
664 $part_body,
665 $outhandle);
666 $text.=$tmptext;
667 } else {
668 # this part isn't text/* or another message...
669 if ($text eq "") {
670 # this is the first part of a multipart, or only part!
671 $text="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
672 # add part info header
673 $text.="<br>Type: $part_content_type<br>\n";
674 $text.="Filename: $filename</strong></p>\n";
675 }
676
677 # save attachment by default
678 if (!$self->{'ignore_attachments'}
679 && $filename ne "") { # this part has a file...
680 my $encoding="8bit";
681 if ($part_header =~
682 /^content-transfer-encoding:\s*(\w+)/mi ) {
683 $encoding=$1; $encoding =~ tr/A-Z/a-z/;
684 }
685 my $tmpdir=$ENV{'GSDLHOME'} . "/tmp";
686 my $save_filename="$filename";
687
688 # make sure we don't clobber files with same name;
689 # need to keep state between .mbx files
690 my $assoc_files=$self->{'assoc_filenames'};
691 if ($assoc_files{$filename}) { # it's been set...
692 $assoc_files{$filename}++;
693 $filename =~ m/(.+)\.(\w+)$/;
694 my ($filestem, $ext)=($1,$2);
695 $save_filename="${filestem}_"
696 . $assoc_files{$filename} . ".$ext";
697 } else { # first file with this name
698 $assoc_files{$filename}=1;
699 }
700 open (SAVE, ">$tmpdir/$save_filename") ||
701 warn "EMAILPlug: Can't save attachment as $tmpdir/$save_filename: $!";
702 $part_text = $message_part;
703 $part_text =~ s/(.*?)\r?\n\r?\n//s; # remove header
704 if ($encoding eq "base64") {
705 print SAVE base64_decode($part_text);
706 } elsif ($encoding eq "quoted-printable") {
707 print SAVE qp_decode($part_text);
708 } else { # 7bit, 8bit, binary, etc...
709 print SAVE $part_text;
710 }
711 close SAVE;
712 my $doc_obj=$self->{'doc_obj'};
713 $doc_obj->associate_file("$tmpdir/$save_filename",
714 "$save_filename",
715 $part_content_type # mimetype
716 );
717 # clean up tmp area...
718 # Can't do this as it hasn't been copied/linked yet!!!
719# &util::rm("$tmpdir/$save_filename");
720 print $outhandle "EMAILPlug: saving attachment \"$filename\"\n"; #
721
722 # be nice if "download" was a translatable macro :(
723 $text .="<a href=\"_httpdocimg_/$save_filename\">download</a>";
724 } # end of save attachment
725 } # end of !text/message part
726 } # foreach message part.
727 } else {
728 # we can't handle this multipart type (not mixed or alternative)
729 # the RFC also mentions "parallel".
730 }
731 } # end of ($mimetype =~ multipart)
732 elsif ($mimetype =~ m@message/rfc822@) {
733 my $msg_header = $text;
734 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
735 $text = $1;
736
737 if ($msg_header =~ /^content\-type:\s*([\w\.\-\/]+)\s*\;?\s*(.+?)\s*$/mi)
738 {
739 $mimetype=$1;
740 $mimetype =~ tr/[A-Z]/[a-z]/;
741 $mimeinfo=$2;
742 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
743 $charset = $1;
744 }
745 my $msg_text;
746 if ($mimetype =~ m@multipart/@) {
747 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
748 $text, $outhandle);
749 } else {$msg_text=$text;}
750
751 my $brief_header=text_into_html(get_brief_headers($msg_header));
752 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
753 $text.= "<table><tr><td width=\"5%\"> </td>\n";
754 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
755 . "</td></tr></table>";
756 }
757 } else {
758 # we don't do any processing of the content.
759 }
760
761 return $text;
762}
763
764
765# Return only the "important" headers from a set of message headers
766sub get_brief_headers {
767 my $msg_header = shift;
768 my $brief_header = "";
769
770 # Order matters!
771 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1\n";}
772 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1\n";}
773 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1\n";}
774 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1\n";}
775 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1\n";}
776
777 return $brief_header;
778}
779
780
781# Process a MIME part. Return "" if we can't decode it.
782sub text_from_part {
783 my $self = shift;
784 my $text = shift || '';
785 my $part_header = $text;
786
787 # check for empty part header (leading blank line)
788 if ($text =~ /^\s*\r?\n/) {
789 $part_header="Content-type: text/plain; charset=us-ascii";
790 } else {
791 $part_header =~ s/\r?\n\r?\n(.*)$//s;
792 $text=$1; if (!defined($text)) {$text="";}
793 }
794 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
795 $part_header =~ /content\-type:\s*([\w\.\-\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
796 my $type=$1;
797 my $charset=$2;
798 if (!defined($type)) {$type="";}
799 if (!defined($charset)) {$charset="ascii";}
800 my $encoding="";
801 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
802 $encoding=$1; $encoding=~tr/A-Z/a-z/;
803 }
804 # Content-Transfer-Encoding is per-part
805 if ($encoding ne "") {
806 if ($encoding =~ /quoted\-printable/) {
807 $text=qp_decode($text);
808 } elsif ($encoding =~ /base64/) {
809 $text=base64_decode($text);
810 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
811 # rfc2045 also allows binary, which we ignore (for now).
812 my $outhandle=$self->{'outhandle'};
813 print $outhandle "EMAILPlug: unknown transfer encoding: $encoding\n";
814 return "";
815 }
816 }
817 if ($type eq "text/html") {
818 # only get stuff between <body> tags, or <html> tags.
819 $text =~ s@^.*<html[^>]*>@@is;
820 $text =~ s@</html>.*$@@is;
821 $text =~ s/^.*?<body[^>]*>//si;
822 $text =~ s/<\/body>.*$//si;
823 }
824 elsif ($type eq "text/xml") {
825 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
826 $text="<pre>\n$text\n</pre>\n";
827 }
828 # convert to unicode
829 $self->convert2unicode($charset, \$text);
830 return $text;
831}
832
833
834# decode quoted-printable text
835sub qp_decode {
836 my $text=shift;
837
838 # if a line ends with "=\s*", it is a soft line break, otherwise
839 # keep in any newline characters.
840
841 $text =~ s/=\s*\r?\n//mg;
842 $text =~ s/=([0-9A-Fa-f]{2})/chr (hex "0x$1")/eg;
843 return $text;
844}
845
846# decode base64 text. This is fairly slow (since it's interpreted perl rather
847# than compiled XS stuff like in the ::MIME modules, but this is more portable
848# for us at least).
849# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
850# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
851# from each encoded byte.
852
853
854sub base64_decode {
855 my $enc_text = shift;
856# A=>0, B=>1, ..., '+'=>62, '/'=>63
857# also '=' is used for padding at the end, but we remove it anyway.
858 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
859# map each MIME char into it's value, for more efficient lookup.
860 my %index;
861 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
862# remove all non-base64 chars. eval to get variable in transliteration...
863# also remove '=' - we'll assume (!!) that there are no errors in the encoding
864 eval "\$enc_text =~ tr|$mimechars||cd";
865 my $decoded="";
866 while (length ($enc_text)>3)
867 {
868 my $fourchars=substr($enc_text,0,4,"");
869 my @chars=(split '',$fourchars);
870 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
871 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
872 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
873 }
874# if there are any input chars left, there are either
875# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
876 my @chars=(split '',$enc_text);
877 if (length($enc_text)) {
878 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
879 }
880 if (length($enc_text)==3) {
881 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
882 }
883 return $decoded;
884}
885
886sub convert2unicode {
887 my $self = shift(@_);
888 my ($charset, $textref) = @_;
889
890 # first get our character encoding name in the right form.
891 $charset = "iso_8859_1" unless defined $charset;
892 $charset=~tr/A-Z/a-z/;
893 $charset=~s/\-/_/g;
894 $charset=~s/gb2312/gb/;
895 # assumes EUC-KR, not ISO-2022 !?
896 $charset=~s/ks_c_5601_1987/korean/;
897
898 if ($charset eq "utf_8") {
899 # nothing to do!
900 return;
901 }
902
903 # It appears that we can't always trust ascii text so we'll treat it
904 # as iso-8859-1 (letting characters above 0x80 through without
905 # converting them to utf-8 will result in invalid XML documents
906 # which can't be parsed at build time).
907 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
908
909 if ($charset eq "iso_8859_1") {
910 # test if the mailer lied, and it has win1252 chars in it...
911 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
912 if ($$textref =~ m/[\x80-\x9f]/) {
913 my $outhandle = $self->{'outhandle'};
914 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
915 print $outhandle "codepage 1252 detected.\n";
916 $charset = "windows_1252";
917 }
918 }
919 my $utf8_text=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
920
921 if ($utf8_text ne "") {
922 $$textref=$utf8_text;
923 } else {
924 # we didn't get any text... unsupported encoding perhaps? Or it is
925 # empty anyway. We'll try to continue, assuming 8859-1. We could strip
926 # characters out here if this causes problems...
927 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1",$textref));
928
929 }
930}
931
932
933# Perl packages have to return true if they are run.
9341;
Note: See TracBrowser for help on using the repository browser.