source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3136

Last change on this file since 3136 was 3136, checked in by paynter, 20 years ago

Reconciled John's version of my changes to EMAILPlug with my version
of my changes. It turns out they differed only in formatting and
documentation.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.6 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter (gwp@cs.waikato.ac.nz)
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63package EMAILPlug;
64
65use SplitPlug;
66
67use unicode;
68
69use sorttools;
70use util;
71
72
73# EMAILPlug is a sub-class of SplitPlug.
74
75sub BEGIN {
76 @ISA = ('SplitPlug');
77}
78
79# Create a new EMAILPlug object with which to parse a file.
80# Accomplished by creating a new BasPlug and using bless to
81# turn it into an EMAILPlug.
82
83sub new {
84 my ($class) = @_;
85 my $self = new BasPlug ("EMAILPlug", @_);
86 # this might not actually be true at read-time, but after processing
87 # it should all be utf8.
88 $self->{'input_encoding'}="utf8";
89 return bless $self, $class;
90}
91
92sub get_default_process_exp {
93 my $self = shift (@_);
94 # mbx/email for mailbox file format, \d+ for maildir (each message is
95 # in a separate file, with a unique number for filename)
96 # mozilla and IE will save individual mbx format files with a ".eml" ext.
97 return q@([\\/]\d+|\.(mbx|email|eml))$@;
98}
99
100# This plugin splits the mbox mail files at lines starting with From<sp>
101# It is supposed to be "\n\nFrom ", but this isn't always used.
102sub get_default_split_exp {
103 return q^\nFrom .*\n^;
104}
105
106
107# do plugin specific processing of doc_obj
108sub process {
109
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid mail file
115 # mbox message files start with "From "
116 # maildir messages usually start with Return-Path and Delivered-To
117 # mh is very similar to maildir
118 my $startoffile=substr($$textref,0,256);
119 if (($startoffile !~ /^(From )/) &&
120 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
121 return undef;
122 }
123
124
125 print $outhandle "EMAILPlug: processing $file\n"
126 if $self->{'verbosity'} > 1;
127
128 my $cursection = $doc_obj->get_top_section();
129
130 #
131 # Parse the document's text and extract metadata
132 #
133
134 # Protect backslashes
135 $$textref =~ s@\\@\\\\@g;
136
137 # Separate header from body of message
138 my $Headers = $$textref;
139 $Headers =~ s/\r?\n\r?\n(.*)$//s;
140 $$textref = $1;
141
142 # Unfold headers - see rfc822
143 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
144 # Extract basic metadata from header
145 my @headers = ("From", "To", "Subject", "Date");
146 my %raw;
147 foreach my $name (@headers) {
148 $raw{$name} = "No $name value";
149 }
150
151 # Get a default encoding for the header - RFC says should be ascii...
152 my $default_heading_encoding="iso_8859_1";
153
154 # We don't know what character set is the user's default...
155 # We could use textcat to guess... for now we'll look at mime content-type
156# if ($Headers =~ /([[:^ascii:]])/) {
157# }
158 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159 $default_header_encoding=$1;
160 $default_header_encoding =~ s@\-@_@g;
161 $default_header_encoding =~ tr/A-Z/a-z/;
162 }
163
164
165 # Examine each line of the headers
166 my ($line, $name, $value);
167 my @parts;
168 foreach $line (split(/\n/, $Headers)) {
169
170 # Ignore lines with no content or which begin with whitespace
171 next unless ($line =~ /:/);
172 next if ($line =~ /^\s/);
173
174 # Find out what metadata is on this line
175 @parts = split(/:/, $line);
176 $name = shift @parts;
177 # get fieldname in canonical form - first cap, then lower case.
178 $name =~ tr/A-Z/a-z/;
179 # uppercase the first character according to the current locale
180 $name=~s/(.+)/\u$1/;
181 next unless $name;
182 next unless ($raw{$name});
183
184 # Find the value of that metadata
185 $value = join(":", @parts);
186 $value =~ s/^\s+//;
187 $value =~ s/\s+$//;
188
189 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
190 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
191 my $original_value=$value;
192 my $encoded=$value;
193 $value="";
194 # this isn't quite right yet regarding spaces between encoded-texts
195 # (see examples, section 8. of rfc).
196 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
197 my ($charset, $encoding, $data)=($2,$3,$4);
198 my ($decoded_data);
199 $value.="$1"; # any leading chars
200 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
201 chomp $data;
202 $encoding =~ tr/BQ/bq/;
203 if ($encoding eq "q") { # quoted printable
204 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
205 $decoded_data=qp_decode($data);
206 } else { # base 64
207 $decoded_data=base64_decode($data);
208 }
209 $self->convert2unicode($charset, \$decoded_data);
210 $value .= $decoded_data;
211 } # end of while loop
212
213 # get any trailing characters
214 $self->convert2unicode($default_header_encoding, \$encoded);
215 $value.=$encoded;
216
217 if ($value =~ /^\s*$/) { # we couldn't extract anything...
218 $self->convert2unicode($default_header_encoding,
219 \$original_value);
220 $value=original_value;
221 }
222 } # end of if =?...?=
223
224 # In the absense of other charset information, assume the
225 # header is the default (usually "iso_8859_1") and convert it to unicode.
226 else {
227 $self->convert2unicode($default_header_encoding, \$value);
228 }
229
230 # Store the metadata
231 $raw{$name} = $value;
232 }
233
234 # Extract the name and e-mail address from the From metadata
235 $frommeta = $raw{"From"};
236 my $fromnamemeta;
237 my $fromaddrmeta;
238
239 $frommeta =~ s/\s*$//; # Remove trailing space, if any
240
241 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
242 $fromnamemeta=$1;
243 $fromaddrmeta=$2;
244 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
245 $fromnamemeta=$2;
246 $fromaddrmeta=$1;
247 }
248 if (!defined($fromaddrmeta)) {
249 $fromaddrmeta=$frommeta;
250 }
251 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
252 # minor attempt to prevent spam-bots from harvesting addresses...
253 $fromaddrmeta=~s/@/&#64;/;
254 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
255
256 if (defined($fromnamemeta)) {
257 $fromnamemeta =~ s/\"//g;
258 }
259 else {
260 $fromnamemeta = $fromaddrmeta;
261 }
262 # if name is an address
263 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
264 $fromnamemeta=~s/@/&#64\;/;
265 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
266
267 $raw{"From"}=$frommeta;
268
269 # Process Date information
270 if ($raw{"Date"} !~ /No Date/) {
271 $raw{"DateText"} = $raw{"Date"};
272
273 # Convert the date text to internal date format
274 $value = $raw{"Date"};
275 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
276 if ($year < 100) { $year += 1900; }
277 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
278
279 } else {
280 # We have not extracted a date
281 $raw{"DateText"} = "Unknown.";
282 $raw{"Date"} = "19000000";
283 }
284
285 # Add extracted metadata to document object
286 foreach my $name (keys %raw) {
287 $value = $raw{$name};
288 if ($value) {
289 # assume subject, etc headers have no special HTML meaning.
290 $value = &text_into_html($value);
291 # escape [] so it isn't re-interpreted as metadata
292 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
293 } else {
294 $value = "No $name field";
295 }
296 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
297 }
298
299 my $mimetype="text/plain";
300 my $mimeinfo="";
301 my $charset = $default_header_encoding;
302 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
303 # more than one parameter given to Content-type.
304 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
305 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
306 {
307 $mimetype=$1;
308 $mimetype =~ tr/[A-Z]/[a-z]/;
309
310 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
311 $mimetype = "text/plain";
312 }
313
314 $mimeinfo=$2;
315 if (!defined $mimeinfo) {
316 $mimeinfo="";
317 } else { # strip leading and trailing stuff
318 $mimeinfo =~ s/^\;\s*//;
319 $mimeinfo =~ s/\s*$//;
320 }
321 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
322 $charset = $1;
323 }
324 }
325
326 my $transfer_encoding="7bit";
327 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
328 $transfer_encoding=$1;
329 }
330 if ($mimetype eq "text/html") {
331 $$textref= $self->text_from_part("$Headers\n$$textref");
332 } elsif ($mimetype ne "text/plain") {
333 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
334 $outhandle);
335 } elsif ($transfer_encoding =~ /quoted\-printable/) {
336 $$textref=qp_decode($$textref);
337 $self->convert2unicode($charset, $textref);
338 } elsif ($transfer_encoding =~ /base64/) {
339 $$textref=base64_decode($$textref);
340 $self->convert2unicode($charset, $textref);
341 } else {
342 $self->convert2unicode($charset, $textref);
343 }
344
345
346 # Add "All headers" metadata
347 $Headers = &text_into_html($Headers);
348
349 $Headers = "No headers" unless ($Headers =~ /\w/);
350 $Headers =~ s/@/&#64\;/g;
351 # escape [] so it isn't re-interpreted as metadata
352 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
353 $self->convert2unicode($charset, \$Headers);
354 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
355
356
357 # Add Title metadata
358 my $Title = text_into_html($raw{'Subject'});
359 $Title .= "<br>From: " . text_into_html($raw{'From'});
360 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
361 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
362
363 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
364
365
366 # Add text to document object
367 if ($mimetype eq "text/plain") {
368 $$textref = &text_into_html($$textref);
369 }
370 $$textref = "No message" unless ($$textref =~ /\w/);
371 $doc_obj->add_utf8_text($cursection, $$textref);
372
373 return 1;
374}
375
376
377# Convert a text string into HTML.
378#
379# The HTML is going to be inserted into a GML file, so
380# we have to be careful not to use symbols like ">",
381# which ocurs frequently in email messages (and use
382# &gt instead.
383#
384# This function also turns links and email addresses into hyperlinks,
385# and replaces carriage returns with <BR> tags (and multiple carriage
386# returns with <P> tags).
387
388
389sub text_into_html {
390 my ($text) = @_;
391
392 # Convert problem characters into HTML symbols
393 $text =~ s/&/&amp;/g;
394 $text =~ s/</&lt;/g;
395 $text =~ s/>/&gt;/g;
396 $text =~ s/\"/&quot;/g;
397
398 # convert email addresses and URIs into links
399# don't markup email addresses for now
400# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
401
402 # try to munge email addresses a little bit...
403 $text =~ s/@/&#64;/;
404 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
405 # assume URI doesn't finish with a '.'
406 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
407
408
409 # Clean up whitespace and convert \n charaters to <BR> or <P>
410 $text =~ s/ +/ /g;
411 $text =~ s/\s*$//g;
412 $text =~ s/^\s*//g;
413 $text =~ s/\n/\n<br>/g;
414 $text =~ s/<br>\s*<br>/<p>/gi;
415
416 return $text;
417}
418
419
420
421
422#Process a MIME message.
423# the textref we are given DOES NOT include the header.
424sub text_from_mime_message {
425 my $self = shift(@_);
426 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
427
428 # Check for multiparts - $mimeinfo will be a boundary
429 if ($mimetype =~ /multipart/) {
430 $boundary="";
431 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
432 $boundary=$1;
433 if ($boundary =~ m@^\"@) {
434 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
435 }
436 } else {
437 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
438 }
439 # parts start with "--$boundary"
440 # message ends with "--$boundary--"
441 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
442 # that perl might want to interpolate. Also allows spaces...
443 $boundary=~s/\\/\\\\/g;
444 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
445 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
446 # remove first "part" and last "part" (final --)
447 shift @message_parts;
448 my $last=pop @message_parts;
449 # if our boundaries are a bit dodgy and we only found 1 part...
450 if (!defined($last)) {$last="";}
451 # make sure it is only -- and whitespace
452 if ($last !~ /^\-\-\s*$/ms) {
453 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
454 }
455 foreach my $message_part (@message_parts) {
456 # remove the leading newline left from split.
457 $message_part=~s/^\r?\n//;
458 }
459 if ($mimetype eq "multipart/alternative") {
460 # check for an HTML version first, then TEXT, otherwise use first.
461 my $part_text="";
462 foreach my $message_part (@message_parts) {
463 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
464 {
465 # Use the HTML version
466 $part_text= $self->text_from_part($message_part);
467 $mimetype="text/html";
468 last;
469 }
470 }
471 if ($part_text eq "") { # try getting a text part instead
472 foreach my $message_part (@message_parts) {
473 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
474 {
475 # Use the plain version
476 $part_text= $self->text_from_part($message_part);
477 if ($part_text =~/[^\s]/) {
478 $part_text="<pre>".$part_text."</pre>";
479 }
480 $mimetype="text/plain";
481 last;
482 }
483 }
484 }
485 if ($part_text eq "") { # use first part
486 $part_text= $self->text_from_part(shift @message_parts);
487 }
488 if ($part_text eq "") { # we couldn't get anything!!!
489 # or it was an empty message...
490 # do nothing...
491 print $outhandle "EMAILPlug: no text - empty body?\n";
492 } else {
493 $text=$part_text;
494 }
495 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
496 $text="";
497 foreach my $message_part (@message_parts) {
498 my $part_header=$message_part;
499 my $part_body;
500 if ($message_part=~ /^\s*\n/) {
501 # no header... use defaults
502 $part_body=$message_part;
503 $part_header="Content-type: text/plain; charset=us-ascii";
504 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
505 $part_body=$1;
506 } else {
507 # something's gone wrong...
508 $part_header="";
509 $part_body=$message_part;
510 }
511
512 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
513 my $part_content_type="";
514 my $part_content_info="";
515 if ($mimetype eq "multipart/digest") {
516 # default type - RTFRFC!!
517 $part_content_type="message/rfc822";
518 }
519 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
520 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
521 $part_content_info=$2;
522 }
523 my $filename="";
524 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
525 $filename=$1;
526 }
527
528 # disposition - either inline or attachment.
529 # NOT CURRENTLY USED - we display all text types instead...
530 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
531
532 # add <<attachment>> to each part except the first...
533 if ($text ne "") {
534 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
535 # add part info header
536 $text.="<br>Type: $part_content_type<br>\n";
537 if ($filename ne "") {
538 $text.="Filename: $filename\n";
539 }
540 $text.="</strong></p>\n";
541 }
542
543 if ($part_content_type =~ m@text/@)
544 {
545 my $part_text= $self->text_from_part($message_part);
546 if ($part_content_type !~ m@text/(ht|x)ml@) {
547 $part_text=text_into_html($part_text);
548 }
549 if ($part_text eq "") {
550 $part_text='&lt;&lt;empty message&gt;&gt;';
551 }
552 $text.=$part_text;
553 } elsif ($part_content_type =~ m@message/rfc822@) {
554 # This is a forwarded message
555 my $message_part_headers=$part_body;
556 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
557 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
558 my $message_part_body=$1;
559
560 my $rfc822_formatted_body=""; # put result in here
561 if ($message_part_headers =~
562 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
563 {
564 # The message header uses MIME flags
565 my $message_content_type=$1;
566 my $message_content_info=$2;
567 if (!defined($message_content_info)) {
568 $message_content_info="";
569 }
570 $message_content_type =~ tr/A-Z/a-z/;
571 if ($message_content_type =~ /multipart/) {
572 $rfc822_formatted_body=
573 $self->text_from_mime_message($message_content_type,
574 $message_content_info,
575 $message_part_body,
576 $outhandle);
577 } else {
578 $message_part_body= $self->text_from_part($part_body);
579 $rfc822_formatted_body=text_into_html($message_part_body);
580 }
581 } else {
582 # message doesn't use MIME flags
583 $rfc822_formatted_body=text_into_html($message_part_body);
584 }
585 # Add the returned text to the output
586 # don't put all the headers...
587 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
588 $text.=text_into_html($message_part_headers);
589 $text.="<p>\n";
590 $text.=$rfc822_formatted_body;
591 # end of message/rfc822
592 } elsif ($part_content_type =~ /multipart/) {
593 # recurse again
594
595 $tmptext= $self->text_from_mime_message($part_content_type,
596 $part_content_info,
597 $part_body,
598 $outhandle);
599 $text.=$tmptext;
600 } elsif ($text eq "") {
601 # we can't do anything with this part, but if it's the first
602 # part then make sure it is mentioned..
603
604 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
605 # add part info header
606 $text.="<br>Type: $part_content_type<br>\n";
607 if ($filename ne "") {
608 $text.="Filename: $filename\n";
609 }
610 $text.="</strong></p>\n";
611 }
612 } # foreach message part.
613 } else {
614 # we can't handle this multipart type (not mixed or alternative)
615 # the RFC also mentions "parallel".
616 }
617 } # end of ($mimetype !~ multipart)
618 elsif ($mimetype =~ m@message/rfc822@) {
619 my $msg_header = $text;
620 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
621 $text = $1;
622
623 if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
624 {
625 $mimetype=$1;
626 $mimetype =~ tr/[A-Z]/[a-z]/;
627 $mimeinfo=$2;
628 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
629 $charset = $1;
630 }
631 my $msg_text;
632 if ($mimetype =~ m@multipart/@) {
633 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
634 $text, $outhandle);
635 } else {$msg_text=$text;}
636
637 my $brief_header="";
638 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
639 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
640 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
641 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
642 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
643 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
644 $text.= "<table><tr><td width=\"5%\"> </td>\n";
645 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
646 . "</td></tr></table>";
647 }
648 } else {
649 # we don't do any processing of the content.
650 }
651
652 return $text;
653}
654
655
656
657
658
659
660# Process a MIME part. Return "" if we can't decode it.
661sub text_from_part {
662 my $self = shift;
663 my $text = shift || '';
664 my $part_header = $text;
665
666 # check for empty part header (leading blank line)
667 if ($text =~ /^\s*\r?\n/) {
668 $part_header="Content-type: text/plain; charset=us-ascii";
669 } else {
670 $part_header =~ s/\r?\n\r?\n(.*)$//s;
671 $text=$1; if (!defined($text)) {$text="";}
672 }
673 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
674 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
675 my $type=$1;
676 my $charset=$2;
677 if (!defined($type)) {$type="";}
678 if (!defined($charset)) {$charset="ascii";}
679 my $encoding="";
680 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
681 $encoding=$1; $encoding=~tr/A-Z/a-z/;
682 }
683 # Content-Transfer-Encoding is per-part
684 if ($encoding ne "") {
685 if ($encoding =~ /quoted\-printable/) {
686 $text=qp_decode($text);
687 } elsif ($encoding =~ /base64/) {
688 $text=base64_decode($text);
689 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
690 # rfc2045 also allows binary, which we ignore (for now).
691 # maybe this shouldn't go to stderr, but anyway...
692 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
693 return "";
694 }
695 }
696 if ($type eq "text/html") {
697 # only get stuff between <body> tags, or <html> tags.
698 $text =~ s@^.*<html[^>]*>@@is;
699 $text =~ s@</html>.*$@@is;
700 $text =~ s/^.*?<body[^>]*>//si;
701 $text =~ s/<\/body>.*$//si;
702 }
703 elsif ($type eq "text/xml") {
704 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
705 $text="<pre>\n$text\n</pre>\n";
706 }
707 # convert to unicode
708 $self->convert2unicode($charset, \$text);
709 return $text;
710}
711
712
713# decode quoted-printable text
714sub qp_decode {
715 my $text=shift;
716
717 my @lines=split('\n', $text);
718
719 # if a line ends with "=\s*", it is a soft line break, otherwise
720 # keep in any newline characters.
721 foreach my $line (@lines) {
722 if ($line !~ s/=\s*$//) {$line.="\n";}
723
724 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
725 my @hexcode_segments=split('=',$line);
726 shift @hexcode_segments;
727 my @hexcodes;
728 foreach my $hexcode (@hexcode_segments) {
729 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
730 chomp($hexcode); # just in case...
731 my $char=chr (hex "0x$hexcode");
732 $line =~ s/=$hexcode/$char/g;
733 }
734 }
735 }
736 $text= join('', @lines);
737 return $text;
738}
739
740# decode base64 text. This is fairly slow (since it's interpreted perl rather
741# than compiled XS stuff like in the ::MIME modules, but this is more portable
742# for us at least).
743# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
744# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
745# from each encoded byte.
746
747
748sub base64_decode {
749 my $enc_text = shift;
750# A=>0, B=>1, ..., '+'=>62, '/'=>63
751# also '=' is used for padding at the end, but we remove it anyway.
752 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
753# map each MIME char into it's value, for more efficient lookup.
754 my %index;
755 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
756# remove all non-base64 chars. eval to get variable in transliteration...
757# also remove '=' - we'll assume (!!) that there are no errors in the encoding
758 eval "\$enc_text =~ tr|$mimechars||cd";
759 my $decoded="";
760 while (length ($enc_text)>3)
761 {
762 my $fourchars=substr($enc_text,0,4,"");
763 my @chars=(split '',$fourchars);
764 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
765 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
766 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
767 }
768# if there are any input chars left, there are either
769# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
770 my @chars=(split '',$enc_text);
771 if (length($enc_text)) {
772 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
773 }
774 if (length($enc_text)==3) {
775 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
776 }
777 return $decoded;
778}
779
780sub convert2unicode {
781 my $self = shift(@_);
782 my ($charset, $textref) = @_;
783
784 # first get our character encoding name in the right form.
785 $charset = "iso_8859_1" unless defined $charset;
786 $charset=~tr/A-Z/a-z/;
787 $charset=~s/\-/_/g;
788 $charset=~s/gb2312/gb/;
789 # assumes EUC-KR, not ISO-2022 !?
790 $charset=~s/ks_c_5601_1987/korean/;
791
792 # It appears that we can't always trust ascii text so we'll treat it
793 # as iso-8859-1 (letting characters above 0x80 through without
794 # converting them to utf-8 will result in invalid XML documents
795 # which can't be parsed at build time).
796 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
797
798 if ($charset eq "iso_8859_1") {
799 # test if the mailer lied, and it has win1252 chars in it...
800 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
801 if ($$textref =~ m/[\x80-\x9f]/) {
802 my $outhandle = $self->{'outhandle'};
803 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
804 print $outhandle "codepage 1252 detected.\n";
805 $charset = "windows_1252";
806 }
807 }
808 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
809}
810
811
812# Perl packages have to return true if they are run.
8131;
Note: See TracBrowser for help on using the repository browser.