source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3132

Last change on this file since 3132 was 3132, checked in by jrm21, 22 years ago

Try to determine the encoding used in the headers in case it is not ascii.
(rfc-822 says it should be).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.3 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63package EMAILPlug;
64
65use SplitPlug;
66
67use unicode;
68
69use sorttools;
70use util;
71
72
73# EMAILPlug is a sub-class of SplitPlug.
74
75sub BEGIN {
76 @ISA = ('SplitPlug');
77}
78
79# Create a new EMAILPlug object with which to parse a file.
80# Accomplished by creating a new BasPlug and using bless to
81# turn it into an EMAILPlug.
82
83sub new {
84 my ($class) = @_;
85 my $self = new BasPlug ("EMAILPlug", @_);
86 # this might not actually be true at read-time, but after processing
87 # it should all be utf8.
88 $self->{'input_encoding'}="utf8";
89 return bless $self, $class;
90}
91
92sub get_default_process_exp {
93 my $self = shift (@_);
94 # mbx/email for mailbox file format, \d+ for maildir (each message is
95 # in a separate file, with a unique number for filename)
96 # mozilla and IE will save individual mbx format files with a ".eml" ext.
97 return q@([\\/]\d+|\.(mbx|email|eml))$@;
98}
99
100# This plugin splits the mbox mail files at lines starting with From<sp>
101# It is supposed to be "\n\nFrom ", but this isn't always used.
102sub get_default_split_exp {
103 return q^\nFrom .*\n^;
104}
105
106
107# do plugin specific processing of doc_obj
108sub process {
109
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid mail file
115 # mbox message files start with "From "
116 # maildir messages usually start with Return-Path and Delivered-To
117 # mh is very similar to maildir
118 my $startoffile=substr($$textref,0,256);
119 if (($startoffile !~ /^(From )/) &&
120 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
121 return undef;
122 }
123
124
125 print $outhandle "EMAILPlug: processing $file\n"
126 if $self->{'verbosity'} > 1;
127
128 my $cursection = $doc_obj->get_top_section();
129
130 #
131 # Parse the document's text and extract metadata
132 #
133
134 # Protect backslashes
135 $$textref =~ s@\\@\\\\@g;
136
137 # Separate header from body of message
138 my $Headers = $$textref;
139 $Headers =~ s/\r?\n\r?\n(.*)$//s;
140 $$textref = $1;
141
142 # Unfold headers - see rfc822
143 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
144 # Extract basic metadata from header
145 my @headers = ("From", "To", "Subject", "Date");
146 my %raw;
147 foreach my $name (@headers) {
148 $raw{$name} = "No $name value";
149 }
150
151 # Get a default encoding for the header - RFC says should be ascii...
152 my $default_heading_encoding="iso_8859_1";
153
154 # We don't know what character set is the user's default...
155 # We could use textcat to guess... for now we'll look at mime content-type
156# if ($Headers =~ /([[:^ascii:]])/) {
157# }
158 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159 $default_header_encoding=$1;
160 $default_header_encoding =~ s@\-@_@g;
161 $default_header_encoding =~ tr/A-Z/a-z/;
162 }
163
164
165 # Examine each line of the headers
166 my ($line, $name, $value);
167 my @parts;
168 foreach $line (split(/\n/, $Headers)) {
169
170 # Ignore lines with no content or which begin with whitespace
171 next unless ($line =~ /:/);
172 next if ($line =~ /^\s/);
173
174 # Find out what metadata is on this line
175 @parts = split(/:/, $line);
176 $name = shift @parts;
177# uppercase the first character according to the current locale
178 $name=~s/(.+)/\u$1/;
179 next unless $name;
180 next unless ($raw{$name});
181
182 # Find the value of that metadata
183 $value = join(":", @parts);
184 $value =~ s/^\s+//;
185 $value =~ s/\s+$//;
186
187 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
188 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
189 my $original_value=$value;
190 my $encoded=$value;
191 $value="";
192 # this isn't quite right yet regarding spaces between encoded-texts
193 # (see examples, section 8. of rfc).
194 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
195 my ($charset, $encoding, $data)=($2,$3,$4);
196 my ($decoded_data);
197 $value.="$1"; # any leading chars
198 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
199 chomp $data;
200 $encoding =~ tr/BQ/bq/;
201 if ($encoding eq "q") { # quoted printable
202 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
203 $decoded_data=qp_decode($data);
204 } else { # base 64
205 $decoded_data=base64_decode($data);
206 }
207 $self->convert2unicode($charset, \$decoded_data);
208 $value .= $decoded_data;
209 } # end of while loop
210
211 # get any trailing characters
212 $self->convert2unicode($default_header_encoding, \$encoded);
213 $value.=$encoded;
214
215 if ($value =~ /^\s*$/) { # we couldn't extract anything...
216 $self->convert2unicode($default_header_encoding,
217 \$original_value);
218 $value=original_value;
219 }
220 } else { # end of if =?...?=
221 $self->convert2unicode($default_header_encoding, \$value);
222 }
223
224
225
226 # Store the metadata
227 $raw{$name} = $value;
228 }
229
230 # Extract the name and e-mail address from the From metadata
231 $frommeta = $raw{"From"};
232 my $fromnamemeta;
233 my $fromaddrmeta;
234
235 $frommeta =~ s/\s*$//; # Remove trailing space, if any
236
237 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
238 $fromnamemeta=$1;
239 $fromaddrmeta=$2;
240 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
241 $fromnamemeta=$2;
242 $fromaddrmeta=$1;
243 }
244 if (!defined($fromaddrmeta)) {
245 $fromaddrmeta=$frommeta;
246 }
247 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
248 # minor attempt to prevent spam-bots from harvesting addresses...
249 $fromaddrmeta=~s/@/&#64;/;
250 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
251
252 if (defined($fromnamemeta)) {
253 $fromnamemeta =~ s/\"//g;
254 }
255 else {
256 $fromnamemeta = $fromaddrmeta;
257 }
258 # if name is an address
259 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
260 $fromnamemeta=~s/@/&#64\;/;
261 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
262
263 $raw{"From"}=$frommeta;
264
265 # Process Date information
266 if ($raw{"Date"} !~ /No Date/) {
267 $raw{"DateText"} = $raw{"Date"};
268
269 # Convert the date text to internal date format
270 $value = $raw{"Date"};
271 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
272 if ($year < 100) { $year += 1900; }
273 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
274
275 } else {
276 # We have not extracted a date
277 $raw{"DateText"} = "Unknown.";
278 $raw{"Date"} = "19000000";
279 }
280
281 # Add extracted metadata to document object
282 foreach my $name (keys %raw) {
283 $value = $raw{$name};
284 if ($value) {
285 # assume subject, etc headers have no special HTML meaning.
286 $value = &text_into_html($value);
287 # escape [] so it isn't re-interpreted as metadata
288 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
289 } else {
290 $value = "No $name field";
291 }
292 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
293 }
294
295 my $mimetype="text/plain";
296 my $mimeinfo="";
297 my $charset = $default_header_encoding;
298 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
299 # more than one parameter given to Content-type.
300 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
301 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
302 {
303 $mimetype=$1;
304 $mimetype =~ tr/[A-Z]/[a-z]/;
305
306 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
307 $mimetype = "text/plain";
308 }
309
310 $mimeinfo=$2;
311 if (!defined $mimeinfo) {
312 $mimeinfo="";
313 } else { # strip leading and trailing stuff
314 $mimeinfo =~ s/^\;\s*//;
315 $mimeinfo =~ s/\s*$//;
316 }
317 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
318 $charset = $1;
319 }
320 }
321
322 my $transfer_encoding="7bit";
323 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
324 $transfer_encoding=$1;
325 }
326 if ($mimetype eq "text/html") {
327 $$textref= $self->text_from_part("$Headers\n$$textref");
328 } elsif ($mimetype ne "text/plain") {
329 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
330 $outhandle);
331 } elsif ($transfer_encoding =~ /quoted\-printable/) {
332 $$textref=qp_decode($$textref);
333 $self->convert2unicode($charset, $textref);
334 } elsif ($transfer_encoding =~ /base64/) {
335 $$textref=base64_decode($$textref);
336 $self->convert2unicode($charset, $textref);
337 } else {
338 $self->convert2unicode($charset, $textref);
339 }
340
341
342 # Add "All headers" metadata
343 $Headers = &text_into_html($Headers);
344
345 $Headers = "No headers" unless ($Headers =~ /\w/);
346 $Headers =~ s/@/&#64\;/g;
347 # escape [] so it isn't re-interpreted as metadata
348 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
349
350 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
351
352
353 # Add Title metadata
354 my $Title = text_into_html($raw{'Subject'});
355 $Title .= "<br>From: " . text_into_html($raw{'From'});
356 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
357 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
358
359 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
360
361
362 # Add text to document object
363 if ($mimetype eq "text/plain") {
364 $$textref = &text_into_html($$textref);
365 }
366 $$textref = "No message" unless ($$textref =~ /\w/);
367 $doc_obj->add_utf8_text($cursection, $$textref);
368
369 return 1;
370}
371
372
373# Convert a text string into HTML.
374#
375# The HTML is going to be inserted into a GML file, so
376# we have to be careful not to use symbols like ">",
377# which ocurs frequently in email messages (and use
378# &gt instead.
379#
380# This function also turns links and email addresses into hyperlinks,
381# and replaces carriage returns with <BR> tags (and multiple carriage
382# returns with <P> tags).
383
384
385sub text_into_html {
386 my ($text) = @_;
387
388 # Convert problem characters into HTML symbols
389 $text =~ s/&/&amp;/g;
390 $text =~ s/</&lt;/g;
391 $text =~ s/>/&gt;/g;
392 $text =~ s/\"/&quot;/g;
393
394 # convert email addresses and URIs into links
395# don't markup email addresses for now
396# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
397
398 # try to munge email addresses a little bit...
399 $text =~ s/@/&#64;/;
400 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
401 # assume URI doesn't finish with a '.'
402 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
403
404
405 # Clean up whitespace and convert \n charaters to <BR> or <P>
406 $text =~ s/ +/ /g;
407 $text =~ s/\s*$//g;
408 $text =~ s/^\s*//g;
409 $text =~ s/\n/\n<br>/g;
410 $text =~ s/<br>\s*<br>/<p>/gi;
411
412 return $text;
413}
414
415
416
417
418#Process a MIME message.
419# the textref we are given DOES NOT include the header.
420sub text_from_mime_message {
421 my $self = shift(@_);
422 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
423
424 # Check for multiparts - $mimeinfo will be a boundary
425 if ($mimetype =~ /multipart/) {
426 $boundary="";
427 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
428 $boundary=$1;
429 if ($boundary =~ m@^\"@) {
430 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
431 }
432 } else {
433 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
434 }
435 # parts start with "--$boundary"
436 # message ends with "--$boundary--"
437 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
438 # that perl might want to interpolate. Also allows spaces...
439 $boundary=~s/\\/\\\\/g;
440 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
441 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
442 # remove first "part" and last "part" (final --)
443 shift @message_parts;
444 my $last=pop @message_parts;
445 # if our boundaries are a bit dodgy and we only found 1 part...
446 if (!defined($last)) {$last="";}
447 # make sure it is only -- and whitespace
448 if ($last !~ /^\-\-\s*$/ms) {
449 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
450 }
451 foreach my $message_part (@message_parts) {
452 # remove the leading newline left from split.
453 $message_part=~s/^\r?\n//;
454 }
455 if ($mimetype eq "multipart/alternative") {
456 # check for an HTML version first, then TEXT, otherwise use first.
457 my $part_text="";
458 foreach my $message_part (@message_parts) {
459 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
460 {
461 # Use the HTML version
462 $part_text= $self->text_from_part($message_part);
463 $mimetype="text/html";
464 last;
465 }
466 }
467 if ($part_text eq "") { # try getting a text part instead
468 foreach my $message_part (@message_parts) {
469 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
470 {
471 # Use the plain version
472 $part_text= $self->text_from_part($message_part);
473 if ($part_text =~/[^\s]/) {
474 $part_text="<pre>".$part_text."</pre>";
475 }
476 $mimetype="text/plain";
477 last;
478 }
479 }
480 }
481 if ($part_text eq "") { # use first part
482 $part_text= $self->text_from_part(shift @message_parts);
483 }
484 if ($part_text eq "") { # we couldn't get anything!!!
485 # or it was an empty message...
486 # do nothing...
487 print $outhandle "EMAILPlug: no text - empty body?\n";
488 } else {
489 $text=$part_text;
490 }
491 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
492 $text="";
493 foreach my $message_part (@message_parts) {
494 my $part_header=$message_part;
495 my $part_body;
496 if ($message_part=~ /^\s*\n/) {
497 # no header... use defaults
498 $part_body=$message_part;
499 $part_header="Content-type: text/plain; charset=us-ascii";
500 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
501 $part_body=$1;
502 } else {
503 # something's gone wrong...
504 $part_header="";
505 $part_body=$message_part;
506 }
507
508 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
509 my $part_content_type="";
510 my $part_content_info="";
511 if ($mimetype eq "multipart/digest") {
512 # default type - RTFRFC!!
513 $part_content_type="message/rfc822";
514 }
515 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
516 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
517 $part_content_info=$2;
518 }
519 my $filename="";
520 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
521 $filename=$1;
522 }
523
524 # disposition - either inline or attachment.
525 # NOT CURRENTLY USED - we display all text types instead...
526 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
527
528 # add <<attachment>> to each part except the first...
529 if ($text ne "") {
530 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
531 # add part info header
532 $text.="<br>Type: $part_content_type<br>\n";
533 if ($filename ne "") {
534 $text.="Filename: $filename\n";
535 }
536 $text.="</strong></p>\n";
537 }
538
539 if ($part_content_type =~ m@text/@)
540 {
541 my $part_text= $self->text_from_part($message_part);
542 if ($part_content_type !~ m@text/(ht|x)ml@) {
543 $part_text=text_into_html($part_text);
544 }
545 if ($part_text eq "") {
546 $part_text='&lt;&lt;empty message&gt;&gt;';
547 }
548 $text.=$part_text;
549 } elsif ($part_content_type =~ m@message/rfc822@) {
550 # This is a forwarded message
551 my $message_part_headers=$part_body;
552 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
553 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
554 my $message_part_body=$1;
555
556 my $rfc822_formatted_body=""; # put result in here
557 if ($message_part_headers =~
558 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
559 {
560 # The message header uses MIME flags
561 my $message_content_type=$1;
562 my $message_content_info=$2;
563 if (!defined($message_content_info)) {
564 $message_content_info="";
565 }
566 $message_content_type =~ tr/A-Z/a-z/;
567 if ($message_content_type =~ /multipart/) {
568 $rfc822_formatted_body=
569 $self->text_from_mime_message($message_content_type,
570 $message_content_info,
571 $message_part_body,
572 $outhandle);
573 } else {
574 $message_part_body= $self->text_from_part($part_body);
575 $rfc822_formatted_body=text_into_html($message_part_body);
576 }
577 } else {
578 # message doesn't use MIME flags
579 $rfc822_formatted_body=text_into_html($message_part_body);
580 }
581 # Add the returned text to the output
582 # don't put all the headers...
583 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
584 $text.=text_into_html($message_part_headers);
585 $text.="<p>\n";
586 $text.=$rfc822_formatted_body;
587 # end of message/rfc822
588 } elsif ($part_content_type =~ /multipart/) {
589 # recurse again
590
591 $tmptext= $self->text_from_mime_message($part_content_type,
592 $part_content_info,
593 $part_body,
594 $outhandle);
595 $text.=$tmptext;
596 } elsif ($text eq "") {
597 # we can't do anything with this part, but if it's the first
598 # part then make sure it is mentioned..
599
600 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
601 # add part info header
602 $text.="<br>Type: $part_content_type<br>\n";
603 if ($filename ne "") {
604 $text.="Filename: $filename\n";
605 }
606 $text.="</strong></p>\n";
607 }
608 } # foreach message part.
609 } else {
610 # we can't handle this multipart type (not mixed or alternative)
611 # the RFC also mentions "parallel".
612 }
613 } # end of ($mimetype !~ multipart)
614 elsif ($mimetype =~ m@message/rfc822@) {
615 my $msg_header = $text;
616 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
617 $text = $1;
618
619 if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
620 {
621 $mimetype=$1;
622 $mimetype =~ tr/[A-Z]/[a-z]/;
623 $mimeinfo=$2;
624 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
625 $charset = $1;
626 }
627 my $msg_text;
628 if ($mimetype =~ m@multipart/@) {
629 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
630 $text, $outhandle);
631 } else {$msg_text=$text;}
632
633 my $brief_header="";
634 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
635 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
636 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
637 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
638 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
639 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
640 $text.= "<table><tr><td width=\"5%\"> </td>\n";
641 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
642 . "</td></tr></table>";
643 }
644 } else {
645 # we don't do any processing of the content.
646 }
647
648 return $text;
649}
650
651
652
653
654
655
656# Process a MIME part. Return "" if we can't decode it.
657sub text_from_part {
658 my $self = shift;
659 my $text=shift || '';
660 my $part_header=$text;
661 # check for empty part header (leading blank line)
662 if ($text =~ /^\s*\r?\n/) {
663 $part_header="Content-type: text/plain; charset=us-ascii";
664 } else {
665 $part_header =~ s/\r?\n\r?\n(.*)$//s;
666 $text=$1; if (!defined($text)) {$text="";}
667 }
668 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
669 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
670 my $type=$1;
671 my $charset=$2;
672 if (!defined($type)) {$type="";}
673 if (!defined($charset)) {$charset="ascii";}
674 my $encoding="";
675 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
676 $encoding=$1; $encoding=~tr/A-Z/a-z/;
677 }
678 # Content-Transfer-Encoding is per-part
679 if ($encoding ne "") {
680 if ($encoding =~ /quoted\-printable/) {
681 $text=qp_decode($text);
682 } elsif ($encoding =~ /base64/) {
683 $text=base64_decode($text);
684 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
685 # rfc2045 also allows binary, which we ignore (for now).
686 # maybe this shouldn't go to stderr, but anyway...
687 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
688 return "";
689 }
690 }
691 if ($type eq "text/html") {
692 # only get stuff between <body> tags, or <html> tags.
693 $text =~ s@^.*<html[^>]*>@@is;
694 $text =~ s@</html>.*$@@is;
695 $text =~ s/^.*?<body[^>]*>//si;
696 $text =~ s/<\/body>.*$//si;
697 }
698 elsif ($type eq "text/xml") {
699 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
700 $text="<pre>\n$text\n</pre>\n";
701 }
702 # convert to unicode
703 $self->convert2unicode($charset, \$text);
704 return $text;
705}
706
707
708# decode quoted-printable text
709sub qp_decode {
710 my $text=shift;
711
712 my @lines=split('\n', $text);
713
714 # if a line ends with "=\s*", it is a soft line break, otherwise
715 # keep in any newline characters.
716 foreach my $line (@lines) {
717 if ($line !~ s/=\s*$//) {$line.="\n";}
718
719 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
720 my @hexcode_segments=split('=',$line);
721 shift @hexcode_segments;
722 my @hexcodes;
723 foreach my $hexcode (@hexcode_segments) {
724 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
725 chomp($hexcode); # just in case...
726 my $char=chr (hex "0x$hexcode");
727 $line =~ s/=$hexcode/$char/g;
728 }
729 }
730 }
731 $text= join('', @lines);
732 return $text;
733}
734
735# decode base64 text. This is fairly slow (since it's interpreted perl rather
736# than compiled XS stuff like in the ::MIME modules, but this is more portable
737# for us at least).
738# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
739# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
740# from each encoded byte.
741
742
743sub base64_decode {
744 my $enc_text = shift;
745# A=>0, B=>1, ..., '+'=>62, '/'=>63
746# also '=' is used for padding at the end, but we remove it anyway.
747 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
748# map each MIME char into it's value, for more efficient lookup.
749 my %index;
750 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
751# remove all non-base64 chars. eval to get variable in transliteration...
752# also remove '=' - we'll assume (!!) that there are no errors in the encoding
753 eval "\$enc_text =~ tr|$mimechars||cd";
754 my $decoded="";
755 while (length ($enc_text)>3)
756 {
757 my $fourchars=substr($enc_text,0,4,"");
758 my @chars=(split '',$fourchars);
759 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
760 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
761 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
762 }
763# if there are any input chars left, there are either
764# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
765 my @chars=(split '',$enc_text);
766 if (length($enc_text)) {
767 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
768 }
769 if (length($enc_text)==3) {
770 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
771 }
772 return $decoded;
773}
774
775sub convert2unicode {
776 my $self = shift(@_);
777 my ($charset, $textref) = @_;
778
779 # first get our character encoding name in the right form.
780 $charset = "iso_8859_1" unless defined $charset;
781 $charset=~tr/A-Z/a-z/;
782 $charset=~s/\-/_/g;
783 $charset=~s/gb2312/gb/;
784 # assumes EUC-KR, not ISO-2022 !?
785 $charset=~s/ks_c_5601_1987/korean/;
786
787 # It appears that we can't always trust ascii text so we'll treat it
788 # as iso-8859-1 (letting characters above 0x80 through without
789 # converting them to utf-8 will result in invalid XML documents
790 # which can't be parsed at build time).
791 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
792
793 if ($charset eq "iso_8859_1") {
794 # test if the mailer lied, and it has win1252 chars in it...
795 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
796 if ($$textref =~ m/[\x80-\x9f]/) {
797 my $outhandle = $self->{'outhandle'};
798 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
799 print $outhandle "codepage 1252 detected.\n";
800 $charset = "windows_1252";
801 }
802 }
803 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
804}
805
806
807# Perl packages have to return true if they are run.
8081;
Note: See TracBrowser for help on using the repository browser.