source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3134

Last change on this file since 3134 was 3134, checked in by jrm21, 20 years ago

1) Convert headers to detected charset if possible.

2) Convert header field names to lower case (except first char) - we weren't
matching TO, FROM, SUBJECT, etc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.4 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter (gwp@cs.waikato.ac.nz)
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63package EMAILPlug;
64
65use SplitPlug;
66
67use unicode;
68
69use sorttools;
70use util;
71
72
73# EMAILPlug is a sub-class of SplitPlug.
74
75sub BEGIN {
76 @ISA = ('SplitPlug');
77}
78
79# Create a new EMAILPlug object with which to parse a file.
80# Accomplished by creating a new BasPlug and using bless to
81# turn it into an EMAILPlug.
82
83sub new {
84 my ($class) = @_;
85 my $self = new BasPlug ("EMAILPlug", @_);
86 # this might not actually be true at read-time, but after processing
87 # it should all be utf8.
88 $self->{'input_encoding'}="utf8";
89 return bless $self, $class;
90}
91
92sub get_default_process_exp {
93 my $self = shift (@_);
94 # mbx/email for mailbox file format, \d+ for maildir (each message is
95 # in a separate file, with a unique number for filename)
96 # mozilla and IE will save individual mbx format files with a ".eml" ext.
97 return q@([\\/]\d+|\.(mbx|email|eml))$@;
98}
99
100# This plugin splits the mbox mail files at lines starting with From<sp>
101# It is supposed to be "\n\nFrom ", but this isn't always used.
102sub get_default_split_exp {
103 return q^\nFrom .*\n^;
104}
105
106
107# do plugin specific processing of doc_obj
108sub process {
109
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid mail file
115 # mbox message files start with "From "
116 # maildir messages usually start with Return-Path and Delivered-To
117 # mh is very similar to maildir
118 my $startoffile=substr($$textref,0,256);
119 if (($startoffile !~ /^(From )/) &&
120 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
121 return undef;
122 }
123
124
125 print $outhandle "EMAILPlug: processing $file\n"
126 if $self->{'verbosity'} > 1;
127
128 my $cursection = $doc_obj->get_top_section();
129
130 #
131 # Parse the document's text and extract metadata
132 #
133
134 # Protect backslashes
135 $$textref =~ s@\\@\\\\@g;
136
137 # Separate header from body of message
138 my $Headers = $$textref;
139 $Headers =~ s/\r?\n\r?\n(.*)$//s;
140 $$textref = $1;
141
142 # Unfold headers - see rfc822
143 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
144 # Extract basic metadata from header
145 my @headers = ("From", "To", "Subject", "Date");
146 my %raw;
147 foreach my $name (@headers) {
148 $raw{$name} = "No $name value";
149 }
150
151 # Get a default encoding for the header - RFC says should be ascii...
152 my $default_heading_encoding="iso_8859_1";
153
154 # We don't know what character set is the user's default...
155 # We could use textcat to guess... for now we'll look at mime content-type
156# if ($Headers =~ /([[:^ascii:]])/) {
157# }
158 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159 $default_header_encoding=$1;
160 $default_header_encoding =~ s@\-@_@g;
161 $default_header_encoding =~ tr/A-Z/a-z/;
162 }
163
164
165 # Examine each line of the headers
166 my ($line, $name, $value);
167 my @parts;
168 foreach $line (split(/\n/, $Headers)) {
169
170 # Ignore lines with no content or which begin with whitespace
171 next unless ($line =~ /:/);
172 next if ($line =~ /^\s/);
173
174 # Find out what metadata is on this line
175 @parts = split(/:/, $line);
176 $name = shift @parts;
177# get fieldname in canonical form - first cap, then lower case.
178 $name =~ tr/A-Z/a-z/;
179# uppercase the first character according to the current locale
180 $name=~s/(.+)/\u$1/;
181 next unless $name;
182 next unless ($raw{$name});
183
184 # Find the value of that metadata
185 $value = join(":", @parts);
186 $value =~ s/^\s+//;
187 $value =~ s/\s+$//;
188
189 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
190 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
191 my $original_value=$value;
192 my $encoded=$value;
193 $value="";
194 # this isn't quite right yet regarding spaces between encoded-texts
195 # (see examples, section 8. of rfc).
196 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
197 my ($charset, $encoding, $data)=($2,$3,$4);
198 my ($decoded_data);
199 $value.="$1"; # any leading chars
200 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
201 chomp $data;
202 $encoding =~ tr/BQ/bq/;
203 if ($encoding eq "q") { # quoted printable
204 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
205 $decoded_data=qp_decode($data);
206 } else { # base 64
207 $decoded_data=base64_decode($data);
208 }
209 $self->convert2unicode($charset, \$decoded_data);
210 $value .= $decoded_data;
211 } # end of while loop
212
213 # get any trailing characters
214 $self->convert2unicode($default_header_encoding, \$encoded);
215 $value.=$encoded;
216
217 if ($value =~ /^\s*$/) { # we couldn't extract anything...
218 $self->convert2unicode($default_header_encoding,
219 \$original_value);
220 $value=original_value;
221 }
222 } else { # end of if =?...?=
223 $self->convert2unicode($default_header_encoding, \$value);
224 }
225
226
227
228 # Store the metadata
229 $raw{$name} = $value;
230 }
231
232 # Extract the name and e-mail address from the From metadata
233 $frommeta = $raw{"From"};
234 my $fromnamemeta;
235 my $fromaddrmeta;
236
237 $frommeta =~ s/\s*$//; # Remove trailing space, if any
238
239 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
240 $fromnamemeta=$1;
241 $fromaddrmeta=$2;
242 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
243 $fromnamemeta=$2;
244 $fromaddrmeta=$1;
245 }
246 if (!defined($fromaddrmeta)) {
247 $fromaddrmeta=$frommeta;
248 }
249 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
250 # minor attempt to prevent spam-bots from harvesting addresses...
251 $fromaddrmeta=~s/@/&#64;/;
252 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
253
254 if (defined($fromnamemeta)) {
255 $fromnamemeta =~ s/\"//g;
256 }
257 else {
258 $fromnamemeta = $fromaddrmeta;
259 }
260 # if name is an address
261 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
262 $fromnamemeta=~s/@/&#64\;/;
263 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
264
265 $raw{"From"}=$frommeta;
266
267 # Process Date information
268 if ($raw{"Date"} !~ /No Date/) {
269 $raw{"DateText"} = $raw{"Date"};
270
271 # Convert the date text to internal date format
272 $value = $raw{"Date"};
273 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
274 if ($year < 100) { $year += 1900; }
275 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
276
277 } else {
278 # We have not extracted a date
279 $raw{"DateText"} = "Unknown.";
280 $raw{"Date"} = "19000000";
281 }
282
283 # Add extracted metadata to document object
284 foreach my $name (keys %raw) {
285 $value = $raw{$name};
286 if ($value) {
287 # assume subject, etc headers have no special HTML meaning.
288 $value = &text_into_html($value);
289 # escape [] so it isn't re-interpreted as metadata
290 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
291 } else {
292 $value = "No $name field";
293 }
294 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
295 }
296
297 my $mimetype="text/plain";
298 my $mimeinfo="";
299 my $charset = $default_header_encoding;
300 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
301 # more than one parameter given to Content-type.
302 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
303 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
304 {
305 $mimetype=$1;
306 $mimetype =~ tr/[A-Z]/[a-z]/;
307
308 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
309 $mimetype = "text/plain";
310 }
311
312 $mimeinfo=$2;
313 if (!defined $mimeinfo) {
314 $mimeinfo="";
315 } else { # strip leading and trailing stuff
316 $mimeinfo =~ s/^\;\s*//;
317 $mimeinfo =~ s/\s*$//;
318 }
319 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
320 $charset = $1;
321 }
322 }
323
324 my $transfer_encoding="7bit";
325 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
326 $transfer_encoding=$1;
327 }
328 if ($mimetype eq "text/html") {
329 $$textref= $self->text_from_part("$Headers\n$$textref");
330 } elsif ($mimetype ne "text/plain") {
331 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
332 $outhandle);
333 } elsif ($transfer_encoding =~ /quoted\-printable/) {
334 $$textref=qp_decode($$textref);
335 $self->convert2unicode($charset, $textref);
336 } elsif ($transfer_encoding =~ /base64/) {
337 $$textref=base64_decode($$textref);
338 $self->convert2unicode($charset, $textref);
339 } else {
340 $self->convert2unicode($charset, $textref);
341 }
342
343
344 # Add "All headers" metadata
345 $Headers = &text_into_html($Headers);
346
347 $Headers = "No headers" unless ($Headers =~ /\w/);
348 $Headers =~ s/@/&#64\;/g;
349 # escape [] so it isn't re-interpreted as metadata
350 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
351 $self->convert2unicode($charset, \$Headers);
352 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
353
354
355 # Add Title metadata
356 my $Title = text_into_html($raw{'Subject'});
357 $Title .= "<br>From: " . text_into_html($raw{'From'});
358 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
359 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
360
361 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
362
363
364 # Add text to document object
365 if ($mimetype eq "text/plain") {
366 $$textref = &text_into_html($$textref);
367 }
368 $$textref = "No message" unless ($$textref =~ /\w/);
369 $doc_obj->add_utf8_text($cursection, $$textref);
370
371 return 1;
372}
373
374
375# Convert a text string into HTML.
376#
377# The HTML is going to be inserted into a GML file, so
378# we have to be careful not to use symbols like ">",
379# which ocurs frequently in email messages (and use
380# &gt instead.
381#
382# This function also turns links and email addresses into hyperlinks,
383# and replaces carriage returns with <BR> tags (and multiple carriage
384# returns with <P> tags).
385
386
387sub text_into_html {
388 my ($text) = @_;
389
390 # Convert problem characters into HTML symbols
391 $text =~ s/&/&amp;/g;
392 $text =~ s/</&lt;/g;
393 $text =~ s/>/&gt;/g;
394 $text =~ s/\"/&quot;/g;
395
396 # convert email addresses and URIs into links
397# don't markup email addresses for now
398# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
399
400 # try to munge email addresses a little bit...
401 $text =~ s/@/&#64;/;
402 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
403 # assume URI doesn't finish with a '.'
404 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
405
406
407 # Clean up whitespace and convert \n charaters to <BR> or <P>
408 $text =~ s/ +/ /g;
409 $text =~ s/\s*$//g;
410 $text =~ s/^\s*//g;
411 $text =~ s/\n/\n<br>/g;
412 $text =~ s/<br>\s*<br>/<p>/gi;
413
414 return $text;
415}
416
417
418
419
420#Process a MIME message.
421# the textref we are given DOES NOT include the header.
422sub text_from_mime_message {
423 my $self = shift(@_);
424 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
425
426 # Check for multiparts - $mimeinfo will be a boundary
427 if ($mimetype =~ /multipart/) {
428 $boundary="";
429 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
430 $boundary=$1;
431 if ($boundary =~ m@^\"@) {
432 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
433 }
434 } else {
435 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
436 }
437 # parts start with "--$boundary"
438 # message ends with "--$boundary--"
439 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
440 # that perl might want to interpolate. Also allows spaces...
441 $boundary=~s/\\/\\\\/g;
442 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
443 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
444 # remove first "part" and last "part" (final --)
445 shift @message_parts;
446 my $last=pop @message_parts;
447 # if our boundaries are a bit dodgy and we only found 1 part...
448 if (!defined($last)) {$last="";}
449 # make sure it is only -- and whitespace
450 if ($last !~ /^\-\-\s*$/ms) {
451 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
452 }
453 foreach my $message_part (@message_parts) {
454 # remove the leading newline left from split.
455 $message_part=~s/^\r?\n//;
456 }
457 if ($mimetype eq "multipart/alternative") {
458 # check for an HTML version first, then TEXT, otherwise use first.
459 my $part_text="";
460 foreach my $message_part (@message_parts) {
461 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
462 {
463 # Use the HTML version
464 $part_text= $self->text_from_part($message_part);
465 $mimetype="text/html";
466 last;
467 }
468 }
469 if ($part_text eq "") { # try getting a text part instead
470 foreach my $message_part (@message_parts) {
471 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
472 {
473 # Use the plain version
474 $part_text= $self->text_from_part($message_part);
475 if ($part_text =~/[^\s]/) {
476 $part_text="<pre>".$part_text."</pre>";
477 }
478 $mimetype="text/plain";
479 last;
480 }
481 }
482 }
483 if ($part_text eq "") { # use first part
484 $part_text= $self->text_from_part(shift @message_parts);
485 }
486 if ($part_text eq "") { # we couldn't get anything!!!
487 # or it was an empty message...
488 # do nothing...
489 print $outhandle "EMAILPlug: no text - empty body?\n";
490 } else {
491 $text=$part_text;
492 }
493 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
494 $text="";
495 foreach my $message_part (@message_parts) {
496 my $part_header=$message_part;
497 my $part_body;
498 if ($message_part=~ /^\s*\n/) {
499 # no header... use defaults
500 $part_body=$message_part;
501 $part_header="Content-type: text/plain; charset=us-ascii";
502 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
503 $part_body=$1;
504 } else {
505 # something's gone wrong...
506 $part_header="";
507 $part_body=$message_part;
508 }
509
510 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
511 my $part_content_type="";
512 my $part_content_info="";
513 if ($mimetype eq "multipart/digest") {
514 # default type - RTFRFC!!
515 $part_content_type="message/rfc822";
516 }
517 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
518 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
519 $part_content_info=$2;
520 }
521 my $filename="";
522 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
523 $filename=$1;
524 }
525
526 # disposition - either inline or attachment.
527 # NOT CURRENTLY USED - we display all text types instead...
528 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
529
530 # add <<attachment>> to each part except the first...
531 if ($text ne "") {
532 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
533 # add part info header
534 $text.="<br>Type: $part_content_type<br>\n";
535 if ($filename ne "") {
536 $text.="Filename: $filename\n";
537 }
538 $text.="</strong></p>\n";
539 }
540
541 if ($part_content_type =~ m@text/@)
542 {
543 my $part_text= $self->text_from_part($message_part);
544 if ($part_content_type !~ m@text/(ht|x)ml@) {
545 $part_text=text_into_html($part_text);
546 }
547 if ($part_text eq "") {
548 $part_text='&lt;&lt;empty message&gt;&gt;';
549 }
550 $text.=$part_text;
551 } elsif ($part_content_type =~ m@message/rfc822@) {
552 # This is a forwarded message
553 my $message_part_headers=$part_body;
554 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
555 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
556 my $message_part_body=$1;
557
558 my $rfc822_formatted_body=""; # put result in here
559 if ($message_part_headers =~
560 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
561 {
562 # The message header uses MIME flags
563 my $message_content_type=$1;
564 my $message_content_info=$2;
565 if (!defined($message_content_info)) {
566 $message_content_info="";
567 }
568 $message_content_type =~ tr/A-Z/a-z/;
569 if ($message_content_type =~ /multipart/) {
570 $rfc822_formatted_body=
571 $self->text_from_mime_message($message_content_type,
572 $message_content_info,
573 $message_part_body,
574 $outhandle);
575 } else {
576 $message_part_body= $self->text_from_part($part_body);
577 $rfc822_formatted_body=text_into_html($message_part_body);
578 }
579 } else {
580 # message doesn't use MIME flags
581 $rfc822_formatted_body=text_into_html($message_part_body);
582 }
583 # Add the returned text to the output
584 # don't put all the headers...
585 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
586 $text.=text_into_html($message_part_headers);
587 $text.="<p>\n";
588 $text.=$rfc822_formatted_body;
589 # end of message/rfc822
590 } elsif ($part_content_type =~ /multipart/) {
591 # recurse again
592
593 $tmptext= $self->text_from_mime_message($part_content_type,
594 $part_content_info,
595 $part_body,
596 $outhandle);
597 $text.=$tmptext;
598 } elsif ($text eq "") {
599 # we can't do anything with this part, but if it's the first
600 # part then make sure it is mentioned..
601
602 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
603 # add part info header
604 $text.="<br>Type: $part_content_type<br>\n";
605 if ($filename ne "") {
606 $text.="Filename: $filename\n";
607 }
608 $text.="</strong></p>\n";
609 }
610 } # foreach message part.
611 } else {
612 # we can't handle this multipart type (not mixed or alternative)
613 # the RFC also mentions "parallel".
614 }
615 } # end of ($mimetype !~ multipart)
616 elsif ($mimetype =~ m@message/rfc822@) {
617 my $msg_header = $text;
618 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
619 $text = $1;
620
621 if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
622 {
623 $mimetype=$1;
624 $mimetype =~ tr/[A-Z]/[a-z]/;
625 $mimeinfo=$2;
626 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
627 $charset = $1;
628 }
629 my $msg_text;
630 if ($mimetype =~ m@multipart/@) {
631 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
632 $text, $outhandle);
633 } else {$msg_text=$text;}
634
635 my $brief_header="";
636 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
637 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
638 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
639 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
640 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
641 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
642 $text.= "<table><tr><td width=\"5%\"> </td>\n";
643 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
644 . "</td></tr></table>";
645 }
646 } else {
647 # we don't do any processing of the content.
648 }
649
650 return $text;
651}
652
653
654
655
656
657
658# Process a MIME part. Return "" if we can't decode it.
659sub text_from_part {
660 my $self = shift;
661 my $text=shift || '';
662 my $part_header=$text;
663 # check for empty part header (leading blank line)
664 if ($text =~ /^\s*\r?\n/) {
665 $part_header="Content-type: text/plain; charset=us-ascii";
666 } else {
667 $part_header =~ s/\r?\n\r?\n(.*)$//s;
668 $text=$1; if (!defined($text)) {$text="";}
669 }
670 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
671 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
672 my $type=$1;
673 my $charset=$2;
674 if (!defined($type)) {$type="";}
675 if (!defined($charset)) {$charset="ascii";}
676 my $encoding="";
677 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
678 $encoding=$1; $encoding=~tr/A-Z/a-z/;
679 }
680 # Content-Transfer-Encoding is per-part
681 if ($encoding ne "") {
682 if ($encoding =~ /quoted\-printable/) {
683 $text=qp_decode($text);
684 } elsif ($encoding =~ /base64/) {
685 $text=base64_decode($text);
686 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
687 # rfc2045 also allows binary, which we ignore (for now).
688 # maybe this shouldn't go to stderr, but anyway...
689 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
690 return "";
691 }
692 }
693 if ($type eq "text/html") {
694 # only get stuff between <body> tags, or <html> tags.
695 $text =~ s@^.*<html[^>]*>@@is;
696 $text =~ s@</html>.*$@@is;
697 $text =~ s/^.*?<body[^>]*>//si;
698 $text =~ s/<\/body>.*$//si;
699 }
700 elsif ($type eq "text/xml") {
701 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
702 $text="<pre>\n$text\n</pre>\n";
703 }
704 # convert to unicode
705 $self->convert2unicode($charset, \$text);
706 return $text;
707}
708
709
710# decode quoted-printable text
711sub qp_decode {
712 my $text=shift;
713
714 my @lines=split('\n', $text);
715
716 # if a line ends with "=\s*", it is a soft line break, otherwise
717 # keep in any newline characters.
718 foreach my $line (@lines) {
719 if ($line !~ s/=\s*$//) {$line.="\n";}
720
721 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
722 my @hexcode_segments=split('=',$line);
723 shift @hexcode_segments;
724 my @hexcodes;
725 foreach my $hexcode (@hexcode_segments) {
726 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
727 chomp($hexcode); # just in case...
728 my $char=chr (hex "0x$hexcode");
729 $line =~ s/=$hexcode/$char/g;
730 }
731 }
732 }
733 $text= join('', @lines);
734 return $text;
735}
736
737# decode base64 text. This is fairly slow (since it's interpreted perl rather
738# than compiled XS stuff like in the ::MIME modules, but this is more portable
739# for us at least).
740# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
741# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
742# from each encoded byte.
743
744
745sub base64_decode {
746 my $enc_text = shift;
747# A=>0, B=>1, ..., '+'=>62, '/'=>63
748# also '=' is used for padding at the end, but we remove it anyway.
749 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
750# map each MIME char into it's value, for more efficient lookup.
751 my %index;
752 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
753# remove all non-base64 chars. eval to get variable in transliteration...
754# also remove '=' - we'll assume (!!) that there are no errors in the encoding
755 eval "\$enc_text =~ tr|$mimechars||cd";
756 my $decoded="";
757 while (length ($enc_text)>3)
758 {
759 my $fourchars=substr($enc_text,0,4,"");
760 my @chars=(split '',$fourchars);
761 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
762 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
763 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
764 }
765# if there are any input chars left, there are either
766# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
767 my @chars=(split '',$enc_text);
768 if (length($enc_text)) {
769 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
770 }
771 if (length($enc_text)==3) {
772 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
773 }
774 return $decoded;
775}
776
777sub convert2unicode {
778 my $self = shift(@_);
779 my ($charset, $textref) = @_;
780
781 # first get our character encoding name in the right form.
782 $charset = "iso_8859_1" unless defined $charset;
783 $charset=~tr/A-Z/a-z/;
784 $charset=~s/\-/_/g;
785 $charset=~s/gb2312/gb/;
786 # assumes EUC-KR, not ISO-2022 !?
787 $charset=~s/ks_c_5601_1987/korean/;
788
789 # It appears that we can't always trust ascii text so we'll treat it
790 # as iso-8859-1 (letting characters above 0x80 through without
791 # converting them to utf-8 will result in invalid XML documents
792 # which can't be parsed at build time).
793 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
794
795 if ($charset eq "iso_8859_1") {
796 # test if the mailer lied, and it has win1252 chars in it...
797 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
798 if ($$textref =~ m/[\x80-\x9f]/) {
799 my $outhandle = $self->{'outhandle'};
800 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
801 print $outhandle "codepage 1252 detected.\n";
802 $charset = "windows_1252";
803 }
804 }
805 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
806}
807
808
809# Perl packages have to return true if they are run.
8101;
Note: See TracBrowser for help on using the repository browser.