source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3111

Last change on this file since 3111 was 3111, checked in by jrm21, 22 years ago

Allow .eml extension (IE and mozilla default to this for individual files).

Tidy up the checks for testing for valid mail file format.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.0 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter ([email protected])
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63package EMAILPlug;
64
65use SplitPlug;
66
67use unicode;
68
69use sorttools;
70use util;
71
72
73# EMAILPlug is a sub-class of SplitPlug.
74
75sub BEGIN {
76 @ISA = ('SplitPlug');
77}
78
79# Create a new EMAILPlug object with which to parse a file.
80# Accomplished by creating a new BasPlug and using bless to
81# turn it into an EMAILPlug.
82
83sub new {
84 my ($class) = @_;
85 my $self = new BasPlug ("EMAILPlug", @_);
86 # this might not actually be true at read-time, but after processing
87 # it should all be utf8.
88 $self->{'input_encoding'}="utf8";
89 return bless $self, $class;
90}
91
92sub get_default_process_exp {
93 my $self = shift (@_);
94 # mbx/email for mailbox file format, \d+ for maildir (each message is
95 # in a separate file, with a unique number for filename)
96 # mozilla and IE will save individual mbx format files with a ".eml" ext.
97 return q@([\\/]\d+|\.(mbx|email|eml))$@;
98}
99
100# This plugin splits the mbox mail files at lines starting with From<sp>
101# It is supposed to be "\n\nFrom ", but this isn't always used.
102sub get_default_split_exp {
103 return q^\nFrom .*\n^;
104}
105
106
107# do plugin specific processing of doc_obj
108sub process {
109
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid mail file
115 # mbox message files start with "From "
116 # maildir messages usually start with Return-Path and Delivered-To
117 # mh is very similar to maildir
118 my $startoffile=substr($$textref,0,256);
119 if (($startoffile !~ /^(From )/) &&
120 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
121 return undef;
122 }
123
124
125 print $outhandle "EMAILPlug: processing $file\n"
126 if $self->{'verbosity'} > 1;
127
128 my $cursection = $doc_obj->get_top_section();
129
130 #
131 # Parse the document's text and extract metadata
132 #
133
134 # Protect backslashes
135 $$textref =~ s@\\@\\\\@g;
136
137 # Separate header from body of message
138 my $Headers = $$textref;
139 $Headers =~ s/\r?\n\r?\n(.*)$//s;
140 $$textref = $1;
141
142 # See if headers include non-ascii - RFC says whole header should be ascii.
143# not yet implemented, as we don't know what character set is the
144# user's default... We can do textcat to guess, or we can just choose
145# one of the charset fields later in the document (if there are any...)
146# if ($Headers =~ /([[:^ascii:]])/) {
147# }
148
149 # Unfold headers - see rfc822
150 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
151 # Extract basic metadata from header
152 my @headers = ("From", "To", "Subject", "Date");
153 my %raw;
154 foreach my $name (@headers) {
155 $raw{$name} = "No $name value";
156 }
157
158 # Examine each line of the headers
159 my ($line, $name, $value);
160 my @parts;
161 foreach $line (split(/\n/, $Headers)) {
162
163 # Ignore lines with no content or which begin with whitespace
164 next unless ($line =~ /:/);
165 next if ($line =~ /^\s/);
166
167 # Find out what metadata is on this line
168 @parts = split(/:/, $line);
169 $name = shift @parts;
170# uppercase the first character according to the current locale
171 $name=~s/(.+)/\u$1/;
172 next unless $name;
173 next unless ($raw{$name});
174
175 # Find the value of that metadata
176 $value = join(":", @parts);
177 $value =~ s/^\s+//;
178 $value =~ s/\s+$//;
179
180 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
181 if ($value =~ /=\?/) {
182 my $original_value=$value;
183 my $encoded=$value;
184 $value="";
185 # this isn't quite right yet regarding spaces between encoded-texts
186 # (see examples, section 8. of rfc).
187 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
188 my ($charset, $encoding, $data)=($2,$3,$4);
189 my ($decoded_data);
190 $value.="$1"; # any leading chars
191 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
192 chomp $data;
193 $encoding =~ tr/BQ/bq/;
194 if ($encoding eq "q") { # quoted printable
195 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
196 $decoded_data=qp_decode($data);
197 } else { # base 64
198 $decoded_data=base64_decode($data);
199 }
200 $self->convert2unicode($charset, \$decoded_data);
201 $value .= $decoded_data;
202 } # end of while loop
203
204 # get any trailing characters
205 $self->convert2unicode("iso_8859_1", \$encoded);
206 $value.=$encoded;
207
208 if ($value =~ /^\s*$/) { # we couldn't extract anything...
209 $self->convert2unicode("iso_8859_1", \$original_value);
210 $value=original_value;
211 }
212 } # end of if =?...?=
213
214 # Store the metadata
215 $raw{$name} = $value;
216 }
217
218 # Extract the name and e-mail address from the From metadata
219 $frommeta = $raw{"From"};
220 my $fromnamemeta;
221 my $fromaddrmeta;
222
223 $frommeta =~ s/\s*$//; # Remove trailing space, if any
224
225 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
226 $fromnamemeta=$1;
227 $fromaddrmeta=$2;
228 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
229 $fromnamemeta=$2;
230 $fromaddrmeta=$1;
231 }
232 if (!defined($fromaddrmeta)) {
233 $fromaddrmeta=$frommeta;
234 }
235 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
236 # minor attempt to prevent spam-bots from harvesting addresses...
237 $fromaddrmeta=~s/@/&#64;/;
238 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
239
240 if (defined($fromnamemeta)) {
241 $fromnamemeta =~ s/\"//g;
242 }
243 else {
244 $fromnamemeta = $fromaddrmeta;
245 }
246 # if name is an address
247 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
248 $fromnamemeta=~s/@/&#64\;/;
249 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
250
251 $raw{"From"}=$frommeta;
252
253 # Process Date information
254 if ($raw{"Date"} !~ /No Date/) {
255 $raw{"DateText"} = $raw{"Date"};
256
257 # Convert the date text to internal date format
258 $value = $raw{"Date"};
259 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
260 if ($year < 100) { $year += 1900; }
261 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
262
263 } else {
264 # We have not extracted a date
265 $raw{"DateText"} = "Unknown.";
266 $raw{"Date"} = "19000000";
267 }
268
269 # Add extracted metadata to document object
270 foreach my $name (keys %raw) {
271 $value = $raw{$name};
272 if ($value) {
273 # assume subject, etc headers have no special HTML meaning.
274 $value = &text_into_html($value);
275 # escape [] so it isn't re-interpreted as metadata
276 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
277 } else {
278 $value = "No $name field";
279 }
280 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
281 }
282
283 my $mimetype="text/plain";
284 my $mimeinfo="";
285 my $charset = "iso_8859_1";
286 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
287 # more than one parameter given to Content-type.
288 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
289 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
290 {
291 $mimetype=$1;
292 $mimetype =~ tr/[A-Z]/[a-z]/;
293
294 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
295 $mimetype = "text/plain";
296 }
297
298 $mimeinfo=$2;
299 if (!defined $mimeinfo) {
300 $mimeinfo="";
301 } else { # strip leading and trailing stuff
302 $mimeinfo =~ s/^\;\s*//;
303 $mimeinfo =~ s/\s*$//;
304 }
305 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
306 $charset = $1;
307 }
308 }
309
310 my $transfer_encoding="7bit";
311 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
312 $transfer_encoding=$1;
313 }
314 if ($mimetype eq "text/html") {
315 $$textref= $self->text_from_part("$Headers\n$$textref");
316 } elsif ($mimetype ne "text/plain") {
317 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
318 $outhandle);
319 } elsif ($transfer_encoding =~ /quoted\-printable/) {
320 $$textref=qp_decode($$textref);
321 $self->convert2unicode($charset, $textref);
322 } elsif ($transfer_encoding =~ /base64/) {
323 $$textref=base64_decode($$textref);
324 $self->convert2unicode($charset, $textref);
325 } else {
326 $self->convert2unicode($charset, $textref);
327 }
328
329
330 # Add "All headers" metadata
331 $Headers = &text_into_html($Headers);
332
333 $Headers = "No headers" unless ($Headers =~ /\w/);
334 $Headers =~ s/@/&#64\;/g;
335 # escape [] so it isn't re-interpreted as metadata
336 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
337
338 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
339
340
341 # Add Title metadata
342 my $Title = text_into_html($raw{'Subject'});
343 $Title .= "<br>From: " . text_into_html($raw{'From'});
344 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
345 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
346
347 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
348
349
350 # Add text to document object
351 if ($mimetype eq "text/plain") {
352 $$textref = &text_into_html($$textref);
353 }
354 $$textref = "No message" unless ($$textref =~ /\w/);
355 $doc_obj->add_utf8_text($cursection, $$textref);
356
357 return 1;
358}
359
360
361# Convert a text string into HTML.
362#
363# The HTML is going to be inserted into a GML file, so
364# we have to be careful not to use symbols like ">",
365# which ocurs frequently in email messages (and use
366# &gt instead.
367#
368# This function also turns links and email addresses into hyperlinks,
369# and replaces carriage returns with <BR> tags (and multiple carriage
370# returns with <P> tags).
371
372
373sub text_into_html {
374 my ($text) = @_;
375
376 # Convert problem characters into HTML symbols
377 $text =~ s/&/&amp;/go;
378 $text =~ s/</&lt;/go;
379 $text =~ s/>/&gt;/go;
380 $text =~ s/\"/&quot;/go;
381
382 # convert email addresses and URIs into links
383# don't markup email addresses for now
384# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
385
386 # try to munge email addresses a little bit...
387 $text =~ s/@/&#64;/;
388 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
389 # assume URI doesn't finish with a '.'
390 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
391
392
393 # Clean up whitespace and convert \n charaters to <BR> or <P>
394 $text =~ s/ +/ /go;
395 $text =~ s/\s*$//o;
396 $text =~ s/^\s*//o;
397 $text =~ s/\n/\n<BR>/go;
398 $text =~ s/<BR>\s*<BR>/<P>/go;
399
400 return $text;
401}
402
403
404
405
406#Process a MIME message.
407# the textref we are given DOES NOT include the header.
408sub text_from_mime_message {
409 my $self = shift(@_);
410 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
411
412 # Check for multiparts - $mimeinfo will be a boundary
413 if ($mimetype =~ /multipart/) {
414 $boundary="";
415 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
416 $boundary=$1;
417 if ($boundary =~ m@^\"@) {
418 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
419 }
420 } else {
421 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
422 }
423 # parts start with "--$boundary"
424 # message ends with "--$boundary--"
425 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
426 # that perl might want to interpolate. Also allows spaces...
427 $boundary=~s/\\/\\\\/g;
428 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
429 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
430 # remove first "part" and last "part" (final --)
431 shift @message_parts;
432 my $last=pop @message_parts;
433 # if our boundaries are a bit dodgy and we only found 1 part...
434 if (!defined($last)) {$last="";}
435 # make sure it is only -- and whitespace
436 if ($last !~ /^\-\-\s*$/ms) {
437 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
438 }
439 foreach my $message_part (@message_parts) {
440 # remove the leading newline left from split.
441 $message_part=~s/^\r?\n//;
442 }
443 if ($mimetype eq "multipart/alternative") {
444 # check for an HTML version first, then TEXT, otherwise use first.
445 my $part_text="";
446 foreach my $message_part (@message_parts) {
447 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
448 {
449 # Use the HTML version
450 $part_text= $self->text_from_part($message_part);
451 $mimetype="text/html";
452 last;
453 }
454 }
455 if ($part_text eq "") { # try getting a text part instead
456 foreach my $message_part (@message_parts) {
457 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
458 {
459 # Use the plain version
460 $part_text= $self->text_from_part($message_part);
461 if ($part_text =~/[^\s]/) {
462 $part_text="<pre>".$part_text."</pre>";
463 }
464 $mimetype="text/plain";
465 last;
466 }
467 }
468 }
469 if ($part_text eq "") { # use first part
470 $part_text= $self->text_from_part(shift @message_parts);
471 }
472 if ($part_text eq "") { # we couldn't get anything!!!
473 # or it was an empty message...
474 # do nothing...
475 print $outhandle "EMAILPlug: no text - empty body?\n";
476 } else {
477 $text=$part_text;
478 }
479 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
480 $text="";
481 foreach my $message_part (@message_parts) {
482 my $part_header=$message_part;
483 my $part_body;
484 if ($message_part=~ /^\s*\n/) {
485 # no header... use defaults
486 $part_body=$message_part;
487 $part_header="Content-type: text/plain; charset=us-ascii";
488 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
489 $part_body=$1;
490 } else {
491 # something's gone wrong...
492 $part_header="";
493 $part_body=$message_part;
494 }
495
496 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
497 my $part_content_type="";
498 my $part_content_info="";
499 if ($mimetype eq "multipart/digest") {
500 # default type - RTFRFC!!
501 $part_content_type="message/rfc822";
502 }
503 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
504 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
505 $part_content_info=$2;
506 }
507 my $filename="";
508 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
509 $filename=$1;
510 }
511
512 # disposition - either inline or attachment.
513 # NOT CURRENTLY USED - we display all text types instead...
514 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
515
516 # add <<attachment>> to each part except the first...
517 if ($text ne "") {
518 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
519 # add part info header
520 $text.="<br>Type: $part_content_type<br>\n";
521 if ($filename ne "") {
522 $text.="Filename: $filename\n";
523 }
524 $text.="</strong></p>\n";
525 }
526
527 if ($part_content_type =~ m@text/@)
528 {
529 my $part_text= $self->text_from_part($message_part);
530 if ($part_content_type !~ m@text/(ht|x)ml@) {
531 $part_text=text_into_html($part_text);
532 }
533 if ($part_text eq "") {
534 $part_text='&lt;&lt;empty message&gt;&gt;';
535 }
536 $text.=$part_text;
537 } elsif ($part_content_type =~ m@message/rfc822@) {
538 # This is a forwarded message
539 my $message_part_headers=$part_body;
540 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
541 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
542 my $message_part_body=$1;
543
544 my $rfc822_formatted_body=""; # put result in here
545 if ($message_part_headers =~
546 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
547 {
548 # The message header uses MIME flags
549 my $message_content_type=$1;
550 my $message_content_info=$2;
551 if (!defined($message_content_info)) {
552 $message_content_info="";
553 }
554 $message_content_type =~ tr/A-Z/a-z/;
555 if ($message_content_type =~ /multipart/) {
556 $rfc822_formatted_body=
557 $self->text_from_mime_message($message_content_type,
558 $message_content_info,
559 $message_part_body,
560 $outhandle);
561 } else {
562 $message_part_body= $self->text_from_part($part_body);
563 $rfc822_formatted_body=text_into_html($message_part_body);
564 }
565 } else {
566 # message doesn't use MIME flags
567 $rfc822_formatted_body=text_into_html($message_part_body);
568 }
569 # Add the returned text to the output
570 # don't put all the headers...
571 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
572 $text.=text_into_html($message_part_headers);
573 $text.="<p>\n";
574 $text.=$rfc822_formatted_body;
575 # end of message/rfc822
576 } elsif ($part_content_type =~ /multipart/) {
577 # recurse again
578
579 $tmptext= $self->text_from_mime_message($part_content_type,
580 $part_content_info,
581 $part_body,
582 $outhandle);
583 $text.=$tmptext;
584 } elsif ($text eq "") {
585 # we can't do anything with this part, but if it's the first
586 # part then make sure it is mentioned..
587
588 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
589 # add part info header
590 $text.="<br>Type: $part_content_type<br>\n";
591 if ($filename ne "") {
592 $text.="Filename: $filename\n";
593 }
594 $text.="</strong></p>\n";
595 }
596 } # foreach message part.
597 } else {
598 # we can't handle this multipart type (not mixed or alternative)
599 # the RFC also mentions "parallel".
600 }
601 } # end of ($mimetype !~ multipart)
602 elsif ($mimetype =~ m@message/rfc822@) {
603 my $msg_header = $text;
604 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
605 $text = $1;
606
607 if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
608 {
609 $mimetype=$1;
610 $mimetype =~ tr/[A-Z]/[a-z]/;
611 $mimeinfo=$2;
612 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
613 $charset = $1;
614 }
615 my $msg_text;
616 if ($mimetype =~ m@multipart/@) {
617 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
618 $text, $outhandle);
619 } else {$msg_text=$text;}
620
621 my $brief_header="";
622 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
623 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
624 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
625 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
626 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
627 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
628 $text.= "<table><tr><td width=\"5%\"> </td>\n";
629 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
630 . "</td></tr></table>";
631 }
632 } else {
633 # we don't do any processing of the content.
634 }
635
636 return $text;
637}
638
639
640
641
642
643
644# Process a MIME part. Return "" if we can't decode it.
645sub text_from_part {
646 my $self = shift(@_);
647 my $text=shift;
648 my $part_header=$text;
649 # check for empty part header (leading blank line)
650 if ($text =~ /^\s*\r?\n/) {
651 $part_header="Content-type: text/plain; charset=us-ascii";
652 } else {
653 $part_header =~ s/\r?\n\r?\n(.*)$//s;
654 $text=$1; if (!defined($text)) {$text="";}
655 }
656 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
657 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
658 my $type=$1;
659 my $charset=$2;
660 if (!defined($type)) {$type="";}
661 if (!defined($charset)) {$charset="ascii";}
662 my $encoding="";
663 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
664 $encoding=$1; $encoding=~tr/A-Z/a-z/;
665 }
666 # Content-Transfer-Encoding is per-part
667 if ($encoding ne "") {
668 if ($encoding =~ /quoted\-printable/) {
669 $text=qp_decode($text);
670 } elsif ($encoding =~ /base64/) {
671 $text=base64_decode($text);
672 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
673 # rfc2045 also allows binary, which we ignore (for now).
674 # maybe this shouldn't go to stderr, but anyway...
675 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
676 return "";
677 }
678 }
679 if ($type eq "text/html") {
680 # only get stuff between <body> tags, or <html> tags.
681 $text =~ s@^.*<html[^>]*>@@is;
682 $text =~ s@</html>.*$@@is;
683 $text =~ s/^.*?<body[^>]*>//si;
684 $text =~ s/<\/body>.*$//si;
685 }
686 elsif ($type eq "text/xml") {
687 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
688 $text="<pre>\n$text\n</pre>\n";
689 }
690 # convert to unicode
691 $self->convert2unicode($charset, \$text);
692 return $text;
693}
694
695
696# decode quoted-printable text
697sub qp_decode {
698 my $text=shift;
699
700 my @lines=split('\n', $text);
701
702 # if a line ends with "=\s*", it is a soft line break, otherwise
703 # keep in any newline characters.
704 foreach my $line (@lines) {
705 if ($line !~ s/=\s*$//) {$line.="\n";}
706
707 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
708 my @hexcode_segments=split('=',$line);
709 shift @hexcode_segments;
710 my @hexcodes;
711 foreach my $hexcode (@hexcode_segments) {
712 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
713 chomp($hexcode); # just in case...
714 my $char=chr (hex "0x$hexcode");
715 $line =~ s/=$hexcode/$char/g;
716 }
717 }
718 }
719 $text= join('', @lines);
720 return $text;
721}
722
723# decode base64 text. This is fairly slow (since it's interpreted perl rather
724# than compiled XS stuff like in the ::MIME modules, but this is more portable
725# for us at least).
726# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
727# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
728# from each encoded byte.
729
730
731sub base64_decode {
732 my $enc_text = shift;
733# A=>0, B=>1, ..., '+'=>62, '/'=>63
734# also '=' is used for padding at the end, but we remove it anyway.
735 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
736# map each MIME char into it's value, for more efficient lookup.
737 my %index;
738 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
739# remove all non-base64 chars. eval to get variable in transliteration...
740# also remove '=' - we'll assume (!!) that there are no errors in the encoding
741 eval "\$enc_text =~ tr|$mimechars||cd";
742 my $decoded="";
743 while (length ($enc_text)>3)
744 {
745 my $fourchars=substr($enc_text,0,4,"");
746 my @chars=(split '',$fourchars);
747 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
748 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
749 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
750 }
751# if there are any input chars left, there are either
752# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
753 my @chars=(split '',$enc_text);
754 if (length($enc_text)) {
755 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
756 }
757 if (length($enc_text)==3) {
758 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
759 }
760 return $decoded;
761}
762
763sub convert2unicode {
764 my $self = shift(@_);
765 my ($charset, $textref) = @_;
766
767 # first get our character encoding name in the right form.
768 $charset = "iso_8859_1" unless defined $charset;
769 $charset=~tr/A-Z/a-z/;
770 $charset=~s/\-/_/g;
771 $charset=~s/gb2312/gb/;
772 # assumes EUC-KR, not ISO-2022 !?
773 $charset=~s/ks_c_5601_1987/korean/;
774
775 # It appears that we can't always trust ascii text so we'll treat it
776 # as iso-8859-1 (letting characters above 0x80 through without
777 # converting them to utf-8 will result in invalid XML documents
778 # which can't be parsed at build time).
779 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
780
781 if ($charset eq "iso_8859_1") {
782 # test if the mailer lied, and it has win1252 chars in it...
783 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
784 if ($$textref =~ m/[\x80-\x9f]/) {
785 my $outhandle = $self->{'outhandle'};
786 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
787 print $outhandle "codepage 1252 detected.\n";
788 $charset = "windows_1252";
789 }
790 }
791 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
792}
793
794
795# Perl packages have to return true if they are run.
7961;
Note: See TracBrowser for help on using the repository browser.