source: trunk/gsdl/perllib/plugins/EMAILPlug.pm@ 3143

Last change on this file since 3143 was 3143, checked in by jrm21, 20 years ago

Minor tweak for badly formatted dates. We now use a window, so anything less than 20 is assumed to be 2000 - 2020, and anything over 20 is assumed to be 1920. (I got spam dated "22 May 01" or some such). Previously we were just adding 1900, which meant 1901 in this case.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.8 KB
Line 
1###########################################################################
2#
3# EMAILPlug.pm - a plugin for parsing email files
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999-2001 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28
29# EMAILPlug
30#
31# by Gordon Paynter (gwp@cs.waikato.ac.nz)
32#
33# Email plug reads email files. These are named with a simple
34# number (i.e. as they appear in maildir folders) or with the
35# extension .mbx (for mbox mail file format)
36#
37# Document text:
38# The document text consists of all the text
39# after the first blank line in the document.
40#
41# Metadata (not Dublin Core!):
42# $Headers All the header content
43# $Subject Subject: header
44# $To To: header
45# $From From: header
46# $FromName Name of sender (where available)
47# $FromAddr E-mail address of sender
48# $DateText Date: header
49# $Date Date: header in GSDL format (eg: 19990924)
50#
51# $Title made up of Subject, Date and Sender (for default formatting)
52#
53#
54# John McPherson - June/July 2001
55# added (basic) MIME support and quoted-printable and base64 decodings.
56# Minor fixes for names that are actually email addresses (ie <...> was lost)
57#
58# See: * RFC 822 - ARPA Internet Text Messages
59# * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
60# * RFC 2046 - MIME (part 2) Media Types (and multipart messages)
61# * RFC 2047 - MIME (part 3) Message Header Extensions
62# * RFC 1806 - Content Dispositions (ie inline/attachment)
63package EMAILPlug;
64
65use SplitPlug;
66
67use unicode;
68
69use sorttools;
70use util;
71
72
73# EMAILPlug is a sub-class of SplitPlug.
74
75sub BEGIN {
76 @ISA = ('SplitPlug');
77}
78
79# Create a new EMAILPlug object with which to parse a file.
80# Accomplished by creating a new BasPlug and using bless to
81# turn it into an EMAILPlug.
82
83sub new {
84 my ($class) = @_;
85 my $self = new BasPlug ("EMAILPlug", @_);
86 # this might not actually be true at read-time, but after processing
87 # it should all be utf8.
88 $self->{'input_encoding'}="utf8";
89 return bless $self, $class;
90}
91
92sub get_default_process_exp {
93 my $self = shift (@_);
94 # mbx/email for mailbox file format, \d+ for maildir (each message is
95 # in a separate file, with a unique number for filename)
96 # mozilla and IE will save individual mbx format files with a ".eml" ext.
97 return q@([\\/]\d+|\.(mbx|email|eml))$@;
98}
99
100# This plugin splits the mbox mail files at lines starting with From<sp>
101# It is supposed to be "\n\nFrom ", but this isn't always used.
102sub get_default_split_exp {
103 return q^\nFrom .*\n^;
104}
105
106
107# do plugin specific processing of doc_obj
108sub process {
109
110 my $self = shift (@_);
111 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
112 my $outhandle = $self->{'outhandle'};
113
114 # Check that we're dealing with a valid mail file
115 # mbox message files start with "From "
116 # maildir messages usually start with Return-Path and Delivered-To
117 # mh is very similar to maildir
118 my $startoffile=substr($$textref,0,256);
119 if (($startoffile !~ /^(From )/) &&
120 ($startoffile !~ /^(From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\-.*|MIME-Version|Forwarded):/im)) {
121 return undef;
122 }
123
124
125 print $outhandle "EMAILPlug: processing $file\n"
126 if $self->{'verbosity'} > 1;
127
128 my $cursection = $doc_obj->get_top_section();
129
130 #
131 # Parse the document's text and extract metadata
132 #
133
134 # Protect backslashes
135 $$textref =~ s@\\@\\\\@g;
136
137 # Separate header from body of message
138 my $Headers = $$textref;
139 $Headers =~ s/\r?\n\r?\n(.*)$//s;
140 $$textref = $1;
141
142 # Unfold headers - see rfc822
143 $Headers =~ s/\r?\n[\t\ ]+/ /gs;
144 # Extract basic metadata from header
145 my @headers = ("From", "To", "Subject", "Date");
146 my %raw;
147 foreach my $name (@headers) {
148 $raw{$name} = "No $name value";
149 }
150
151 # Get a default encoding for the header - RFC says should be ascii...
152 my $default_heading_encoding="iso_8859_1";
153
154 # We don't know what character set is the user's default...
155 # We could use textcat to guess... for now we'll look at mime content-type
156# if ($Headers =~ /([[:^ascii:]])/) {
157# }
158 if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
159 $default_header_encoding=$1;
160 $default_header_encoding =~ s@\-@_@g;
161 $default_header_encoding =~ tr/A-Z/a-z/;
162 }
163
164
165 # Examine each line of the headers
166 my ($line, $name, $value);
167 my @parts;
168 foreach $line (split(/\n/, $Headers)) {
169
170 # Ignore lines with no content or which begin with whitespace
171 next unless ($line =~ /:/);
172 next if ($line =~ /^\s/);
173
174 # Find out what metadata is on this line
175 @parts = split(/:/, $line);
176 $name = shift @parts;
177 # get fieldname in canonical form - first cap, then lower case.
178 $name =~ tr/A-Z/a-z/;
179 # uppercase the first character according to the current locale
180 $name=~s/(.+)/\u$1/;
181 next unless $name;
182 next unless ($raw{$name});
183
184 # Find the value of that metadata
185 $value = join(":", @parts);
186 $value =~ s/^\s+//;
187 $value =~ s/\s+$//;
188
189 # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
190 if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
191 my $original_value=$value;
192 my $encoded=$value;
193 $value="";
194 # this isn't quite right yet regarding spaces between encoded-texts
195 # (see examples, section 8. of rfc).
196 while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
197 my ($charset, $encoding, $data)=($2,$3,$4);
198 my ($decoded_data);
199 $value.="$1"; # any leading chars
200 $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
201 chomp $data;
202 $encoding =~ tr/BQ/bq/;
203 if ($encoding eq "q") { # quoted printable
204 $data =~ s/_/\ /g; # from rfc2047 (sec 4.2.2)
205 $decoded_data=qp_decode($data);
206 } else { # base 64
207 $decoded_data=base64_decode($data);
208 }
209 $self->convert2unicode($charset, \$decoded_data);
210 $value .= $decoded_data;
211 } # end of while loop
212
213 # get any trailing characters
214 $self->convert2unicode($default_header_encoding, \$encoded);
215 $value.=$encoded;
216
217 if ($value =~ /^\s*$/) { # we couldn't extract anything...
218 $self->convert2unicode($default_header_encoding,
219 \$original_value);
220 $value=original_value;
221 }
222 } # end of if =?...?=
223
224 # In the absense of other charset information, assume the
225 # header is the default (usually "iso_8859_1") and convert it to unicode.
226 else {
227 $self->convert2unicode($default_header_encoding, \$value);
228 }
229
230 # Store the metadata
231 $raw{$name} = $value;
232 }
233
234 # Extract the name and e-mail address from the From metadata
235 $frommeta = $raw{"From"};
236 my $fromnamemeta;
237 my $fromaddrmeta;
238
239 $frommeta =~ s/\s*$//; # Remove trailing space, if any
240
241 if ($frommeta =~ m/(.+)\s*<(.+)>/) {
242 $fromnamemeta=$1;
243 $fromaddrmeta=$2;
244 } elsif ($frommeta =~ m/(.+@.+)\s+\((.*)\)/) {
245 $fromnamemeta=$2;
246 $fromaddrmeta=$1;
247 }
248 if (!defined($fromaddrmeta)) {
249 $fromaddrmeta=$frommeta;
250 }
251 $fromaddrmeta=~s/<//; $fromaddrmeta=~s/>//;
252 # minor attempt to prevent spam-bots from harvesting addresses...
253 $fromaddrmeta=~s/@/&#64;/;
254 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
255
256 if (defined($fromnamemeta)) {
257 $fromnamemeta =~ s/\"//g;
258 }
259 else {
260 $fromnamemeta = $fromaddrmeta;
261 }
262 # if name is an address
263 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
264 $fromnamemeta=~s/@/&#64\;/;
265 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
266
267 $raw{"From"}=$frommeta;
268
269 # Process Date information
270 if ($raw{"Date"} !~ /No Date/) {
271 $raw{"DateText"} = $raw{"Date"};
272
273 # Convert the date text to internal date format
274 $value = $raw{"Date"};
275 my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
276 # make some assumptions about the year formatting...
277 # some (old) software thinks 2001 is 101, some think 2001 is 01
278 if ($year < 20) { $year += 2000; } # assume not really 1920...
279 elsif ($year < 150) { $year += 1900; } # assume not really 2150...
280 $raw{"Date"} = &sorttools::format_date($day, $month, $year);
281
282 } else {
283 # We have not extracted a date
284 $raw{"DateText"} = "Unknown.";
285 $raw{"Date"} = "19000000";
286 }
287
288 # Add extracted metadata to document object
289 foreach my $name (keys %raw) {
290 $value = $raw{$name};
291 if ($value) {
292 # assume subject, etc headers have no special HTML meaning.
293 $value = &text_into_html($value);
294 # escape [] so it isn't re-interpreted as metadata
295 $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
296 } else {
297 $value = "No $name field";
298 }
299 $doc_obj->add_utf8_metadata ($cursection, $name, $value);
300 }
301
302 my $mimetype="text/plain";
303 my $mimeinfo="";
304 my $charset = $default_header_encoding;
305 # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
306 # more than one parameter given to Content-type.
307 # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
308 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
309 {
310 $mimetype=$1;
311 $mimetype =~ tr/[A-Z]/[a-z]/;
312
313 if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
314 $mimetype = "text/plain";
315 }
316
317 $mimeinfo=$2;
318 if (!defined $mimeinfo) {
319 $mimeinfo="";
320 } else { # strip leading and trailing stuff
321 $mimeinfo =~ s/^\;\s*//;
322 $mimeinfo =~ s/\s*$//;
323 }
324 if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
325 $charset = $1;
326 }
327 }
328
329 my $transfer_encoding="7bit";
330 if ($Headers =~ /^content-transfer-encoding:\s*([^\s]+)\s*$/mi) {
331 $transfer_encoding=$1;
332 }
333 if ($mimetype eq "text/html") {
334 $$textref= $self->text_from_part("$Headers\n$$textref");
335 } elsif ($mimetype ne "text/plain") {
336 $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
337 $outhandle);
338 } elsif ($transfer_encoding =~ /quoted\-printable/) {
339 $$textref=qp_decode($$textref);
340 $self->convert2unicode($charset, $textref);
341 } elsif ($transfer_encoding =~ /base64/) {
342 $$textref=base64_decode($$textref);
343 $self->convert2unicode($charset, $textref);
344 } else {
345 $self->convert2unicode($charset, $textref);
346 }
347
348
349 # Add "All headers" metadata
350 $Headers = &text_into_html($Headers);
351
352 $Headers = "No headers" unless ($Headers =~ /\w/);
353 $Headers =~ s/@/&#64\;/g;
354 # escape [] so it isn't re-interpreted as metadata
355 $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
356 $self->convert2unicode($charset, \$Headers);
357 $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
358
359
360 # Add Title metadata
361 my $Title = text_into_html($raw{'Subject'});
362 $Title .= "<br>From: " . text_into_html($raw{'From'});
363 $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
364 $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
365
366 $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
367
368
369 # Add text to document object
370 if ($mimetype eq "text/plain") {
371 $$textref = &text_into_html($$textref);
372 }
373 $$textref = "No message" unless ($$textref =~ /\w/);
374 $doc_obj->add_utf8_text($cursection, $$textref);
375
376 return 1;
377}
378
379
380# Convert a text string into HTML.
381#
382# The HTML is going to be inserted into a GML file, so
383# we have to be careful not to use symbols like ">",
384# which ocurs frequently in email messages (and use
385# &gt instead.
386#
387# This function also turns links and email addresses into hyperlinks,
388# and replaces carriage returns with <BR> tags (and multiple carriage
389# returns with <P> tags).
390
391
392sub text_into_html {
393 my ($text) = @_;
394
395 # Convert problem characters into HTML symbols
396 $text =~ s/&/&amp;/g;
397 $text =~ s/</&lt;/g;
398 $text =~ s/>/&gt;/g;
399 $text =~ s/\"/&quot;/g;
400
401 # convert email addresses and URIs into links
402# don't markup email addresses for now
403# $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
404
405 # try to munge email addresses a little bit...
406 $text =~ s/@/&#64;/;
407 # assume hostnames are \.\w\- only, then might have a trailing '/.*'
408 # assume URI doesn't finish with a '.'
409 $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
410
411
412 # Clean up whitespace and convert \n charaters to <BR> or <P>
413 $text =~ s/ +/ /g;
414 $text =~ s/\s*$//g;
415 $text =~ s/^\s*//g;
416 $text =~ s/\n/\n<br>/g;
417 $text =~ s/<br>\s*<br>/<p>/gi;
418
419 return $text;
420}
421
422
423
424
425#Process a MIME message.
426# the textref we are given DOES NOT include the header.
427sub text_from_mime_message {
428 my $self = shift(@_);
429 my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
430
431 # Check for multiparts - $mimeinfo will be a boundary
432 if ($mimetype =~ /multipart/) {
433 $boundary="";
434 if ($mimeinfo =~ m@boundary=(\"[^\"]+\"|[^\s]+)\s*$@im) {
435 $boundary=$1;
436 if ($boundary =~ m@^\"@) {
437 $boundary =~ s@^\"@@; $boundary =~ s@\"$@@;
438 }
439 } else {
440 print $outhandle "EMAILPlug: (warning) couldn't parse MIME boundary\n";
441 }
442 # parts start with "--$boundary"
443 # message ends with "--$boundary--"
444 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
445 # that perl might want to interpolate. Also allows spaces...
446 $boundary=~s/\\/\\\\/g;
447 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
448 my @message_parts = split("\r?\n\-\-$boundary", "\n$text");
449 # remove first "part" and last "part" (final --)
450 shift @message_parts;
451 my $last=pop @message_parts;
452 # if our boundaries are a bit dodgy and we only found 1 part...
453 if (!defined($last)) {$last="";}
454 # make sure it is only -- and whitespace
455 if ($last !~ /^\-\-\s*$/ms) {
456 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
457 }
458 foreach my $message_part (@message_parts) {
459 # remove the leading newline left from split.
460 $message_part=~s/^\r?\n//;
461 }
462 if ($mimetype eq "multipart/alternative") {
463 # check for an HTML version first, then TEXT, otherwise use first.
464 my $part_text="";
465 foreach my $message_part (@message_parts) {
466 if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
467 {
468 # Use the HTML version
469 $part_text= $self->text_from_part($message_part);
470 $mimetype="text/html";
471 last;
472 }
473 }
474 if ($part_text eq "") { # try getting a text part instead
475 foreach my $message_part (@message_parts) {
476 if ($message_part =~ m@^content\-type:\s*text/plain@mis)
477 {
478 # Use the plain version
479 $part_text= $self->text_from_part($message_part);
480 if ($part_text =~/[^\s]/) {
481 $part_text="<pre>".$part_text."</pre>";
482 }
483 $mimetype="text/plain";
484 last;
485 }
486 }
487 }
488 if ($part_text eq "") { # use first part
489 $part_text= $self->text_from_part(shift @message_parts);
490 }
491 if ($part_text eq "") { # we couldn't get anything!!!
492 # or it was an empty message...
493 # do nothing...
494 print $outhandle "EMAILPlug: no text - empty body?\n";
495 } else {
496 $text=$part_text;
497 }
498 } elsif ($mimetype =~ m@multipart/(mixed|digest|related)@) {
499 $text="";
500 foreach my $message_part (@message_parts) {
501 my $part_header=$message_part;
502 my $part_body;
503 if ($message_part=~ /^\s*\n/) {
504 # no header... use defaults
505 $part_body=$message_part;
506 $part_header="Content-type: text/plain; charset=us-ascii";
507 } elsif ($part_header=~s/\r?\n\r?\n(.*)$//sg) {
508 $part_body=$1;
509 } else {
510 # something's gone wrong...
511 $part_header="";
512 $part_body=$message_part;
513 }
514
515 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
516 my $part_content_type="";
517 my $part_content_info="";
518 if ($mimetype eq "multipart/digest") {
519 # default type - RTFRFC!!
520 $part_content_type="message/rfc822";
521 }
522 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*(.*?)\s*$@mi) {
523 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
524 $part_content_info=$2;
525 }
526 my $filename="";
527 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
528 $filename=$1;
529 }
530
531 # disposition - either inline or attachment.
532 # NOT CURRENTLY USED - we display all text types instead...
533 # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
534
535 # add <<attachment>> to each part except the first...
536 if ($text ne "") {
537 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
538 # add part info header
539 $text.="<br>Type: $part_content_type<br>\n";
540 if ($filename ne "") {
541 $text.="Filename: $filename\n";
542 }
543 $text.="</strong></p>\n";
544 }
545
546 if ($part_content_type =~ m@text/@)
547 {
548 my $part_text= $self->text_from_part($message_part);
549 if ($part_content_type !~ m@text/(ht|x)ml@) {
550 $part_text=text_into_html($part_text);
551 }
552 if ($part_text eq "") {
553 $part_text='&lt;&lt;empty message&gt;&gt;';
554 }
555 $text.=$part_text;
556 } elsif ($part_content_type =~ m@message/rfc822@) {
557 # This is a forwarded message
558 my $message_part_headers=$part_body;
559 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
560 $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
561 my $message_part_body=$1;
562
563 my $rfc822_formatted_body=""; # put result in here
564 if ($message_part_headers =~
565 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.*?)\s*$/ims)
566 {
567 # The message header uses MIME flags
568 my $message_content_type=$1;
569 my $message_content_info=$2;
570 if (!defined($message_content_info)) {
571 $message_content_info="";
572 }
573 $message_content_type =~ tr/A-Z/a-z/;
574 if ($message_content_type =~ /multipart/) {
575 $rfc822_formatted_body=
576 $self->text_from_mime_message($message_content_type,
577 $message_content_info,
578 $message_part_body,
579 $outhandle);
580 } else {
581 $message_part_body= $self->text_from_part($part_body);
582 $rfc822_formatted_body=text_into_html($message_part_body);
583 }
584 } else {
585 # message doesn't use MIME flags
586 $rfc822_formatted_body=text_into_html($message_part_body);
587 }
588 # Add the returned text to the output
589 # don't put all the headers...
590 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
591 $text.=text_into_html($message_part_headers);
592 $text.="<p>\n";
593 $text.=$rfc822_formatted_body;
594 # end of message/rfc822
595 } elsif ($part_content_type =~ /multipart/) {
596 # recurse again
597
598 $tmptext= $self->text_from_mime_message($part_content_type,
599 $part_content_info,
600 $part_body,
601 $outhandle);
602 $text.=$tmptext;
603 } elsif ($text eq "") {
604 # we can't do anything with this part, but if it's the first
605 # part then make sure it is mentioned..
606
607 $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
608 # add part info header
609 $text.="<br>Type: $part_content_type<br>\n";
610 if ($filename ne "") {
611 $text.="Filename: $filename\n";
612 }
613 $text.="</strong></p>\n";
614 }
615 } # foreach message part.
616 } else {
617 # we can't handle this multipart type (not mixed or alternative)
618 # the RFC also mentions "parallel".
619 }
620 } # end of ($mimetype !~ multipart)
621 elsif ($mimetype =~ m@message/rfc822@) {
622 my $msg_header = $text;
623 $msg_header =~ s/\r?\n\r?\n(.*)$//s;
624 $text = $1;
625
626 if ($msg_header =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
627 {
628 $mimetype=$1;
629 $mimetype =~ tr/[A-Z]/[a-z]/;
630 $mimeinfo=$2;
631 if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
632 $charset = $1;
633 }
634 my $msg_text;
635 if ($mimetype =~ m@multipart/@) {
636 $msg_text = text_from_mime_message($self, $mimetype, $mimeinfo,
637 $text, $outhandle);
638 } else {$msg_text=$text;}
639
640 my $brief_header="";
641 if ($msg_header =~ /^(From:.*)$/im) {$brief_header.="$1<br>";}
642 if ($msg_header =~ /^(To:.*)$/im) {$brief_header.="$1<br>";}
643 if ($msg_header =~ /^(Cc:.*)$/im) {$brief_header.="$1<br>";}
644 if ($msg_header =~ /^(Subject:.*)$/im) {$brief_header.="$1<br>";}
645 if ($msg_header =~ /^(Date:.*)$/im) {$brief_header.="$1<br>";}
646 $text= "\n<b>&lt;&lt;attached message&gt;&gt;</b><br>";
647 $text.= "<table><tr><td width=\"5%\"> </td>\n";
648 $text.="<td>" . $brief_header . "\n</p>" . $msg_text
649 . "</td></tr></table>";
650 }
651 } else {
652 # we don't do any processing of the content.
653 }
654
655 return $text;
656}
657
658
659
660
661
662
663# Process a MIME part. Return "" if we can't decode it.
664sub text_from_part {
665 my $self = shift;
666 my $text = shift || '';
667 my $part_header = $text;
668
669 # check for empty part header (leading blank line)
670 if ($text =~ /^\s*\r?\n/) {
671 $part_header="Content-type: text/plain; charset=us-ascii";
672 } else {
673 $part_header =~ s/\r?\n\r?\n(.*)$//s;
674 $text=$1; if (!defined($text)) {$text="";}
675 }
676 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
677 $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
678 my $type=$1;
679 my $charset=$2;
680 if (!defined($type)) {$type="";}
681 if (!defined($charset)) {$charset="ascii";}
682 my $encoding="";
683 if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
684 $encoding=$1; $encoding=~tr/A-Z/a-z/;
685 }
686 # Content-Transfer-Encoding is per-part
687 if ($encoding ne "") {
688 if ($encoding =~ /quoted\-printable/) {
689 $text=qp_decode($text);
690 } elsif ($encoding =~ /base64/) {
691 $text=base64_decode($text);
692 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
693 # rfc2045 also allows binary, which we ignore (for now).
694 # maybe this shouldn't go to stderr, but anyway...
695 print STDERR "EMAILPlug: unknown encoding: $encoding\n";
696 return "";
697 }
698 }
699 if ($type eq "text/html") {
700 # only get stuff between <body> tags, or <html> tags.
701 $text =~ s@^.*<html[^>]*>@@is;
702 $text =~ s@</html>.*$@@is;
703 $text =~ s/^.*?<body[^>]*>//si;
704 $text =~ s/<\/body>.*$//si;
705 }
706 elsif ($type eq "text/xml") {
707 $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
708 $text="<pre>\n$text\n</pre>\n";
709 }
710 # convert to unicode
711 $self->convert2unicode($charset, \$text);
712 return $text;
713}
714
715
716# decode quoted-printable text
717sub qp_decode {
718 my $text=shift;
719
720 my @lines=split('\n', $text);
721
722 # if a line ends with "=\s*", it is a soft line break, otherwise
723 # keep in any newline characters.
724 foreach my $line (@lines) {
725 if ($line !~ s/=\s*$//) {$line.="\n";}
726
727 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
728 my @hexcode_segments=split('=',$line);
729 shift @hexcode_segments;
730 my @hexcodes;
731 foreach my $hexcode (@hexcode_segments) {
732 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars
733 chomp($hexcode); # just in case...
734 my $char=chr (hex "0x$hexcode");
735 $line =~ s/=$hexcode/$char/g;
736 }
737 }
738 }
739 $text= join('', @lines);
740 return $text;
741}
742
743# decode base64 text. This is fairly slow (since it's interpreted perl rather
744# than compiled XS stuff like in the ::MIME modules, but this is more portable
745# for us at least).
746# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
747# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
748# from each encoded byte.
749
750
751sub base64_decode {
752 my $enc_text = shift;
753# A=>0, B=>1, ..., '+'=>62, '/'=>63
754# also '=' is used for padding at the end, but we remove it anyway.
755 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
756# map each MIME char into it's value, for more efficient lookup.
757 my %index;
758 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
759# remove all non-base64 chars. eval to get variable in transliteration...
760# also remove '=' - we'll assume (!!) that there are no errors in the encoding
761 eval "\$enc_text =~ tr|$mimechars||cd";
762 my $decoded="";
763 while (length ($enc_text)>3)
764 {
765 my $fourchars=substr($enc_text,0,4,"");
766 my @chars=(split '',$fourchars);
767 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4);
768 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
769 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]});
770 }
771# if there are any input chars left, there are either
772# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
773 my @chars=(split '',$enc_text);
774 if (length($enc_text)) {
775 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
776 }
777 if (length($enc_text)==3) {
778 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
779 }
780 return $decoded;
781}
782
783sub convert2unicode {
784 my $self = shift(@_);
785 my ($charset, $textref) = @_;
786
787 # first get our character encoding name in the right form.
788 $charset = "iso_8859_1" unless defined $charset;
789 $charset=~tr/A-Z/a-z/;
790 $charset=~s/\-/_/g;
791 $charset=~s/gb2312/gb/;
792 # assumes EUC-KR, not ISO-2022 !?
793 $charset=~s/ks_c_5601_1987/korean/;
794
795 # It appears that we can't always trust ascii text so we'll treat it
796 # as iso-8859-1 (letting characters above 0x80 through without
797 # converting them to utf-8 will result in invalid XML documents
798 # which can't be parsed at build time).
799 $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
800
801 if ($charset eq "iso_8859_1") {
802 # test if the mailer lied, and it has win1252 chars in it...
803 # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
804 if ($$textref =~ m/[\x80-\x9f]/) {
805 my $outhandle = $self->{'outhandle'};
806 print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
807 print $outhandle "codepage 1252 detected.\n";
808 $charset = "windows_1252";
809 }
810 }
811 $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
812}
813
814
815# Perl packages have to return true if they are run.
8161;
Note: See TracBrowser for help on using the repository browser.