Changeset 2630 for trunk/gsdl/perllib/plugins/EMAILPlug.pm
- Timestamp:
- 2001-07-09T19:09:06+12:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r2493 r2630 44 44 # $To To: header 45 45 # $From From: header - this will be stored as Creator 46 # $FromName Name of sender (where available) 47 # $FromAddr E-mail address of sender 46 48 # $DateText Date: header 47 49 # $Date Date: header in GSDL format (eg: 19990924) 48 50 # 51 # 52 # John McPherson - June/July 2001 53 # added (basic) MIME support and quoted-printable and base64 decodings. 54 # Minor fixes for names that are actually email addresses (ie <...> was lost) 55 # 56 # See: * RFC 822 - ARPA Internet Text Messages 57 # * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1 58 # * RFC 2046 - MIME (part 2) Media Types (and multipart messages) 59 # * RFC 1806 - Content Dispositions (ie inline/attachment) 49 60 package EMAILPlug; 50 61 … … 68 79 my ($class) = @_; 69 80 my $self = new BasPlug ("EMAILPlug", @_); 70 81 # make sure we don't run textcat (defaults to "auto"); 82 $self->{'input_encoding'}="ascii"; 71 83 return bless $self, $class; 72 84 } … … 76 88 # mbx/email for mailbox file format, \d+ for maildir (each message is 77 89 # in a separate file, with a unique number for filename) 78 return q@ [\\/]\d+|\.(mbx|email)$@;90 return q@([\\/]\d+|\.(mbx|email))$@; 79 91 } 80 92 … … 87 99 # do plugin specific processing of doc_obj 88 100 sub process { 101 89 102 my $self = shift (@_); 90 103 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; … … 92 105 93 106 # Check that we're dealing with a valid mail file 94 return undef unless (($$textref =~ / ^From:/m) || ($$textref =~ /^To:/m));107 return undef unless (($$textref =~ /From:/m) || ($$textref =~ /To:/m)); 95 108 96 109 # slightly more strict validity check, to prevent us from matching … … 110 123 # Separate header from body of message 111 124 my $Headers = $$textref; 112 #$Headers =~ s/\n\n.*//s; # This line changed at Marcio's request113 $ Headers =~ s/\x0a\x0d?\x0a.*//s;114 $$textref = substr $$textref, (length $Headers); 115 116 125 $Headers =~ s/\r?\n\r?\n(.*)$//s; 126 $$textref = $1; 127 128 # Unfold headers - see rfc822 129 $Headers =~ s/\r?\n[\t\ ]+/ /gs; 117 130 # Extract basic metadata from header 118 131 my @headers = ("From", "To", "Subject", "Date"); … … 134 147 @parts = split(/:/, $line); 135 148 $name = shift @parts; 149 # uppercase the first character according to the current locale 150 $name=~s/(.+)/\u$1/; 136 151 next unless $name; 137 152 next unless ($raw{$name}); … … 145 160 $raw{$name} = $value; 146 161 } 162 163 # Extract the name and e-mail address from the From metadata 164 $frommeta = $raw{"From"}; 165 $frommeta =~ m/(.*)<(.*)>/; 166 my $fromnamemeta=$1; 167 my $fromaddrmeta=$2; 168 if (!defined($fromaddrmeta)) { 169 $fromaddrmeta=$frommeta; 170 } 171 $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta); 172 173 if (defined($fromnameneta)) { 174 $fromnamemeta =~ s/\"//g; 175 $fromnamemeta =~ s/(.*) /$1/; # Remove trailing space 176 } 177 else { 178 $fromnamemeta = $fromaddrmeta; 179 } 180 # if name is an address 181 $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g; 182 $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta); 183 184 # Escape < and > in the whole From field; 185 $frommeta =~ s/</</g; $frommeta =~ s/>/>/g; 186 $raw{"From"}=$frommeta; 147 187 148 188 # Process Date information … … 174 214 } 175 215 216 my $mimetype="text/plain"; 217 my $mimeinfo=""; 218 # Do MIME and encoding stuff 219 if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*([^\s]+)\s*$/mi) 220 { 221 $mimetype=$1; 222 $mimetype =~ tr/[A-Z]/[a-z]/; 223 $mimeinfo=$2; 224 } 225 226 if ($mimetype ne "text/plain") { 227 $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref); 228 } # end of not text/plain 229 230 176 231 # Add "All headers" metadata 177 232 $Headers = &text_into_html($Headers); … … 180 235 181 236 # Add text to document object 182 $$textref = &text_into_html($$textref); 237 if ($mimetype eq "text/plain") { 238 $$textref = &text_into_html($$textref); 239 } 183 240 $$textref = "No message" unless ($$textref =~ /\w/); 184 241 $doc_obj->add_utf8_text($cursection, $$textref); … … 209 266 $text =~ s/\"/"/go; 210 267 211 # convert email addresses and URLs into links 212 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 213 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g; 268 # convert email addresses and URIs into links 269 # don't markup email addresses for now 270 # $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 271 272 # assume hostnames are \.\w\d\- only, then might have a trailing '/.*' 273 # URI can't finish with a '.' 274 $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g; 275 214 276 215 277 # Clean up whitespace and convert \n charaters to <BR> or <P> … … 224 286 225 287 288 289 290 #Process a MIME message. 291 # the textref we are given DOES NOT include the header. 292 sub text_from_mime_message { 293 my ($mimetype,$mimeinfo,$text)=(@_); 294 295 # Check for multiparts - $mimeinfo will be a boundary 296 if ($mimetype =~ /multipart/) { 297 $boundary=""; 298 if ($mimeinfo =~ /boundary="?([^\s]+?)"?\s*$/im) { 299 $boundary=$1; 300 } 301 # parts start with "--$boundary" 302 # message ends with "--$boundary--" 303 # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any 304 # that perl might want to interpolate. 305 306 $boundary=~s/\\/\\\\/g; 307 $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g; 308 my @message_parts = split("\r?\n\-\-$boundary", $text); 309 # remove first "part" and last "part" (final --) 310 shift @message_parts; 311 my $last=pop @message_parts; 312 # make sure it is only -- and whitespace 313 if ($last !~ /^\-\-\s*$/ms) { 314 print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n"; 315 } 316 foreach my $message_part (@message_parts) { 317 # remove the leading newline left from split. 318 $message_part=~s/^\r?\n//; 319 } 320 if ($mimetype eq "multipart/alternative") { 321 # check for an HTML version first, then TEXT, otherwise use first. 322 my $part_text=""; 323 foreach my $message_part (@message_parts) { 324 if ($message_part =~ m@\s*content\-type:\s*text/html@mis) 325 { 326 # Use the HTML version 327 $part_text=text_from_part($message_part); 328 $mimetype="text/html"; 329 last; 330 } 331 } 332 if ($part_text eq "") { # try getting a text part instead 333 foreach my $message_part (@message_parts) { 334 if ($message_part =~ m@^content\-type:\s*text/plain@mis) 335 { 336 # Use the plain version 337 $part_text=text_from_part($message_part); 338 $mimetype="text/plain"; 339 last; 340 } 341 } 342 } 343 if ($part_text eq "") { # use first part 344 $part_text=text_from_part(shift @message_parts); 345 } 346 if ($part_text eq "") { # we couldn't get anything!!! 347 # or it was an empty message... 348 # do nothing... 349 print $outhandle "EMAILPlug: no text - empty body?\n"; 350 } else { 351 $text=$part_text; 352 } 353 } elsif ($mimetype eq "multipart/mixed" || 354 $mimetype eq "multipart/digest") { 355 $text=""; 356 foreach my $message_part (@message_parts) { 357 my $part_header=$message_part; 358 $part_header=~s/\r?\n\r?\n(.*)$//sg; 359 my $part_body=$1; 360 $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold 361 my $part_content_type=""; 362 my $part_content_info=""; 363 if ($mimetype eq "multipart/digest") { 364 # default type - RTFRFC!! 365 $part_content_type="message/rfc822"; 366 } 367 if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*([^\s]+)@mi) { 368 $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/; 369 $part_content_info=$2; 370 } 371 my $filename=""; 372 if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) { 373 $filename=$1; 374 } 375 376 # disposition - either inline or attachment. 377 # NOT CURRENTLY USED - we display all text types instead... 378 # $part_header =~ /^content\-disposition:\s*([\w+])/mis; 379 380 # add <<attachment>> to each part except the first... 381 if ($text ne "") { 382 $text.="<p><hr><strong><<attachment>></>"; 383 # add part info header 384 $text.="<br>Type: $part_content_type<br>\n"; 385 if ($filename ne "") { 386 $text.="Filename: $filename\n"; 387 } 388 $text.="</strong></p>\n"; 389 } 390 391 if ($part_content_type =~ m@text/@) 392 { 393 my $part_text=text_from_part($message_part); 394 if ($part_content_type !~ m@text/(ht|x)ml@) { 395 $part_text=text_into_html($part_text); 396 } 397 if ($part_text eq "") { 398 $part_text='<<empty message>>'; 399 } 400 $text.=$part_text; 401 } elsif ($part_content_type =~ m@message/rfc822@) { 402 # This is a forwarded message 403 my $message_part_headers=$part_body; 404 $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold 405 $message_part_headers=~s/\r?\n\r?\n(.*)$//s; 406 my $message_part_body=$1; 407 408 my $rfc822_formatted_body=""; # put result in here 409 if ($message_part_headers =~ 410 /^content\-type:\s*([\w\/\-]+)\s*\;?\s*?([^\s]+)?\s*$/ims) 411 { 412 # The message header uses MIME flags 413 my $message_content_type=$1; 414 my $message_content_info=$2; 415 if (!defined($message_content_info)) { 416 $message_content_info=""; 417 } 418 $message_content_type =~ tr/A-Z/a-z/; 419 if ($message_content_type =~ /multipart/) { 420 $rfc822_formatted_body= 421 text_from_mime_message($message_content_type, 422 $message_content_info, 423 $message_part_body); 424 } else { 425 $message_part_body=text_from_part($part_body); 426 $rfc822_formatted_body=text_into_html($message_part_body); 427 } 428 } else { 429 # message doesn't use MIME flags 430 $rfc822_formatted_body=text_into_html($message_part_body); 431 } 432 # Add the returned text to the output 433 # don't put all the headers... 434 $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img; 435 $text.=text_into_html($message_part_headers); 436 $text.="<p>\n"; 437 $text.=$rfc822_formatted_body; 438 # end of message/rfc822 439 } elsif ($part_content_type =~ /multipart/) { 440 # recurse again 441 $tmptext=text_from_mime_message($part_content_type, 442 $part_content_info, 443 $part_body); 444 $text.=$tmptext; 445 } elsif ($text eq "") { 446 # we can't do anything with this part, but if it's the first 447 # part then make sure it is mentioned.. 448 449 $text.="<p><hr><strong><<attachment>></>"; 450 # add part info header 451 $text.="<br>Type: $part_content_type<br>\n"; 452 if ($filename ne "") { 453 $text.="Filename: $filename\n"; 454 } 455 $text.="</strong></p>\n"; 456 } 457 } # foreach message part. 458 } else { 459 # we can't handle this multipart type (not mixed or alternative) 460 # the RFC also mentions "parallel". 461 } 462 } # end of multipart 463 return $text; 464 } 465 466 467 468 469 470 471 # Process a MIME part. Return "" if we can't decode it. 472 sub text_from_part { 473 my $text=shift; 474 my $part_header=$text; 475 $part_header =~ s/\r?\n\r?\n(.*)$//s; 476 $text=$1; 477 $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold 478 $part_header =~ /content\-type:\s*([\w\/]+)/is; 479 my $type=$1; 480 my $encoding=""; 481 if ($part_header =~ /^content\-transfer\-encoding:\s*([^s]+)/mis) { 482 $encoding=$1; $encoding=~tr/A-Z/a-z/; 483 } 484 # Content-Transfer-Encoding is per-part 485 if ($encoding ne "") { 486 if ($encoding =~ /quoted\-printable/) { 487 $text=qp_decode($text); 488 } elsif ($encoding =~ /base64/) { 489 $text=base64_decode($text); 490 } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is. 491 # rfc2045 also allows binary, which we ignore (for now). 492 # maybe this shouldn't go to stderr, but anyway... 493 print STDERR "EMAILPlug: unknown encoding: $encoding\n"; 494 return ""; 495 } 496 } 497 498 if ($type eq "text/html") { 499 # only get stuff between <body> tags, or <html> tags. 500 $text =~ s/^.*?<(html|HTML)[^>]*>//s; 501 $text =~ s/<\/(html|HTML)>.*$//s; 502 503 $text =~ s/^.*?<(body|BODY)[^>]*>//s; 504 $text =~ s/<\/(body|BODY)>.*$//s; 505 } 506 elsif ($type eq "text/xml") { 507 $text=~s/</</g;$text=~s/>/>/g; 508 $text="<pre>\n$text\n</pre>\n"; 509 } 510 return $text; 511 } 512 513 514 # decode quoted-printable text 515 sub qp_decode { 516 my $text=shift; 517 518 my @lines=split('\n', $text); 519 520 # if a line ends with "=\s*", it is a soft line break, otherwise 521 # keep in any newline characters. 522 foreach my $line (@lines) { 523 if ($line =~ s/=\s*$//) {} 524 else {$line.="\n";} 525 526 if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char 527 my @hexcode_segments=split('=',$line); 528 shift @hexcode_segments; 529 my @hexcodes; 530 foreach my $hexcode (@hexcode_segments) { 531 $hexcode =~ s/^(..).*$/$1/; # only need first 2 chars 532 chomp($hexcode); # just in case... 533 my $char=chr (hex "0x$hexcode"); 534 $line =~ s/=$hexcode/$char/g; 535 } 536 } 537 } 538 $text= join('', @lines); 539 return $text; 540 } 541 542 # decode base64 text. This is fairly slow (since it's interpreted perl rather 543 # than compiled XS stuff like in the ::MIME modules, but this is more portable 544 # for us at least). 545 # see rfc2045 for description, but basically, bits 7 and 8 are set to zero; 546 # 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits 547 # from each byte. 548 549 550 sub base64_decode { 551 my $enc_text = shift; 552 # A=>0, B=>1, ..., '+'=>62, '/'=>63 553 # also '=' is used for padding at the end, but we remove it anyway. 554 my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 555 # map each MIME char into it's value, for more efficient lookup. 556 my %index; 557 map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars)); 558 # remove all non-base64 chars. eval to get variable in transliteration... 559 # also remove '=' - we'll assume (!!) that there are no errors in the encoding 560 eval "\$enc_text =~ tr|$mimechars||cd"; 561 my $decoded=""; 562 while (length ($enc_text)>3) 563 { 564 my $fourchars=substr($enc_text,0,4,""); 565 my @chars=(split '',$fourchars); 566 $decoded.=chr( $index{$chars[0]} << 2 | $index{$chars[1]} >> 4); 567 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2); 568 $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 | $index{$chars[3]}); 569 } 570 # if there are any input chars left, there are either 571 # 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left. 572 my @chars=(split '',$enc_text); 573 if (length($enc_text)) { 574 $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4)); 575 } 576 if (length($enc_text)==3) { 577 $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2); 578 } 579 return $decoded; 580 } 581 582 226 583 # Perl packages have to return true if they are run. 227 584 1;
Note:
See TracChangeset
for help on using the changeset viewer.