Ignore:
Timestamp:
2001-07-09T19:09:06+12:00 (23 years ago)
Author:
jrm21
Message:

Mime support for multipart messages. Doesn't extract attachments yet...
Also made sure we don't use textcat to guess language - it runs over the
whole file, and grows really big really quickly.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r2493 r2630  
    4444#   $To           To: header
    4545#   $From         From: header - this will be stored as Creator
     46#   $FromName     Name of sender (where available)
     47#   $FromAddr     E-mail address of sender
    4648#   $DateText     Date: header
    4749#   $Date         Date: header in GSDL format (eg: 19990924)
    48 
     50#
     51#
     52# John McPherson - June/July 2001
     53# added (basic) MIME support and quoted-printable and base64 decodings.
     54# Minor fixes for names that are actually email addresses (ie <...> was lost)
     55#
     56# See:  * RFC 822  - ARPA Internet Text Messages
     57#       * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
     58#       * RFC 2046 - MIME (part 2)  Media Types (and multipart messages)
     59#       * RFC 1806 - Content Dispositions (ie inline/attachment)
    4960package EMAILPlug;
    5061
     
    6879    my ($class) = @_;
    6980    my $self = new BasPlug ("EMAILPlug", @_);
    70 
     81    # make sure we don't run textcat (defaults to "auto");
     82    $self->{'input_encoding'}="ascii";
    7183    return bless $self, $class;
    7284}
     
    7688    # mbx/email for mailbox file format, \d+ for maildir (each message is
    7789    # in a separate file, with a unique number for filename)
    78     return q@[\\/]\d+|\.(mbx|email)$@;
     90    return q@([\\/]\d+|\.(mbx|email))$@;
    7991}
    8092
     
    8799# do plugin specific processing of doc_obj
    88100sub process {
     101
    89102    my $self = shift (@_);
    90103    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     
    92105
    93106    # Check that we're dealing with a valid mail file
    94     return undef unless (($$textref =~ /^From:/m) || ($$textref =~ /^To:/m));
     107    return undef unless (($$textref =~ /From:/m) || ($$textref =~ /To:/m));
    95108
    96109    # slightly more strict validity check, to prevent us from matching
     
    110123    # Separate header from body of message
    111124    my $Headers = $$textref;
    112     #$Headers =~ s/\n\n.*//s;  # This line changed at Marcio's request
    113     $Headers =~ s/\x0a\x0d?\x0a.*//s;
    114     $$textref = substr $$textref, (length $Headers);
    115 
    116 
     125    $Headers =~ s/\r?\n\r?\n(.*)$//s;
     126    $$textref = $1;
     127
     128    # Unfold headers - see rfc822
     129    $Headers =~ s/\r?\n[\t\ ]+/ /gs;
    117130    # Extract basic metadata from header
    118131    my @headers = ("From", "To", "Subject", "Date");
     
    134147    @parts = split(/:/, $line);
    135148    $name = shift @parts;
     149# uppercase the first character according to the current locale
     150    $name=~s/(.+)/\u$1/;
    136151    next unless $name;
    137152    next unless ($raw{$name});
     
    145160    $raw{$name} = $value;
    146161    }
     162
     163    # Extract the name and e-mail address from the From metadata
     164    $frommeta = $raw{"From"};
     165    $frommeta =~ m/(.*)<(.*)>/;
     166    my $fromnamemeta=$1;
     167    my $fromaddrmeta=$2;
     168    if (!defined($fromaddrmeta)) {
     169    $fromaddrmeta=$frommeta;
     170    }
     171    $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
     172
     173    if (defined($fromnameneta)) {
     174    $fromnamemeta =~ s/\"//g;
     175    $fromnamemeta =~ s/(.*) /$1/;  # Remove trailing space
     176    }
     177    else {
     178    $fromnamemeta = $fromaddrmeta;
     179    }
     180    # if name is an address
     181    $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
     182    $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
     183
     184    # Escape < and > in the whole From field;
     185    $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
     186    $raw{"From"}=$frommeta;
    147187
    148188    # Process Date information
     
    174214    }
    175215
     216    my $mimetype="text/plain";
     217    my $mimeinfo="";
     218    # Do MIME and encoding stuff
     219    if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*([^\s]+)\s*$/mi)
     220    {
     221        $mimetype=$1;
     222        $mimetype =~ tr/[A-Z]/[a-z]/;
     223        $mimeinfo=$2;
     224    }
     225
     226    if ($mimetype ne "text/plain") {
     227    $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref);
     228    } # end of not text/plain
     229   
     230
    176231    # Add "All headers" metadata
    177232    $Headers = &text_into_html($Headers);
     
    180235
    181236    # Add text to document object
    182     $$textref = &text_into_html($$textref);
     237    if ($mimetype eq "text/plain") {
     238    $$textref = &text_into_html($$textref);
     239    }
    183240    $$textref = "No message" unless ($$textref =~ /\w/);
    184241    $doc_obj->add_utf8_text($cursection, $$textref);
     
    209266    $text =~ s/\"/&quot;/go;
    210267
    211     # convert email addresses and URLs into links
    212     $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    213     $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
     268    # convert email addresses and URIs into links
     269# don't markup email addresses for now
     270#    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
     271
     272    # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
     273    # URI can't finish with a '.'
     274    $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
     275
    214276
    215277    # Clean up whitespace and convert \n charaters to <BR> or <P>
     
    224286
    225287
     288
     289
     290#Process a MIME message.
     291# the textref we are given DOES NOT include the header.
     292sub text_from_mime_message {
     293    my ($mimetype,$mimeinfo,$text)=(@_);
     294
     295    # Check for multiparts - $mimeinfo will be a boundary
     296    if ($mimetype =~ /multipart/) {
     297    $boundary="";
     298    if ($mimeinfo =~ /boundary="?([^\s]+?)"?\s*$/im) {
     299        $boundary=$1;
     300    }
     301    # parts start with "--$boundary"
     302    # message ends with "--$boundary--"
     303    # RFC says boundary is <70 chars, [A-Za-z'()+_,-./:=?], so escape any
     304    # that perl might want to interpolate.
     305
     306    $boundary=~s/\\/\\\\/g;
     307    $boundary=~s/([\?\+\.\(\)\:\/\'])/\\$1/g;
     308    my @message_parts = split("\r?\n\-\-$boundary", $text);
     309    # remove first "part" and last "part" (final --)
     310    shift @message_parts;
     311    my $last=pop @message_parts;
     312    # make sure it is only -- and whitespace
     313    if ($last !~ /^\-\-\s*$/ms) {
     314        print $outhandle "EMAILPlug: (warning) last part of MIME message isn't empty\n";
     315    }
     316    foreach my $message_part (@message_parts) {
     317        # remove the leading newline left from split.
     318        $message_part=~s/^\r?\n//;
     319    }
     320    if ($mimetype eq "multipart/alternative") {
     321        # check for an HTML version first, then TEXT, otherwise use first.
     322        my $part_text="";
     323        foreach my $message_part (@message_parts) {
     324        if ($message_part =~ m@\s*content\-type:\s*text/html@mis)
     325        {
     326            # Use the HTML version
     327            $part_text=text_from_part($message_part);
     328            $mimetype="text/html";
     329            last;
     330        }
     331        }
     332        if ($part_text eq "") { # try getting a text part instead
     333        foreach my $message_part (@message_parts) {
     334            if ($message_part =~ m@^content\-type:\s*text/plain@mis)
     335            {
     336            # Use the plain version
     337            $part_text=text_from_part($message_part);
     338            $mimetype="text/plain";
     339            last;
     340            }
     341        }
     342        }
     343        if ($part_text eq "") { # use first part
     344        $part_text=text_from_part(shift @message_parts);
     345        }
     346        if ($part_text eq "") { # we couldn't get anything!!!
     347        # or it was an empty message...
     348        # do nothing...
     349        print $outhandle "EMAILPlug: no text - empty body?\n";
     350        } else {
     351        $text=$part_text;
     352        }
     353    } elsif ($mimetype eq "multipart/mixed" ||
     354         $mimetype eq "multipart/digest") {
     355        $text="";
     356        foreach my $message_part (@message_parts) {
     357        my $part_header=$message_part;
     358        $part_header=~s/\r?\n\r?\n(.*)$//sg;
     359        my $part_body=$1;
     360        $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold
     361        my $part_content_type="";
     362        my $part_content_info="";
     363        if ($mimetype eq "multipart/digest") {
     364            # default type - RTFRFC!!
     365            $part_content_type="message/rfc822";
     366        }
     367        if ($part_header =~ m@^content\-type:\s*([\w+/\-]+)\s*\;?\s*([^\s]+)@mi) {
     368            $part_content_type=$1; $part_content_type =~ tr/A-Z/a-z/;
     369            $part_content_info=$2;
     370        }
     371        my $filename="";
     372        if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
     373            $filename=$1;
     374        }
     375
     376        # disposition - either inline or attachment.
     377        # NOT CURRENTLY USED - we display all text types instead...
     378        # $part_header =~ /^content\-disposition:\s*([\w+])/mis;
     379
     380        # add <<attachment>> to each part except the first...
     381        if ($text ne "") {
     382            $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
     383            # add part info header
     384            $text.="<br>Type: $part_content_type<br>\n";
     385            if ($filename ne "") {
     386            $text.="Filename: $filename\n";
     387            }
     388            $text.="</strong></p>\n";
     389        }
     390
     391        if ($part_content_type =~ m@text/@)
     392        {
     393            my $part_text=text_from_part($message_part);
     394            if ($part_content_type !~ m@text/(ht|x)ml@) {
     395            $part_text=text_into_html($part_text);
     396            }
     397            if ($part_text eq "") {
     398            $part_text='&lt;&lt;empty message&gt;&gt;';
     399            }
     400            $text.=$part_text;
     401        } elsif ($part_content_type =~ m@message/rfc822@) {
     402            # This is a forwarded message
     403            my $message_part_headers=$part_body;
     404            $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold
     405            $message_part_headers=~s/\r?\n\r?\n(.*)$//s;
     406            my $message_part_body=$1;
     407
     408            my $rfc822_formatted_body=""; # put result in here
     409            if ($message_part_headers =~
     410            /^content\-type:\s*([\w\/\-]+)\s*\;?\s*?([^\s]+)?\s*$/ims)
     411            {
     412            # The message header uses MIME flags
     413            my $message_content_type=$1;
     414            my $message_content_info=$2;
     415            if (!defined($message_content_info)) {
     416                $message_content_info="";
     417            }
     418            $message_content_type =~ tr/A-Z/a-z/;
     419            if ($message_content_type =~ /multipart/) {
     420                $rfc822_formatted_body=
     421                text_from_mime_message($message_content_type,
     422                               $message_content_info,
     423                               $message_part_body);
     424            } else {
     425                $message_part_body=text_from_part($part_body);
     426                $rfc822_formatted_body=text_into_html($message_part_body);
     427            }
     428            } else {
     429            # message doesn't use MIME flags
     430            $rfc822_formatted_body=text_into_html($message_part_body);
     431            }
     432            # Add the returned text to the output
     433            # don't put all the headers...
     434            $message_part_headers =~ s/^(X\-.*|received|message\-id|return\-path):.*\n//img;
     435            $text.=text_into_html($message_part_headers);
     436            $text.="<p>\n";
     437            $text.=$rfc822_formatted_body;
     438            # end of message/rfc822
     439        } elsif ($part_content_type =~ /multipart/) {
     440            # recurse again
     441            $tmptext=text_from_mime_message($part_content_type,
     442                            $part_content_info,
     443                            $part_body);
     444            $text.=$tmptext;
     445        } elsif ($text eq "") {
     446            # we can't do anything with this part, but if it's the first
     447            # part then make sure it is mentioned..
     448           
     449            $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
     450            # add part info header
     451            $text.="<br>Type: $part_content_type<br>\n";
     452            if ($filename ne "") {
     453            $text.="Filename: $filename\n";
     454            }
     455            $text.="</strong></p>\n";
     456        }
     457        } # foreach message part.
     458    } else {
     459        # we can't handle this multipart type (not mixed or alternative)
     460        # the RFC also mentions "parallel".
     461    }
     462    } # end of multipart
     463    return $text;
     464}
     465
     466
     467
     468
     469
     470
     471# Process a MIME part. Return "" if we can't decode it.
     472sub text_from_part {
     473    my $text=shift;
     474    my $part_header=$text;
     475    $part_header =~ s/\r?\n\r?\n(.*)$//s;
     476    $text=$1;
     477    $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
     478    $part_header =~ /content\-type:\s*([\w\/]+)/is;
     479    my $type=$1;
     480    my $encoding="";
     481    if ($part_header =~ /^content\-transfer\-encoding:\s*([^s]+)/mis) {
     482    $encoding=$1; $encoding=~tr/A-Z/a-z/;
     483    }
     484    # Content-Transfer-Encoding is per-part
     485    if ($encoding ne "") {
     486    if ($encoding =~ /quoted\-printable/) {
     487        $text=qp_decode($text);
     488    } elsif ($encoding =~ /base64/) {
     489        $text=base64_decode($text);
     490    } elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
     491        # rfc2045 also allows binary, which we ignore (for now).
     492        # maybe this shouldn't go to stderr, but anyway...
     493        print STDERR "EMAILPlug: unknown encoding: $encoding\n";
     494        return "";
     495    }
     496    }
     497
     498    if ($type eq "text/html") {
     499    # only get stuff between <body> tags, or <html> tags.
     500    $text =~ s/^.*?<(html|HTML)[^>]*>//s;
     501    $text =~ s/<\/(html|HTML)>.*$//s;
     502
     503    $text =~ s/^.*?<(body|BODY)[^>]*>//s;
     504    $text =~ s/<\/(body|BODY)>.*$//s;
     505    }
     506    elsif ($type eq "text/xml") {
     507    $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
     508    $text="<pre>\n$text\n</pre>\n";
     509    }
     510    return $text;
     511}
     512
     513
     514# decode quoted-printable text
     515sub qp_decode {
     516    my $text=shift;
     517
     518    my @lines=split('\n', $text);
     519
     520    # if a line ends with "=\s*", it is a soft line break, otherwise
     521    # keep in any newline characters.
     522    foreach my $line (@lines) {
     523    if ($line =~ s/=\s*$//) {}
     524    else {$line.="\n";}
     525
     526    if ($line =~ /=[0-9A-Fa-f]{2}/) { # it contains an escaped char
     527        my @hexcode_segments=split('=',$line);
     528        shift @hexcode_segments;
     529        my @hexcodes;
     530        foreach my $hexcode (@hexcode_segments) {
     531        $hexcode =~ s/^(..).*$/$1/;  # only need first 2 chars
     532        chomp($hexcode); # just in case...
     533        my $char=chr (hex "0x$hexcode");
     534        $line =~ s/=$hexcode/$char/g;
     535        }
     536    }
     537    }
     538    $text= join('', @lines);
     539    return $text;
     540}
     541
     542# decode base64 text. This is fairly slow (since it's interpreted perl rather
     543# than compiled XS stuff like in the ::MIME modules, but this is more portable
     544# for us at least).
     545# see rfc2045 for description, but basically, bits 7 and 8 are set to zero;
     546# 4 bytes of encoded text become 3 bytes of binary - remove 2 highest bits
     547# from each byte.
     548
     549
     550sub base64_decode {
     551    my $enc_text = shift;
     552# A=>0, B=>1, ..., '+'=>62, '/'=>63
     553# also '=' is used for padding at the end, but we remove it anyway.
     554    my $mimechars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
     555# map each MIME char into it's value, for more efficient lookup.
     556    my %index;
     557    map { $index{$_} = index ($mimechars, $_) } (split ('', $mimechars));
     558# remove all non-base64 chars. eval to get variable in transliteration...
     559# also remove '=' - we'll assume (!!) that there are no errors in the encoding
     560    eval "\$enc_text =~ tr|$mimechars||cd";
     561    my $decoded="";
     562    while (length ($enc_text)>3)
     563    {
     564    my $fourchars=substr($enc_text,0,4,"");
     565    my @chars=(split '',$fourchars);
     566    $decoded.=chr( $index{$chars[0]}        << 2 | $index{$chars[1]} >> 4);
     567    $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
     568    $decoded.=chr( ($index{$chars[2]} & 3 ) << 6 |  $index{$chars[3]});
     569    }
     570# if there are any input chars left, there are either
     571# 2 encoded bytes (-> 1 raw byte) left or 3 encoded (-> 2 raw) bytes left.
     572    my @chars=(split '',$enc_text);
     573    if (length($enc_text)) {
     574    $decoded.=chr($index{$chars[0]} << 2 | (int $index{$chars[1]} >> 4));
     575    }
     576    if (length($enc_text)==3) {
     577    $decoded.=chr( ($index{$chars[1]} & 15) << 4 | $index{$chars[2]} >> 2);
     578    }
     579    return $decoded;
     580}
     581
     582
    226583# Perl packages have to return true if they are run.
    2275841;
Note: See TracChangeset for help on using the changeset viewer.