Context Navigation

← Previous Change
Next Change →

EMAILPlug.pm

Timestamp:

2001-07-09T19:09:06+12:00 (23 years ago)

Author:

jrm21

Message:

Mime support for multipart messages. Doesn't extract attachments yet...
Also made sure we don't use textcat to guess language - it runs over the
whole file, and grows really big really quickly.

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (12 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r2493
+              r2630
 #   $To           To: header
 #   $From         From: header - this will be stored as Creator
+#   $FromName     Name of sender (where available)
+#   $FromAddr     E-mail address of sender
 #   $DateText     Date: header
 #   $Date         Date: header in GSDL format (eg: 19990924)
+#
+#
+# John McPherson - June/July 2001
+# added (basic) MIME support and quoted-printable and base64 decodings.
+# Minor fixes for names that are actually email addresses (ie <...> was lost)
+#
+# See:  * RFC 822  - ARPA Internet Text Messages
+#       * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
+#       * RFC 2046 - MIME (part 2)  Media Types (and multipart messages)
+#       * RFC 1806 - Content Dispositions (ie inline/attachment)
 package EMAILPlug;
 …
     my ($class) = @_;
     my $self = new BasPlug ("EMAILPlug", @_);
+    # make sure we don't run textcat (defaults to "auto");
+    $self->{'input_encoding'}="ascii";
     return bless $self, $class;
+}
 …
     # mbx/email for mailbox file format, \d+ for maildir (each message is
     # in a separate file, with a unique number for filename)
     return q@[\\/]\d+|\.(mbx|email)$@;
+    return q@([\\/]\d+|\.(mbx|email))$@;
+}
 …
 # do plugin specific processing of doc_obj
 sub process {
     my $self = shift (@_);
     my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
 …
     # Check that we're dealing with a valid mail file
     return undef unless (($$textref =~ /^From:/m) || ($$textref =~ /^To:/m));
+    return undef unless (($$textref =~ /From:/m) || ($$textref =~ /To:/m));
     # slightly more strict validity check, to prevent us from matching
 …
     # Separate header from body of message
     my $Headers = $$textref;
     #$Headers =~ s/\n\n.*//s;  # This line changed at Marcio's request
     $Headers =~ s/\x0a\x0d?\x0a.*//s;
+    $$textref = substr $$textref, (length $Headers);
+    $Headers =~ s/\r?\n\r?\n(.*)$//s;
+    $$textref = $1;
+    # Unfold headers - see rfc822
+    $Headers =~ s/\r?\n[\t\ ]+/ /gs;
     # Extract basic metadata from header
     my @headers = ("From", "To", "Subject", "Date");
 …
     @parts = split(/:/, $line);
     $name = shift @parts;
+# uppercase the first character according to the current locale
+    $name=~s/(.+)/\u$1/;
     next unless $name;
     next unless ($raw{$name});
 …
     $raw{$name} = $value;
+    }
+    # Extract the name and e-mail address from the From metadata
+    $frommeta = $raw{"From"};
+    $frommeta =~ m/(.*)<(.*)>/;
+    my $fromnamemeta=$1;
+    my $fromaddrmeta=$2;
+    if (!defined($fromaddrmeta)) {
+    $fromaddrmeta=$frommeta;
+    }
+    $doc_obj->add_utf8_metadata ($cursection, "FromAddr", $fromaddrmeta);
+    if (defined($fromnameneta)) {
+    $fromnamemeta =~ s/\"//g;
+    $fromnamemeta =~ s/(.*) /$1/;  # Remove trailing space
+    }
+    else {
+    $fromnamemeta = $fromaddrmeta;
+    }
+    # if name is an address
+    $fromnamemeta =~ s/<//g; $fromnamemeta =~ s/>//g;
+    $doc_obj->add_utf8_metadata ($cursection, "FromName", $fromnamemeta);
+    # Escape < and > in the whole From field;
+    $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
+    $raw{"From"}=$frommeta;
     # Process Date information
 …
+    }
+    my $mimetype="text/plain";
+    my $mimeinfo="";
+    # Do MIME and encoding stuff
+    if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*([^\s]+)\s*$/mi)
+    {
+        $mimetype=$1;
+        $mimetype =~ tr/[A-Z]/[a-z]/;
+        $mimeinfo=$2;
+    }
+    if ($mimetype ne "text/plain") {
+    $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref);
+    } # end of not text/plain
     # Add "All headers" metadata
     $Headers = &text_into_html($Headers);
 …
     # Add text to document object
+    $$textref = &text_into_html($$textref);
+    if ($mimetype eq "text/plain") {
+    $$textref = &text_into_html($$textref);
+    }
     $$textref = "No message" unless ($$textref =~ /\w/);
     $doc_obj->add_utf8_text($cursection, $$textref);
 …
     $text =~ s/\"/&quot;/go;
+    # convert email addresses and URLs into links
+    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
+    $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/<a href=\"$1\">$1<\/a>/g;
+    # convert email addresses and URIs into links
+# don't markup email addresses for now
+#    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
+    # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
+    # URI can't finish with a '.'
+    $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
     # Clean up whitespace and convert \n characters to <BR> or <P>
 …
# Process the body of a MIME message and return it as displayable text.
#
# Arguments:
#   $mimetype - lower-cased content type of the message (eg "multipart/mixed")
#   $mimeinfo - the parameter text that followed the content type; for
#               multipart messages this carries the boundary
#   $text     - the message body. This DOES NOT include the message header.
#
# Returns the text to display. Non-multipart bodies are returned untouched.
# For multipart bodies the result is HTML:
#   * multipart/alternative - the HTML part if there is one, otherwise the
#     plain text part, otherwise the first part;
#   * every other multipart subtype is handled like multipart/mixed (RFC 2046
#     requires unrecognised subtypes to be treated as "mixed"): text parts are
#     shown, forwarded messages (message/rfc822) and nested multiparts are
#     processed recursively, and anything else is only mentioned as an
#     attachment.
# Attachments themselves are not extracted.
sub text_from_mime_message {
    my ($mimetype, $mimeinfo, $text) = @_;
    $mimeinfo = "" unless defined $mimeinfo;

    # Anything that is not multipart is handed back untouched.
    return $text unless (defined($mimetype) && $mimetype =~ /multipart/);
    $text = "" unless defined $text;

    # Warnings go to $outhandle when the surrounding code has set one up,
    # and to STDERR otherwise (printing to an undefined handle is fatal).
    my $warn = sub {
        my $handle = do { no strict 'vars'; $outhandle };
        $handle = \*STDERR unless defined $handle;
        print {$handle} @_;
    };

    # Matches an (unfolded) Content-Type header line. $1 is the type, $2 is
    # whatever parameters follow the ';' - undefined when there are none.
    # The parameters are optional so that a bare "Content-Type: text/html"
    # is not truncated by backtracking.
    my $content_type_re =
        qr@^content\-type:\s*([\w+/\-]+)[ \t]*(?:;[ \t]*([^\r\n]*))?@mi;

    # Find the boundary. It may be quoted, and need not be the only parameter.
    my $boundary = "";
    if ($mimeinfo =~ /boundary\s*=\s*"([^"]+)"/i) {
        $boundary = $1;
    }
    elsif ($mimeinfo =~ /boundary\s*=\s*"?([^\s;"]+)/i) {
        $boundary = $1;
    }

    # Parts start with "--$boundary" and the message ends with
    # "--$boundary--". \Q..\E stops perl treating any boundary character as
    # a regex metacharacter. The first delimiter is also accepted at the very
    # start of the body, because the blank line that separated it from the
    # headers has already been removed by the caller.
    my @message_parts = split(/(?:\A|\r?\n)--\Q$boundary\E/, $text);

    # remove first "part" (the preamble) and last "part" (final --)
    shift @message_parts;
    my $last = pop @message_parts;
    # make sure it is only -- and whitespace
    if (defined($last) && $last !~ /^\-\-\s*$/ms) {
        $warn->("EMAILPlug: (warning) last part of MIME message isn't empty\n");
    }

    foreach my $message_part (@message_parts) {
        # remove the rest of the boundary line left over from the split.
        $message_part =~ s/^[ \t]*\r?\n//;
    }

    if ($mimetype eq "multipart/alternative") {
        # check for an HTML version first, then TEXT, otherwise use first.
        my $part_text = "";
        foreach my $message_part (@message_parts) {
            if ($message_part =~ m@\s*content\-type:\s*text/html@mis) {
                # Use the HTML version
                $part_text = text_from_part($message_part);
                $part_text = "" unless defined $part_text;
                $mimetype = "text/html";
                last;
            }
        }
        if ($part_text eq "") { # try getting a text part instead
            foreach my $message_part (@message_parts) {
                if ($message_part =~ m@^content\-type:\s*text/plain@mis) {
                    # Use the plain version. The caller treats our result
                    # as HTML, so it has to be escaped here.
                    $part_text = text_from_part($message_part);
                    $part_text = "" unless defined $part_text;
                    $part_text = text_into_html($part_text)
                        if ($part_text ne "");
                    $mimetype = "text/plain";
                    last;
                }
            }
        }
        if ($part_text eq "" && @message_parts) { # use first part
            $part_text = text_from_part($message_parts[0]);
            $part_text = "" unless defined $part_text;
            $part_text = text_into_html($part_text) if ($part_text ne "");
        }
        if ($part_text eq "") { # we couldn't get anything!!!
            # or it was an empty message...
            # do nothing - the raw text is returned.
            $warn->("EMAILPlug: no text - empty body?\n");
        }
        else {
            $text = $part_text;
        }
        return $text;
    }

    # multipart/mixed, multipart/digest and every subtype we do not know.
    $text = "";
    foreach my $message_part (@message_parts) {
        # Separate the part's header from its body. A part that starts with
        # an empty line has no headers at all.
        my $part_header = $message_part;
        my $part_body = "";
        if ($message_part =~ /\A\r?\n(.*)\z/s) {
            $part_header = "";
            $part_body = $1;
        }
        elsif ($part_header =~ s/\r?\n\r?\n(.*)$//s) {
            $part_body = $1;
        }
        $part_header =~ s/\r?\n[\t\ ]+/ /gs; #unfold

        # Default types (RFC 2046): message/rfc822 inside a digest,
        # text/plain everywhere else.
        my $part_content_type =
            ($mimetype eq "multipart/digest") ? "message/rfc822" : "text/plain";
        my $part_content_info = "";
        if ($part_header =~ $content_type_re) {
            $part_content_type = lc($1);
            $part_content_info = defined($2) ? $2 : "";
        }

        my $filename = "";
        if ($part_header =~ m@name=\"?([\w\.\-\\/]+)\"?@mis) {
            $filename = $1;
        }

        # disposition - either inline or attachment.
        # NOT CURRENTLY USED - we display all text types instead...

        # Banner describing this part.
        my $banner = "<p><hr><strong>&lt;&lt;attachment&gt;&gt;";
        $banner .= "<br>Type: $part_content_type<br>\n";
        if ($filename ne "") {
            $banner .= "Filename: $filename\n";
        }
        $banner .= "</strong></p>\n";

        # add <<attachment>> to each part except the first...
        my $is_first_output = ($text eq "");
        $text .= $banner unless $is_first_output;

        if ($part_content_type =~ m@text/@) {
            my $part_text = text_from_part($message_part);
            $part_text = "" unless defined $part_text;
            if ($part_content_type !~ m@text/(ht|x)ml@) {
                $part_text = text_into_html($part_text);
            }
            if ($part_text eq "") {
                $part_text = '&lt;&lt;empty message&gt;&gt;';
            }
            $text .= $part_text;
        }
        elsif ($part_content_type =~ m@message/rfc822@) {
            # This is a forwarded message: the part body is itself a
            # complete message. Split it first and unfold only its headers,
            # so that indented lines in its body are left alone.
            my $message_part_headers = $part_body;
            my $message_part_body = "";
            if ($part_body =~ /\A\r?\n(.*)\z/s) {
                $message_part_headers = "";
                $message_part_body = $1;
            }
            elsif ($message_part_headers =~ s/\r?\n\r?\n(.*)$//s) {
                $message_part_body = $1;
            }
            $message_part_headers =~ s/\r?\n[\t\ ]+/ /gs; #unfold

            my $rfc822_formatted_body = ""; # put result in here
            if ($message_part_headers =~ $content_type_re) {
                # The message header uses MIME flags
                my $message_content_type = lc($1);
                my $message_content_info = defined($2) ? $2 : "";
                if ($message_content_type =~ /multipart/) {
                    $rfc822_formatted_body =
                        text_from_mime_message($message_content_type,
                                               $message_content_info,
                                               $message_part_body);
                }
                else {
                    $message_part_body = text_from_part($part_body);
                    $message_part_body = ""
                        unless defined $message_part_body;
                    # HTML and XML come back from text_from_part ready for
                    # display; everything else still needs escaping.
                    if ($message_content_type =~ m@text/(ht|x)ml@) {
                        $rfc822_formatted_body = $message_part_body;
                    }
                    else {
                        $rfc822_formatted_body =
                            text_into_html($message_part_body);
                    }
                }
            }
            else {
                # message doesn't use MIME flags
                $rfc822_formatted_body = text_into_html($message_part_body);
            }

            # Add the returned text to the output
            # don't put all the headers...
            $message_part_headers =~
                s/^(?:X\-.*|received|message\-id|return\-path):.*\n//img;
            $text .= text_into_html($message_part_headers);
            $text .= "<p>\n";
            $text .= $rfc822_formatted_body;
            # end of message/rfc822
        }
        elsif ($part_content_type =~ /multipart/) {
            # recurse again
            my $nested_text = text_from_mime_message($part_content_type,
                                                     $part_content_info,
                                                     $part_body);
            $text .= $nested_text if defined $nested_text;
        }
        elsif ($is_first_output) {
            # we can't do anything with this part, but if it's the first
            # part then make sure it is mentioned..
            $text .= $banner;
        }
    } # foreach message part.

    return $text;
}
# Process a single MIME part (its headers, an empty line, then its body)
# and return the decoded body. Returns "" if we can't decode it.
#
# The part's Content-Transfer-Encoding is undone (quoted-printable and
# base64 are decoded; 7bit and 8bit are left as they are; anything else is
# reported on STDERR and "" is returned).
# For text/html only the contents of the <html>/<body> elements are kept.
# For text/xml the markup is escaped and wrapped in <pre> so that it is
# displayed literally. All other types are returned as decoded.
sub text_from_part {
    my $text = shift;
    return "" unless defined $text;

    # Separate header from body. A part starting with an empty line has no
    # headers; a part with no empty line at all has no body.
    my $part_header = "";
    if ($text =~ /\A\r?\n(.*)\z/s) {
        $text = $1;
    }
    elsif ($text =~ /\A(.*?)\r?\n\r?\n(.*)\z/s) {
        ($part_header, $text) = ($1, $2);
    }
    else {
        $part_header = $text;
        $text = "";
    }
    $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold

    # RFC 2045: a part without a Content-Type header is text/plain.
    # Header values are case-insensitive, so compare in lower case.
    my $type = "text/plain";
    if ($part_header =~ /^content\-type:\s*([\w\/+.\-]+)/mi) {
        $type = lc($1);
    }

    # Content-Transfer-Encoding is per-part
    my $encoding = "";
    if ($part_header =~ /^content\-transfer\-encoding:\s*(\S+)/mi) {
        $encoding = lc($1);
    }

    if ($encoding ne "") {
        if ($encoding =~ /quoted\-printable/) {
            $text = qp_decode($text);
        }
        elsif ($encoding =~ /base64/) {
            $text = base64_decode($text);
        }
        elsif ($encoding !~ /[78]bit/) { # leave 7/8 bit as is.
            # rfc2045 also allows binary, which we ignore (for now).
            # maybe this shouldn't go to stderr, but anyway...
            print STDERR "EMAILPlug: unknown encoding: $encoding\n";
            return "";
        }
    }

    if ($type eq "text/html") {
        # only get stuff between <body> tags, or <html> tags.
        $text =~ s/^.*?<html[^>]*>//is;
        $text =~ s/<\/html>.*$//is;
        $text =~ s/^.*?<body[^>]*>//is;
        $text =~ s/<\/body>.*$//is;
    }
    elsif ($type eq "text/xml") {
        # '&' has to go first, or the entities added below get re-escaped.
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text = "<pre>\n$text\n</pre>\n";
    }
    return $text;
}
# Decode quoted-printable text (see RFC 2045, section 6.7).
#
# Takes the encoded text and returns the decoded text.
#   * A line ending in "=" (optionally followed by whitespace) is a soft
#     line break: the "=" is removed and the line is joined to the next one.
#   * Every other line keeps its line break (a "\n" is added to each such
#     line, including the last one).
#   * "=XX", where XX are two hex digits, is replaced by the character with
#     that code. Any other "=" is not valid quoted-printable and is left
#     exactly as it is.
sub qp_decode {
    my $text = shift;
    return "" unless defined $text;

    my @lines = split(/\n/, $text);
    foreach my $line (@lines) {
        # soft line break, otherwise keep in the newline character.
        if ($line !~ s/=\s*$//) {
            $line .= "\n";
        }
        # Decode every escape in one pass over the line, so that text
        # produced by one escape (eg the "=" from "=3D") is never looked at
        # again and no part of the message is ever used as a regex.
        $line =~ s/=([0-9A-Fa-f]{2})/chr(hex($1))/ge;
    }
    return join('', @lines);
}
# Decode base64 text. This is fairly slow (since it's interpreted perl rather
# than compiled XS stuff like in the ::MIME modules, but this is more portable
# for us at least).
# See rfc2045 for a description, but basically each encoded character carries
# 6 bits, so 4 characters of encoded text become 3 bytes of binary.
#
# Takes the encoded text and returns the decoded bytes. Characters that are
# not part of the base64 alphabet (line breaks, whitespace and the '='
# padding) are ignored. We assume (!!) that there are no errors in the
# encoding; a single left-over character at the end cannot encode a whole
# byte and is dropped.
sub base64_decode {
    my $enc_text = shift;
    return "" unless defined $enc_text;

    # A=>0, B=>1, ..., '+'=>62, '/'=>63
    my $mimechars =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    # map each MIME char onto its value, for more efficient lookup.
    my %index;
    my $value = 0;
    foreach my $mimechar (split(//, $mimechars)) {
        $index{$mimechar} = $value++;
    }

    # remove all non-base64 chars, including the '=' padding at the end.
    $enc_text =~ tr|A-Za-z0-9+/||cd;

    my $decoded = "";
    # every complete group of 4 encoded chars gives 3 raw bytes.
    while (length($enc_text) > 3) {
        my @bits = map { $index{$_} } split(//, substr($enc_text, 0, 4, ""));
        $decoded .= chr( ($bits[0]        << 2) | ($bits[1] >> 4));
        $decoded .= chr((($bits[1] & 15)  << 4) | ($bits[2] >> 2));
        $decoded .= chr((($bits[2] & 3)   << 6) |  $bits[3]);
    }

    # if there are any input chars left, there are either
    # 2 encoded chars (-> 1 raw byte) or 3 encoded chars (-> 2 raw bytes).
    my @bits = map { $index{$_} } split(//, $enc_text);
    if (@bits >= 2) {
        $decoded .= chr(($bits[0] << 2) | ($bits[1] >> 4));
    }
    if (@bits == 3) {
        $decoded .= chr((($bits[1] & 15) << 4) | ($bits[2] >> 2));
    }
    return $decoded;
}
 # Perl packages have to return true if they are run.
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2630 for trunk/gsdl/perllib/plugins/EMAILPlug.pm

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: