Context Navigation

← Previous Change
Next Change →

EMAILPlug.pm

Timestamp:

2001-09-03T15:29:45+12:00 (23 years ago)

Author:

jrm21

Message:

1) Non-ascii characters should now work for any encoding handled by Greenstone
(uses unicode.pm now).

2) RFC 2047 (Message Header Extensions) parsing is now in place, e.g.
From: =?<charset>?B?<BASE64ENCODING==>?=

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (13 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r2717
+              r2730
 #   after the first blank line in the document.
+#
 # Metadata:
+# Metadata (not Dublin Core!):
 #   $Headers      All the header content
 #   $Subject      Subject: header
 #   $To           To: header
 #   $From         From: header - this will be stored as Creator
+#   $From         From: header
 #   $FromName     Name of sender (where available)
 #   $FromAddr     E-mail address of sender
 …
 #       * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
 #       * RFC 2046 - MIME (part 2)  Media Types (and multipart messages)
+#       * RFC 2047 - MIME (part 3)  Message Header Extensions
 #       * RFC 1806 - Content Dispositions (ie inline/attachment)
 package EMAILPlug;
 use SplitPlug;
+use unicode;
 use sorttools;
 …
     my ($class) = @_;
     my $self = new BasPlug ("EMAILPlug", @_);
+    # make sure we don't run textcat (defaults to "auto");
+    $self->{'input_encoding'}="iso_8859_1"; # this might not be good enough...
+    # this might not actually be true at read-time, but after processing
+    # it should all be utf8.
+    $self->{'input_encoding'}="utf8";
     return bless $self, $class;
+}
 …
     $Headers =~ s/\r?\n\r?\n(.*)$//s;
     $$textref = $1;
-    # escape [] so it isn't re-interpreted as metadata
-    $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
     # Unfold headers - see rfc822
 …
     $value =~ s/\s+$//;
+    # Decode RFC 2047 "encoded-words" in the header value. Each one has the
+    # form  =?<charset>?<B|Q>?<data>?=  where B is base64 and Q is a
+    # quoted-printable variant. Decoded text is converted to utf8 and
+    # spliced back between any surrounding plain text.
+    if ($value =~ /=\?/) {
+        my $original_value=$value;
+        my $encoded=$value;
+        $value="";
+        # Peel one encoded-word (plus the plain text before it) off the front
+        # of $encoded per iteration. Whitespace after an encoded-word is
+        # dropped only when another "=?" follows: rfc2047 (section 6.2) says
+        # whitespace between adjacent encoded-words is ignored, but
+        # whitespace before ordinary text is significant.
+        while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=(?:\s+(?==\?))?//i) {
+            # copy the captures straight away, before any other regex runs
+            my ($leading, $charset, $encoding, $data)=($1,$2,$3,$4);
+            my $decoded_data;
+            $value.=$leading; # any leading chars
+            $data=~s/^\s+//; $data=~s/\s+$//; # strip whitespace from ends
+            $encoding =~ tr/BQ/bq/;
+            if ($encoding eq "q") { # quoted printable
+                # In the "Q" encoding an underscore stands for a space
+                # (rfc2047 section 4.2). Rewrite it as =20 so the decoder
+                # (helper not shown in this hunk) sees ordinary QP.
+                $data=~s/_/=20/g;
+                $decoded_data=qp_decode($data);
+            } else { # base 64
+                $decoded_data=base64_decode($data);
+            }
+            # Normalise the charset name into the form passed to
+            # unicode::convert2unicode (lower case, '_' instead of '-').
+            if (defined($charset) && $charset ne "") {
+                $charset=~tr/A-Z/a-z/;
+                $charset=~s/\-/_/g;
+                $charset=~s/gb2312/gb/;
+                # assumes EUC-KR, not ISO-2022 !?
+                $charset=~s/ks_c_5601_1987/korean/;
+            } else {$charset="ascii";}
+            # Hyphens are already underscores here, so test "us_ascii".
+            if ($charset eq "ascii" || $charset eq "us_ascii") {
+                # technically possible to have this explicitly...
+                $value.=$decoded_data;
+            } else {
+                my $utf8_text=&unicode::unicode2utf8
+                    (
+                     &unicode::convert2unicode($charset,\$decoded_data)
+                    );
+                $value.=$utf8_text;
+            }
+        } # end of while loop
+        $value.=$encoded; # get any trailing characters
+        if ($value =~ /^\s*$/) { # we couldn't extract anything...
+            # fall back to the raw (still encoded) header text
+            $value=$original_value;
+        }
+    } # end of if =?...?=
     # Store the metadata
     $raw{$name} = $value;
 …
     # Escape < and > in the whole From field;
-    $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
     $raw{"From"}=$frommeta;
 …
+    }
     # Add extracted metadata to document object
     foreach my $name (keys %raw) {
     $value = $raw{$name};
     if ($value) {
+        # assume subject, etc headers have no special HTML meaning.
+        $value =~ s@&@&amp\;@g;
+        $value =~ s/</&lt;/g; $value =~ s/>/&gt;/g;
         $value = &text_into_html($value);
+        # escape [] so it isn't re-interpreted as metadata
+        $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
     } else {
         $value = "No $name field";
 …
     $Headers = "No headers" unless ($Headers =~ /\w/);
     $Headers =~ s/@/&#64\;/g;
+    # escape [] so it isn't re-interpreted as metadata
+    $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
     $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
 …
 #    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
     # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
     # URI can't finish with a '.'
     $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
+    # assume hostnames are \.\w\- only, then might have a trailing '/.*'
+    # assume URI doesn't finish with a '.'
+    $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
 …
         # add <<attachment>> to each part except the first...
         if ($text ne "") {
             $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
+            $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
             # add part info header
             $text.="<br>Type: $part_content_type<br>\n";
 …
             # part then make sure it is mentioned..
             $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
+            $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
             # add part info header
             $text.="<br>Type: $part_content_type<br>\n";
 …
+    }
     $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
+    $part_header =~ /content\-type:\s*([\w\/]+)/is;
+    my $type=$1; if (!defined($type)) {$type="";}
+    $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
+    my $type=$1;
+    my $charset=$2;
+    if (!defined($type)) {$type="";}
+    if (!defined($charset)) {$charset="ascii";}
     my $encoding="";
     if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
 …
+    }
+    }
     if ($type eq "text/html") {
     # only get stuff between <body> tags, or <html> tags.
+    $text =~ s/^.*?<(html|HTML)[^>]*>//s;
+    $text =~ s/<\/(html|HTML)>.*$//s;
+    $text =~ s/^.*?<(body|BODY)[^>]*>//s;
+    $text =~ s/<\/(body|BODY)>.*$//s;
+    $text =~ s@^.*<html[^>]*>@@is;
+    $text =~ s@</html>.*$@@is;
+    $text =~ s/^.*?<body[^>]*>//si;
+    $text =~ s/<\/body>.*$//si;
+    }
     elsif ($type eq "text/xml") {
     $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
     $text="<pre>\n$text\n</pre>\n";
+    }
+    # convert to unicode
+    # first get our character encoding name in the right form.
+    $charset=~tr/A-Z/a-z/;
+    $charset=~s/\-/_/g;
+    if ($charset ne "us_ascii" && $charset ne "ascii") {
+    $charset=~s/gb2312/gb/;
+    # assumes EUC-KR, not ISO-2022 !?
+    $charset=~s/ks_c_5601_1987/korean/;
+    my @unicode_array=&unicode::convert2unicode($charset,\$text);
+    my $utf8_text=&unicode::unicode2utf8(@unicode_array);
+    $text=$utf8_text;
+    }
     return $text;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2730 for trunk/gsdl/perllib/plugins/EMAILPlug.pm

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: