Context Navigation

← Previous Changeset
Next Changeset →

Changeset 2847

Timestamp:

2001-11-23T16:14:39+13:00 (22 years ago)

Author:

sjboddie

Message:

Altered EMAILPlug a little so that all text it used to treat as ASCII
is now treated as ISO-8859-1 encoded instead. This prevents problems
when text is assumed to be plain ASCII but isn't (previously, the resulting
XML documents couldn't be parsed by the XML::Parser module).

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (15 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r2781
+              r2847
         while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
         my ($charset, $encoding, $data)=($2,$3,$4);
         my $decoded_data;
+        my ($decoded_data);
         $value.="$1"; # any leading chars
         $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
 …
             $decoded_data=base64_decode($data);
+        }
+        if (defined($charset)) {
+            $charset=~tr/A-Z/a-z/;
+            $charset=~s/\-/_/g;
+            $charset=~s/gb2312/gb/;
+            # assumes EUC-KR, not ISO-2022 !?
+            $charset=~s/ks_c_5601_1987/korean/;
+        } else {$charset="ascii";}
+        if ($charset eq "ascii" || $charset eq "us_ascii") {
+            # technically possible to have this explicitly...
+            $value.=$decoded_data;
+        } else {
+            my $utf8_text=&unicode::unicode2utf8
+            (
+             &unicode::convert2unicode($charset,\$decoded_data)
+             );
+            $value.=$utf8_text;
+        }
+        } # end of while loop
+        $value.=$encoded; # get any trailing characters
+        $self->convert2unicode($charset, \$decoded_data);
+        $value .= $decoded_data;
+          } # end of while loop
+        # get any trailing characters
+        $self->convert2unicode("iso_8859_1", \$encoded);
+        $value.=$encoded;
         if ($value =~ /^\s*$/) { # we couldn't extract anything...
+        $value=original_value;
+          $self->convert2unicode("iso_8859_1", \$original_value);
+          $value=original_value;
+        }
     } # end of if =?...?=
 …
     my $mimetype="text/plain";
     my $mimeinfo="";
+    my $charset = "iso_8859_1";
     # Do MIME and encoding stuff
     if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
 …
         $mimetype =~ tr/[A-Z]/[a-z]/;
         $mimeinfo=$2;
+        if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
+          $charset = $1;
+        }
+    }
 …
+    }
     if ($mimetype ne "text/plain") {
     $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref,
                      $outhandle);
+    $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
+                         $outhandle);
     } elsif ($transfer_encoding =~ /quoted\-printable/) {
     $$textref=qp_decode($$textref);
     } elsif ($transfer_encoding =~ /base64/) {
     $$textref=base64_decode($$textref);
+    } else {
+      $self->convert2unicode($charset, $textref);
+    }
 …
 # the textref we are given DOES NOT include the header.
 sub text_from_mime_message {
+    my $self = shift(@_);
     my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
 …
+        {
             # Use the HTML version
             $part_text=text_from_part($message_part);
+            $part_text= $self->text_from_part($message_part);
             $mimetype="text/html";
             last;
 …
+            {
             # Use the plain version
             $part_text=text_from_part($message_part);
+            $part_text= $self->text_from_part($message_part);
             if ($part_text =~/[^\s]/) {
                 $part_text="<pre>".$part_text."</pre>";
 …
+        }
         if ($part_text eq "") { # use first part
         $part_text=text_from_part(shift @message_parts);
+        $part_text= $self->text_from_part(shift @message_parts);
+        }
         if ($part_text eq "") { # we couldn't get anything!!!
 …
         if ($part_content_type =~ m@text/@)
+        {
             my $part_text=text_from_part($message_part);
+            my $part_text= $self->text_from_part($message_part);
             if ($part_content_type !~ m@text/(ht|x)ml@) {
             $part_text=text_into_html($part_text);
 …
             if ($message_content_type =~ /multipart/) {
                 $rfc822_formatted_body=
                 text_from_mime_message($message_content_type,
                                $message_content_info,
                                $message_part_body,
                                $outhandle);
+                  $self->text_from_mime_message($message_content_type,
+                                $message_content_info,
+                                $message_part_body,
+                                $outhandle);
             } else {
                 $message_part_body=text_from_part($part_body);
+                $message_part_body= $self->text_from_part($part_body);
                 $rfc822_formatted_body=text_into_html($message_part_body);
+            }
 …
             # recurse again
             $tmptext=text_from_mime_message($part_content_type,
                             $part_content_info,
                             $part_body,
                             $outhandle);
+            $tmptext= $self->text_from_mime_message($part_content_type,
+                                $part_content_info,
+                                $part_body,
+                                $outhandle);
             $text.=$tmptext;
         } elsif ($text eq "") {
 …
 # Process a MIME part. Return "" if we can't decode it.
 sub text_from_part {
+    my $self = shift(@_);
     my $text=shift;
     my $part_header=$text;
 …
+    }
     # convert to unicode
+    # first get our character encoding name in the right form.
+    $charset=~tr/A-Z/a-z/;
+    $charset=~s/\-/_/g;
+    if ($charset ne "us_ascii" && $charset ne "ascii") {
+    $charset=~s/gb2312/gb/;
+    # assumes EUC-KR, not ISO-2022 !?
+    $charset=~s/ks_c_5601_1987/korean/;
+    my @unicode_array=&unicode::convert2unicode($charset,\$text);
+    my $utf8_text=&unicode::unicode2utf8(@unicode_array);
+    $text=$utf8_text;
+    }
+    $self->convert2unicode($charset, \$text);
     return $text;
+}
 …
+}
+sub convert2unicode {  # Method: re-encode the text behind $textref in place.
+  my $self = shift(@_);
+  my ($charset, $textref) = @_;  # $charset may be undef; $textref is a scalar reference
+  # First get our character encoding name in the right form (default when undefined, lower case, '-' becomes '_').
+  $charset = "iso_8859_1" unless defined $charset;
+  $charset=~tr/A-Z/a-z/;
+  $charset=~s/\-/_/g;
+  $charset=~s/gb2312/gb/;  # presumably the name the unicode module uses for GB2312 -- TODO confirm
+  # Assumes ks_c_5601_1987 means EUC-KR, not ISO-2022 (unverified).
+  $charset=~s/ks_c_5601_1987/korean/;
+  # Text labelled as ASCII cannot always be trusted, so we treat it as
+  # iso-8859-1: letting bytes of 0x80 and above through without
+  # converting them to utf-8 would result in invalid XML documents
+  # which can't be parsed at build time.
+  $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
+  $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));  # overwrite the caller's text with the converted result
+}
 # Perl packages have to return true if they are run.

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2847

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: