Changeset 2847


Ignore:
Timestamp:
2001-11-23T16:14:39+13:00 (22 years ago)
Author:
sjboddie
Message:

Altered EMAILPlug a little so it now treats all text that it used to
treat as ASCII as ISO-8859-1 encoded instead. This prevents problems
when text is assumed to be plain ASCII but isn't (that is, the resulting
XML documents couldn't be parsed by the XML::Parser module).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r2781 r2847  
    180180        while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
    181181        my ($charset, $encoding, $data)=($2,$3,$4);
    182         my $decoded_data;
     182        my ($decoded_data);
    183183        $value.="$1"; # any leading chars
    184184        $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
     
    191191            $decoded_data=base64_decode($data);
    192192        }
    193         if (defined($charset)) {
    194             $charset=~tr/A-Z/a-z/;
    195             $charset=~s/\-/_/g;
    196             $charset=~s/gb2312/gb/;
    197             # assumes EUC-KR, not ISO-2022 !?
    198             $charset=~s/ks_c_5601_1987/korean/;
    199         } else {$charset="ascii";}
    200         if ($charset eq "ascii" || $charset eq "us_ascii") {
    201             # technically possible to have this explicitly...
    202             $value.=$decoded_data;
    203         } else {
    204             my $utf8_text=&unicode::unicode2utf8
    205             (
    206              &unicode::convert2unicode($charset,\$decoded_data)
    207              );
    208             $value.=$utf8_text;
    209         }
    210         } # end of while loop
    211         $value.=$encoded; # get any trailing characters
     193        $self->convert2unicode($charset, \$decoded_data);
     194        $value .= $decoded_data;
     195          } # end of while loop
     196
     197        # get any trailing characters
     198        $self->convert2unicode("iso_8859_1", \$encoded);
     199        $value.=$encoded;
     200
    212201        if ($value =~ /^\s*$/) { # we couldn't extract anything...
    213         $value=original_value;
     202          $self->convert2unicode("iso_8859_1", \$original_value);
     203          $value=original_value;
    214204        }
    215205    } # end of if =?...?=
     
    289279    my $mimetype="text/plain";
    290280    my $mimeinfo="";
     281    my $charset = "iso_8859_1";
    291282    # Do MIME and encoding stuff
    292283    if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
     
    295286        $mimetype =~ tr/[A-Z]/[a-z]/;
    296287        $mimeinfo=$2;
     288        if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
     289          $charset = $1;
     290        }
    297291    }
    298292
     
    302296    }
    303297    if ($mimetype ne "text/plain") {
    304     $$textref=text_from_mime_message($mimetype,$mimeinfo,$$textref,
    305                     $outhandle);
     298    $$textref= $self->text_from_mime_message($mimetype,$mimeinfo,$$textref,
     299                        $outhandle);
    306300    } elsif ($transfer_encoding =~ /quoted\-printable/) {
    307301    $$textref=qp_decode($$textref);
    308302    } elsif ($transfer_encoding =~ /base64/) {
    309303    $$textref=base64_decode($$textref);
     304    } else {
     305      $self->convert2unicode($charset, $textref);
    310306    }
    311307   
     
    379375# the textref we are given DOES NOT include the header.
    380376sub text_from_mime_message {
     377    my $self = shift(@_);
    381378    my ($mimetype,$mimeinfo,$text,$outhandle)=(@_);
    382379
     
    419416        {
    420417            # Use the HTML version
    421             $part_text=text_from_part($message_part);
     418            $part_text= $self->text_from_part($message_part);
    422419            $mimetype="text/html";
    423420            last;
     
    429426            {
    430427            # Use the plain version
    431             $part_text=text_from_part($message_part);
     428            $part_text= $self->text_from_part($message_part);
    432429            if ($part_text =~/[^\s]/) {
    433430                $part_text="<pre>".$part_text."</pre>";
     
    439436        }
    440437        if ($part_text eq "") { # use first part
    441         $part_text=text_from_part(shift @message_parts);
     438        $part_text= $self->text_from_part(shift @message_parts);
    442439        }
    443440        if ($part_text eq "") { # we couldn't get anything!!!
     
    498495        if ($part_content_type =~ m@text/@)
    499496        {
    500             my $part_text=text_from_part($message_part);
     497            my $part_text= $self->text_from_part($message_part);
    501498            if ($part_content_type !~ m@text/(ht|x)ml@) {
    502499            $part_text=text_into_html($part_text);
     
    526523            if ($message_content_type =~ /multipart/) {
    527524                $rfc822_formatted_body=
    528                 text_from_mime_message($message_content_type,
    529                                $message_content_info,
    530                                $message_part_body,
    531                                $outhandle);
     525                  $self->text_from_mime_message($message_content_type,
     526                                $message_content_info,
     527                                $message_part_body,
     528                                $outhandle);
    532529            } else {
    533                 $message_part_body=text_from_part($part_body);
     530                $message_part_body= $self->text_from_part($part_body);
    534531                $rfc822_formatted_body=text_into_html($message_part_body);
    535532            }
     
    548545            # recurse again
    549546
    550             $tmptext=text_from_mime_message($part_content_type,
    551                             $part_content_info,
    552                             $part_body,
    553                             $outhandle);
     547            $tmptext= $self->text_from_mime_message($part_content_type,
     548                                $part_content_info,
     549                                $part_body,
     550                                $outhandle);
    554551            $text.=$tmptext;
    555552        } elsif ($text eq "") {
     
    581578# Process a MIME part. Return "" if we can't decode it.
    582579sub text_from_part {
     580    my $self = shift(@_);
    583581    my $text=shift;
    584582    my $part_header=$text;
     
    625623    }
    626624    # convert to unicode
    627     # first get our character encoding name in the right form.
    628     $charset=~tr/A-Z/a-z/;
    629     $charset=~s/\-/_/g;
    630     if ($charset ne "us_ascii" && $charset ne "ascii") {
    631     $charset=~s/gb2312/gb/;
    632     # assumes EUC-KR, not ISO-2022 !?
    633     $charset=~s/ks_c_5601_1987/korean/;
    634     my @unicode_array=&unicode::convert2unicode($charset,\$text);
    635     my $utf8_text=&unicode::unicode2utf8(@unicode_array);
    636     $text=$utf8_text;
    637     }
     625    $self->convert2unicode($charset, \$text);
    638626    return $text;
    639627}
     
    708696}
    709697
     698sub convert2unicode {
     699  my $self = shift(@_);
     700  my ($charset, $textref) = @_;
     701
     702  # first get our character encoding name in the right form.
     703  $charset = "iso_8859_1" unless defined $charset;
     704  $charset=~tr/A-Z/a-z/;
     705  $charset=~s/\-/_/g;
     706  $charset=~s/gb2312/gb/;
     707  # assumes EUC-KR, not ISO-2022 !?
     708  $charset=~s/ks_c_5601_1987/korean/;
     709
     710  # It appears that we can't always trust ascii text so we'll treat it
     711  # as iso-8859-1 (letting characters above 0x80 through without
     712  # converting them to utf-8 will result in invalid XML documents
     713  # which can't be parsed at build time).
     714  $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
     715
     716  $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
     717}
     718
    710719
    711720# Perl packages have to return true if they are run.
Note: See TracChangeset for help on using the changeset viewer.