Changeset 3073 for trunk/gsdl


Ignore:
Timestamp:
2002-04-03T15:44:42+12:00 (22 years ago)
Author:
jrm21
Message:

1) Default Title now correctly escapes [ and ] chars.
2) "Content-type: text" is now allowed (for emails predating the 1996 MIME
RFC 2045, for all the people with old email archives).

3) Check if text is really encoded with win1252 (Windows-1252) when it claims
to be ascii, as some mailers on Microsoft Windows do this, which causes our
XML parser to fail.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r2918 r3073  
    279279    my $mimeinfo="";
    280280    my $charset = "iso_8859_1";
    281     # Do MIME and encoding stuff
    282     if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*\;?\s*(.+?)\s*$/mi)
     281    # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
     282    # more than one parameter given to Content-type.
     283    # eg: Content-type: text/plain; charset="us-ascii"; format="flowed"
     284    if ($Headers =~ /^content\-type:\s*([\w\/\-]+)\s*(\;\s*.*)\s*$/mi)
    283285    {
    284286        $mimetype=$1;
    285287        $mimetype =~ tr/[A-Z]/[a-z]/;
     288
     289        if ($mimetype eq "text") { # for pre-RFC2045 messages (c. 1996)
     290        $mimetype = "text/plain";
     291        }
     292
    286293        $mimeinfo=$2;
    287         if ($mimeinfo =~ /charset=\"([^\"]+)\"/) {
     294        if (!defined $mimeinfo) {
     295        $mimeinfo="";
     296        } else { # strip leading and trailing stuff
     297        $mimeinfo =~ s/^\;\s*//;
     298        $mimeinfo =~ s/\s*$//;
     299        }
     300        if ($mimeinfo =~ /charset=\"([^\"]+)\"/i) {
    288301          $charset = $1;
    289302        }
     
    325338    $Title .= "<br>From: " . text_into_html($raw{'From'});
    326339    $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
     340    $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
    327341
    328342    $doc_obj->add_utf8_metadata ($cursection, "Title", $Title);
     
    760774  $charset = "iso_8859_1" if ($charset eq "us_ascii" || $charset eq "ascii");
    761775
     776  if ($charset eq "iso_8859_1") {
     777      # test if the mailer lied, and it has win1252 chars in it...
     778      # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
     779      if ($$textref =~ m/[\x80-\x9f]/) {
     780      my $outhandle = $self->{'outhandle'};
     781      print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
     782      print $outhandle "codepage 1252 detected.\n";
     783      $charset = "windows_1252";
     784      }
     785  }
    762786  $$textref=&unicode::unicode2utf8(&unicode::convert2unicode($charset,$textref));
    763787}
Note: See TracChangeset for help on using the changeset viewer.