Changeset 3132 for trunk/gsdl


Ignore:
Timestamp:
2002-05-22T17:27:41+12:00 (22 years ago)
Author:
jrm21
Message:

Try to determine the encoding used in the headers in case it is not ascii.
(rfc-822 says it should be).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r3111 r3132  
    140140    $$textref = $1;
    141141   
    142     # See if headers include non-ascii - RFC says whole header should be ascii.
    143 # not yet implemented, as we don't know what character set is the
    144 # user's default... We can do textcat to guess, or we can just choose
    145 # one of the charset fields later in the document (if there are any...)
    146 #    if ($Headers =~ /([[:^ascii:]])/) {
    147 #    }
    148 
    149142    # Unfold headers - see rfc822
    150143    $Headers =~ s/\r?\n[\t\ ]+/ /gs;
     
    155148    $raw{$name} = "No $name value";
    156149    }
     150
     151    # Get a default encoding for the header - RFC says should be ascii...
     152    my $default_heading_encoding="iso_8859_1";
     153
     154    # We don't know what character set is the user's default...
     155    # We could use textcat to guess... for now we'll look at mime content-type
     156#    if ($Headers =~ /([[:^ascii:]])/) {
     157#    }
     158    if ($Headers =~ /^Content\-type:.*charset=\"?([a-z0-9\-_]+)/mi) {
     159    $default_header_encoding=$1;
     160    $default_header_encoding =~ s@\-@_@g;
     161    $default_header_encoding =~ tr/A-Z/a-z/;
     162    }
     163
    157164
    158165    # Examine each line of the headers
     
    179186
    180187    # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
    181     if ($value =~ /=\?/) {
     188    if ($value =~ /=\?.*\?[BbQq]\?.*\?=/) {
    182189        my $original_value=$value;
    183190        my $encoded=$value;
     
    201208        $value .= $decoded_data;
    202209          } # end of while loop
    203 
     210       
    204211        # get any trailing characters
    205         $self->convert2unicode("iso_8859_1", \$encoded);
     212        $self->convert2unicode($default_header_encoding, \$encoded);
    206213        $value.=$encoded;
    207214
    208215        if ($value =~ /^\s*$/) { # we couldn't extract anything...
    209           $self->convert2unicode("iso_8859_1", \$original_value);
     216          $self->convert2unicode($default_header_encoding,
     217                     \$original_value);
    210218          $value=original_value;
    211219        }
    212     } # end of if =?...?=
     220    } else { # end of if =?...?=
     221        $self->convert2unicode($default_header_encoding, \$value);
     222    }
     223
     224
    213225   
    214226    # Store the metadata
     
    283295    my $mimetype="text/plain";
    284296    my $mimeinfo="";
    285     my $charset = "iso_8859_1";
     297    my $charset = $default_header_encoding;
    286298    # Do MIME and encoding stuff. Allow \s in mimeinfo in case there is
    287299    # more than one parameter given to Content-type.
     
    375387
    376388    # Convert problem characters into HTML symbols
    377     $text =~ s/&/&amp;/go;
    378     $text =~ s/</&lt;/go;
    379     $text =~ s/>/&gt;/go;
    380     $text =~ s/\"/&quot;/go;
     389    $text =~ s/&/&amp;/g;
     390    $text =~ s/</&lt;/g;
     391    $text =~ s/>/&gt;/g;
     392    $text =~ s/\"/&quot;/g;
    381393
    382394    # convert email addresses and URIs into links
     
    392404
    393405    # Clean up whitespace and convert \n charaters to <BR> or <P>
    394     $text =~ s/ +/ /go;
    395     $text =~ s/\s*$//o;
    396     $text =~ s/^\s*//o;
    397     $text =~ s/\n/\n<BR>/go;
    398     $text =~ s/<BR>\s*<BR>/<P>/go;
     406    $text =~ s/ +/ /g;
     407    $text =~ s/\s*$//g;
     408    $text =~ s/^\s*//g;
     409    $text =~ s/\n/\n<br>/g;
     410    $text =~ s/<br>\s*<br>/<p>/gi;
    399411
    400412    return $text;
     
    644656# Process a MIME part. Return "" if we can't decode it.
    645657sub text_from_part {
    646     my $self = shift(@_);
    647     my $text=shift;
     658    my $self = shift;
     659    my $text=shift || '';
    648660    my $part_header=$text;
    649661    # check for empty part header (leading blank line)
Note: See TracChangeset for help on using the changeset viewer.