Ignore:
Timestamp:
2001-09-03T15:29:45+12:00 (23 years ago)
Author:
jrm21
Message:

1) Non-ascii characters should now work for any encoding handled by Greenstone
(uses unicode.pm now).

2) RFC 2047 - Message Header Extensions parsing in place. Eg
From: =?<charset>?B?<BASE64ENCODING==>?=

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r2717 r2730  
    3939#   after the first blank line in the document.
    4040#
    41 # Metadata:
     41# Metadata (not Dublin Core!):
    4242#   $Headers      All the header content
    4343#   $Subject      Subject: header
    4444#   $To           To: header
    45 #   $From         From: header - this will be stored as Creator
     45#   $From         From: header
    4646#   $FromName     Name of sender (where available)
    4747#   $FromAddr     E-mail address of sender
     
    5757#       * RFC 2045 - Multipurpose Internet Mail Extensions (MIME) -part1
    5858#       * RFC 2046 - MIME (part 2)  Media Types (and multipart messages)
     59#       * RFC 2047 - MIME (part 3)  Message Header Extensions
    5960#       * RFC 1806 - Content Dispositions (ie inline/attachment)
    6061package EMAILPlug;
    6162
    6263use SplitPlug;
     64
     65use unicode;
    6366
    6467use sorttools;
     
    7982    my ($class) = @_;
    8083    my $self = new BasPlug ("EMAILPlug", @_);
    81     # make sure we don't run textcat (defaults to "auto");
    82     $self->{'input_encoding'}="iso_8859_1"; # this might not be good enough...
     84    # this might not actually be true at read-time, but after processing
     85    # it should all be utf8.
     86    $self->{'input_encoding'}="utf8";
    8387    return bless $self, $class;
    8488}
     
    128132    $Headers =~ s/\r?\n\r?\n(.*)$//s;
    129133    $$textref = $1;
    130     # escape [] so it isn't re-interpreted as metadata
    131     $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
    132 
    133134
    134135    # Unfold headers - see rfc822
     
    163164    $value =~ s/\s+$//;
    164165
     166    # decode headers if stored using =?<charset>?[BQ]?<data>?= (rfc2047)
     167    if ($value =~ /=\?/) {
     168        my $original_value=$value;
     169        my $encoded=$value;
     170        $value="";
     171        while ($encoded =~ s/(.*?)=\?([^\?]*)\?([bq])\?([^\?]+)\?=\s*//i) {
     172        my ($charset, $encoding, $data)=($2,$3,$4);
     173        my $decoded_data;
     174        $value.="$1"; # any leading chars
     175        $data=~s/^\s*//; $data=~s/\s*$//; # strip whitespace from ends
     176        chomp $data;
     177        $encoding =~ tr/BQ/bq/;
     178        if ($encoding eq "q") { # quoted printable
     179            $decoded_data=qp_decode($data);
     180        } else { # base 64
     181            $decoded_data=base64_decode($data);
     182        }
     183        if (defined($charset)) {
     184            $charset=~tr/A-Z/a-z/;
     185            $charset=~s/\-/_/g;
     186            $charset=~s/gb2312/gb/;
     187            # assumes EUC-KR, not ISO-2022 !?
     188            $charset=~s/ks_c_5601_1987/korean/;
     189        } else {$charset="ascii";}
     190        if ($charset eq "ascii" || $charset eq "us-ascii") {
     191            # technically possible to have this explicitly...
     192            $value.=$decoded_data;
     193        } else {
     194            my $utf8_text=&unicode::unicode2utf8
     195            (
     196             &unicode::convert2unicode($charset,\$decoded_data)
     197             );
     198            $value.=$utf8_text;
     199        }
     200        } # end of while loop
     201        $value.=$encoded; # get any trailing characters
     202        if ($value =~ /^\s*$/) { # we couldn't extract anything...
     203        $value=original_value;
     204        }
     205    } # end of if =?...?=
     206   
    165207    # Store the metadata
    166208    $raw{$name} = $value;
     
    201243
    202244    # Escape < and > in the whole From field;
    203     $frommeta =~ s/</&lt;/g; $frommeta =~ s/>/&gt;/g;
    204245    $raw{"From"}=$frommeta;
    205246
     
    220261    }
    221262
    222 
    223263    # Add extracted metadata to document object
    224264    foreach my $name (keys %raw) {
    225265    $value = $raw{$name};
    226266    if ($value) {
     267        # assume subject, etc headers have no special HTML meaning.
     268        $value =~ s@&@&amp\;@g;
     269        $value =~ s/</&lt;/g; $value =~ s/>/&gt;/g;
    227270        $value = &text_into_html($value);
     271        # escape [] so it isn't re-interpreted as metadata
     272        $value =~ s/\[/&#91;/g; $value =~ s/\]/&#93;/g;
    228273    } else {
    229274        $value = "No $name field";
     
    260305    $Headers = "No headers" unless ($Headers =~ /\w/);
    261306    $Headers =~ s/@/&#64\;/g;
     307    # escape [] so it isn't re-interpreted as metadata
     308    $Headers =~ s/\[/&#91;/g; $Headers =~ s/\]/&#93;/g;
     309
    262310    $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
    263311
     
    298346#    $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g;
    299347
    300     # assume hostnames are \.\w\d\- only, then might have a trailing '/.*'
    301     # URI can't finish with a '.'
    302     $text =~ s/((http|ftp|https):\/\/[\w\d\-]+(\.[\w\d\-]+)*\/?((&amp;|\.)?[\w\d\?\=\-_\/~]+)*)/<a href=\"$1\">$1<\/a>/g;
     348    # assume hostnames are \.\w\- only, then might have a trailing '/.*'
     349    # assume URI doesn't finish with a '.'
     350    $text =~ s@((http|ftp|https)://[\w\-]+(\.[\w\-]+)*/?((&amp;|\.)?[\w\?\=\-_/~]+)*)@<a href=\"$1\">$1<\/a>@g;
    303351
    304352
     
    419467        # add <<attachment>> to each part except the first...
    420468        if ($text ne "") {
    421             $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
     469            $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
    422470            # add part info header
    423471            $text.="<br>Type: $part_content_type<br>\n";
     
    488536            # part then make sure it is mentioned..
    489537           
    490             $text.="<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
     538            $text.="\n<p><hr><strong>&lt;&lt;attachment&gt;&gt;</>";
    491539            # add part info header
    492540            $text.="<br>Type: $part_content_type<br>\n";
     
    522570    }
    523571    $part_header =~ s/\r?\n[\t ]+/ /gs; #unfold
    524     $part_header =~ /content\-type:\s*([\w\/]+)/is;
    525     my $type=$1; if (!defined($type)) {$type="";}
     572    $part_header =~ /content\-type:\s*([\w\/]+).*?charset=\"?([^\;\"\s]+)\"?/is;
     573    my $type=$1;
     574    my $charset=$2;
     575    if (!defined($type)) {$type="";}
     576    if (!defined($charset)) {$charset="ascii";}
    526577    my $encoding="";
    527578    if ($part_header =~ /^content\-transfer\-encoding:\s*([^\s]+)/mis) {
     
    541592    }
    542593    }
    543 
    544594    if ($type eq "text/html") {
    545595    # only get stuff between <body> tags, or <html> tags.
    546     $text =~ s/^.*?<(html|HTML)[^>]*>//s;
    547     $text =~ s/<\/(html|HTML)>.*$//s;
    548 
    549     $text =~ s/^.*?<(body|BODY)[^>]*>//s;
    550     $text =~ s/<\/(body|BODY)>.*$//s;
     596    $text =~ s@^.*<html[^>]*>@@is;
     597    $text =~ s@</html>.*$@@is;
     598    $text =~ s/^.*?<body[^>]*>//si;
     599    $text =~ s/<\/body>.*$//si;
    551600    }
    552601    elsif ($type eq "text/xml") {
    553602    $text=~s/</&lt;/g;$text=~s/>/&gt;/g;
    554603    $text="<pre>\n$text\n</pre>\n";
     604    }
     605    # convert to unicode
     606    # first get our character encoding name in the right form.
     607    $charset=~tr/A-Z/a-z/;
     608    $charset=~s/\-/_/g;
     609    if ($charset ne "us_ascii" && $charset ne "ascii") {
     610    $charset=~s/gb2312/gb/;
     611    # assumes EUC-KR, not ISO-2022 !?
     612    $charset=~s/ks_c_5601_1987/korean/;
     613    my @unicode_array=&unicode::convert2unicode($charset,\$text);
     614    my $utf8_text=&unicode::unicode2utf8(@unicode_array);
     615    $text=$utf8_text;
    555616    }
    556617    return $text;
Note: See TracChangeset for help on using the changeset viewer.