Changeset 7703


Ignore:
Timestamp:
2004-07-05T17:52:01+12:00 (20 years ago)
Author:
jrm21
Message:

1) use the email's message ID instead of document hash for Identifier.

2) if a message claims to be utf8, actually check it for bad chars.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r6916 r7703  
    358358    }
    359359
     360
     361    # extract a message ID from the headers, if there is one, and we'll use
     362    # that as the greenstone doc ID. Having a predictable ID means we can
     363    # link to other messages, eg from In-Reply-To or References headers...
     364    if ($Headers =~ m@^Message-ID:(.+)$@mi) {
     365    my $id=escape_msg_id($1);
     366    $doc_obj->{'msgid'}=$id;
     367    }
     368    # link to another message, if this is a reply
     369    if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
     370    my $id=escape_msg_id($1);
     371    $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
     372    } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
     373    # References can have multiple, get the last one
     374    my $id=escape_msg_id($1);
     375    # not necessarily in-reply-to, but same thread...
     376    $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
     377    }
     378
     379
     380
    360381    my $mimetype="text/plain";
    361382    my $mimeinfo="";
     
    425446    # Add Title metadata
    426447    my $Title = text_into_html($raw{'Subject'});
    427     $Title .= "<br>From: " . text_into_html($raw{'From'});
     448    $Title .= "<br>From: " . text_into_html($fromnamemeta);
    428449    $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
    429450    $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
     
    620641}
    621642
     643
     644
     645# used for turning a message id into a more friendly string for greenstone
     646sub escape_msg_id {
     647#msgid
     648    my $id=shift;
     649    chomp $id; $id =~ s!\s!!g; # remove spaces
     650    $id =~ s![<>\[\]]!!g; # remove [ ] < and >
     651    $id =~ s![_&]!-!g; # replace symbols that might cause problems
     652    $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
     653    return $id;
     654}
    622655
    623656
     
    938971  my ($charset, $textref) = @_;
    939972
     973  if (!$$textref) {
     974      # nothing to do!
     975      return;
     976  }
     977
    940978  # first get our character encoding name in the right form.
    941979  $charset = "iso_8859_1" unless defined $charset;
    942   $charset=~tr/A-Z/a-z/;
    943   $charset=~s/\-/_/g;
    944   $charset=~s/gb2312/gb/;
     980  $charset =~ tr/A-Z/a-z/; # lowercase
     981  $charset =~ s/\-/_/g;
     982  $charset =~ s/gb2312/gb/;
    945983  # assumes EUC-KR, not ISO-2022 !?
    946   $charset=~s/ks_c_5601_1987/korean/;
    947 
    948   if ($charset eq "utf_8" || !$$textref) {
    949       # nothing to do!
     984  $charset =~ s/^ks_c_5601_1987/korean/;
     985  if ($charset eq 'utf_8') {$charset='utf8'}
     986
     987  my $outhandle = $self->{'outhandle'};
     988
     989  if ($charset eq "utf8") {
     990      # no conversion needed, but lets check that it's valid utf8
     991      # see utf-8 manpage for valid ranges
     992      $$textref =~ m/^/g; # to set \G
     993      my $badbytesfound=0;
     994      while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
     995      my $highbytes=$1;
     996      my $highbyteslength=length($highbytes);
     997      # replace any non utf8 complaint bytes
     998      $highbytes =~ /^/g; # set pos()
     999      while ($highbytes =~
     1000         m!\G (?: [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
     1001               [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
     1002               [\xf0-\xf7][\x80-\xbf]{3}   # 4 byte
     1003               [\xf8-\xfb][\x80-\xbf]{4}   # 5 byte
     1004               [\xfc-\xfd][\x80-\xbf]{5}   # 6 byte
     1005               )*([\x80-\xff])? !xg
     1006         ) {
     1007          my $badbyte=$1;
     1008          if (!defined $badbyte) {next} # hit end of string
     1009          my $pos=pos($highbytes);
     1010          substr($highbytes, $pos-1, 1, "\xc2\x80");
     1011          # update the position to continue searching (for \G)
     1012          pos($highbytes) = $pos+1; # set to just after the \x80
     1013          $badbytesfound=1;
     1014      }
     1015      if ($badbytesfound==1) {
     1016          # claims to be utf8, but it isn't!
     1017          print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
     1018          . "detected and removed.\n";
     1019
     1020          my $replength=length($highbytes);
     1021          my $textpos=pos($$textref);
     1022          # replace bad bytes with good bytes
     1023          substr( $$textref, $textpos-$replength,
     1024              $replength, $highbytes);
     1025          # update the position to continue searching (for \G)
     1026          pos($$textref)=$textpos+($replength-$highbyteslength);
     1027      }
     1028      }
    9501029      return;
    9511030  }
     
    9611040      # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
    9621041      if ($$textref =~ m/[\x80-\x9f]/) {
    963       my $outhandle = $self->{'outhandle'};
    9641042      print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
    9651043      print $outhandle "codepage 1252 detected.\n";
     
    9831061
    9841062
     1063sub set_OID {
     1064    my $self = shift (@_);
     1065    my ($doc_obj, $id, $segment_number) = @_;
     1066   
     1067    if ( exists $doc_obj->{'msgid'} ) {
     1068    $doc_obj->set_OID($doc_obj->{'msgid'});
     1069    } else {
     1070    $doc_obj->set_OID("$id\_$segment_number");
     1071    }
     1072}
     1073
     1074
    9851075# Perl packages have to return true if they are run.
    98610761;
Note: See TracChangeset for help on using the changeset viewer.