Context Navigation

← Previous Changeset
Next Changeset →

Changeset 7703

Timestamp:

2004-07-05T17:52:01+12:00 (20 years ago)

Author:

jrm21

Message:

1) use the email's message ID instead of document hash for Identifier.

2) if a message claims to be utf8, actually check it for bad chars.

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r6916
+              r7703
+    }
+    # extract a message ID from the headers, if there is one, and we'll use
+    # that as the greenstone doc ID. Having a predictable ID means we can
+    # link to other messages, eg from In-Reply-To or References headers...
+    if ($Headers =~ m@^Message-ID:(.+)$@mi) {
+    my $id=escape_msg_id($1);
+    $doc_obj->{'msgid'}=$id;
+    }
+    # link to another message, if this is a reply
+    if ($Headers =~ m@^In-Reply-To:(.+)$@mi) {
+    my $id=escape_msg_id($1);
+    $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
+    } elsif ($Headers =~ m@^References:.*\s([^\s]+)$@mi) {
+    # References can have multiple, get the last one
+    my $id=escape_msg_id($1);
+    # not necessarily in-reply-to, but same thread...
+    $doc_obj->add_utf8_metadata ($cursection, 'InReplyTo', $id);
+    }
     my $mimetype="text/plain";
     my $mimeinfo="";
 …
     # Add Title metadata
     my $Title = text_into_html($raw{'Subject'});
     $Title .= "<br>From: " . text_into_html($raw{'From'});
+    $Title .= "<br>From: " . text_into_html($fromnamemeta);
     $Title .= "<br>Date: " . text_into_html($raw{'DateText'});
     $Title =~ s/\[/&#91;/g; $Title =~ s/\]/&#93;/g;
 …
+}
# Turn a raw message-id header value into a string that is friendlier for
# Greenstone to use as a document identifier. The same escaping is applied
# to the ids stored as 'InReplyTo' metadata, so links between messages keep
# matching the ids of the messages they point at.
#
# Parameter: the header value, e.g. " <1234.abcd@mail.example.com>"
# Returns:   the escaped id,   e.g. "1234-abcd-mail-example-com"
sub escape_msg_id {
    my $id = shift;

    # Remove all whitespace (this also takes care of any trailing newline).
    $id =~ s/\s+//g;

    # Remove the [ ] < and > delimiters.
    $id =~ tr/<>[]//d;

    # Replace symbols that might cause problems:
    #   _ and &  - awkward inside an identifier
    #   @        - avoid exposing something that looks like an address to
    #              spambots
    #   .        - NOTE(review): Greenstone document ids appear to use '.'
    #              to separate section numbers, so a dotted message id
    #              could be mistaken for a section reference - confirm.
    $id =~ s/[_&.\@]/-/g;

    return $id;
}
 …
   my ($charset, $textref) = @_;
+  if (!$$textref) {
+      # nothing to do!
+      return;
+  }
   # first get our character encoding name in the right form.
   $charset = "iso_8859_1" unless defined $charset;
   $charset=~tr/A-Z/a-z/;
   $charset=~s/\-/_/g;
   $charset=~s/gb2312/gb/;
+  $charset =~ tr/A-Z/a-z/; # lowercase
+  $charset =~ s/\-/_/g;
+  $charset =~ s/gb2312/gb/;
   # assumes EUC-KR, not ISO-2022 !?
+  $charset=~s/ks_c_5601_1987/korean/;
+  if ($charset eq "utf_8" || !$$textref) {
+      # nothing to do!
+  $charset =~ s/^ks_c_5601_1987/korean/;
+  if ($charset eq 'utf_8') {$charset='utf8'}
+  my $outhandle = $self->{'outhandle'};
+  if ($charset eq "utf8") {
+      # no conversion needed, but lets check that it's valid utf8
+      # see utf-8 manpage for valid ranges
+      $$textref =~ m/^/g; # to set \G
+      my $badbytesfound=0;
+      while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
+      my $highbytes=$1;
+      my $highbyteslength=length($highbytes);
+      # replace any non utf8 compliant bytes
+      $highbytes =~ /^/g; # set pos()
+      while ($highbytes =~
+         m!\G (?: [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
+               [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
+               [\xf0-\xf7][\x80-\xbf]{3}   # 4 byte
+               [\xf8-\xfb][\x80-\xbf]{4}   # 5 byte
+               [\xfc-\xfd][\x80-\xbf]{5}   # 6 byte
+               )*([\x80-\xff])? !xg
+         ) {
+          my $badbyte=$1;
+          if (!defined $badbyte) {next} # hit end of string
+          my $pos=pos($highbytes);
+          substr($highbytes, $pos-1, 1, "\xc2\x80");
+          # update the position to continue searching (for \G)
+          pos($highbytes) = $pos+1; # set to just after the \x80
+          $badbytesfound=1;
+      }
+      if ($badbytesfound==1) {
+          # claims to be utf8, but it isn't!
+          print $outhandle "EMAILPlug: Headers claim utf-8 but bad bytes "
+          . "detected and removed.\n";
+          my $replength=length($highbytes);
+          my $textpos=pos($$textref);
+          # replace bad bytes with good bytes
+          substr( $$textref, $textpos-$replength,
+              $replength, $highbytes);
+          # update the position to continue searching (for \G)
+          pos($$textref)=$textpos+($replength-$highbyteslength);
+      }
+      }
       return;
+  }
 …
       # 1252 has characters between 0x80 and 0x9f, 8859-1 doesn't
       if ($$textref =~ m/[\x80-\x9f]/) {
-      my $outhandle = $self->{'outhandle'};
       print $outhandle "EMAILPlug: Headers claim ISO charset but MS ";
       print $outhandle "codepage 1252 detected.\n";
 …
# Choose the object identifier (OID) for a document.
#
# If the document object carries a usable 'msgid' entry (the escaped
# Message-ID stored while the headers were parsed), that becomes the OID,
# which gives each message a predictable id that other messages can link to.
# Otherwise the OID is built from $id and $segment_number.
#
# Parameters:
#   $doc_obj        - document object; must provide a set_OID method
#   $id             - fallback identifier supplied by the caller
#   $segment_number - segment number appended to the fallback identifier
# Returns: whatever $doc_obj->set_OID returns.
sub set_OID {
    my $self = shift;
    my ($doc_obj, $id, $segment_number) = @_;

    # A header such as "Message-ID: <>" escapes down to an empty string.
    # Treat that like a missing id instead of assigning an empty OID.
    my $msgid = $doc_obj->{'msgid'};
    if (defined $msgid && length $msgid) {
        return $doc_obj->set_OID($msgid);
    }

    return $doc_obj->set_OID("${id}_${segment_number}");
}
 # Perl packages have to return true if they are run.
 1;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 7703

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: