Context Navigation

← Previous Changeset
Next Changeset →

Changeset 10834

Timestamp:

2005-11-02T17:33:24+13:00 (18 years ago)

Author:

jrm21

Message:

Moved utf8 checking code into a separate function (maybe it should be
moved to perllib/unicode.pm). Removed '.'s from document ids so that Greenstone
doesn't mistake part of an id for a document section.

File:

: 1 edited

trunk/gsdl/perllib/plugins/EMAILPlug.pm (modified) (7 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r10827
+              r10834
+        {
             # Use the HTML version
             $part_text= $self->text_from_part($message_part);
+            $part_text = $self->text_from_part($message_part);
             $mimetype="text/html";
             last;
 …
     $id =~ s![<>\[\]]!!g; # remove [ ] < and >
     $id =~ s![_&]!-!g; # replace symbols that might cause problems
+    $id =~ s!\.!-!g; # . means section to greenstone doc ids!
     $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
     return $id;
 …
     my $part_header = shift;
     my $type="text/plain"; # default, overridden from part header
     my $charset=undef;     # convert2unicode() will guess if necessary
 …
+    }
+    }
     if ($type eq "text/html") {
     # only get stuff between <body> tags, or <html> tags.
 …
     # convert to unicode
     $self->convert2unicode($charset, \$text);
     $text =~ s@_@\\_@g; # protect against GS macro language
     return $text;
 …
+}
# Check whether a byte string is valid utf-8.
#
# Arguments:
#   $self    - the plugin object (not consulted by the check itself)
#   $textref - reference to the text to inspect; the text is not modified
#
# Returns 1 if every byte in the range 0x80-0xff is part of a complete
# multi-byte sequence (so empty and pure-ascii text count as valid), and
# 0 as soon as a stray or truncated sequence is found.  The result is a
# plain boolean, so it can be used directly as "if ($self->is_utf8(...))".
#
# The check is structural only: like the historical code it accepts the
# old 5 and 6 byte forms and does not reject overlong encodings.
sub is_utf8 {
    my $self = shift;
    my $textref = shift;

    # A lead byte followed by exactly the number of continuation bytes
    # that the lead byte announces.
    my $sequence = qr{ [\xc0-\xdf][\x80-\xbf]    |  # 2 byte utf-8
                       [\xe0-\xef][\x80-\xbf]{2} |  # 3 byte
                       [\xf0-\xf7][\x80-\xbf]{3} |  # 4 byte
                       [\xf8-\xfb][\x80-\xbf]{4} |  # 5 byte
                       [\xfc-\xfd][\x80-\xbf]{5}    # 6 byte
                     }x;

    # Only runs of high bytes need checking: in valid utf-8 each such run
    # is a whole number of multi-byte sequences.  Looking at one run at a
    # time also avoids applying a single huge quantified pattern to the
    # entire text.
    pos($$textref) = 0; # always scan from the start of the text
    while ($$textref =~ m/([\x80-\xff]+)/g) {
        my $highbytes = $1;

        # Consume sequences in bounded batches so that a very long run
        # (e.g. text with no ascii at all) cannot hit the regex engine's
        # iteration limit for quantified groups.  With /c a failed match
        # leaves pos() where the last successful batch ended.
        pos($highbytes) = 0;
        1 while ($highbytes =~ m/\G(?:$sequence){1,1000}/gc);

        if (pos($highbytes) != length($highbytes)) {
            # Bytes are left over that do not form a complete sequence.
            pos($$textref) = undef; # don't leak match state to the caller
            return 0;
        }
    }

    # The failed final match above has already reset pos() on the text.
    return 1;
}
 sub convert2unicode {
   my $self = shift(@_);
 …
   if (! defined $charset) {
       # check if we have valid utf-8
+      if ($$textref =~ /^(?: [\0-\x7f]          | # ascii
+              [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
+              [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
+              [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
+              [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
+              [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
+              )+ /x) {
+      $charset = "utf8";
+      }
+      if ($self->is_utf8($textref)) { $charset = "utf8" }
       # default to latin

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 10834

Legend:

trunk/gsdl/perllib/plugins/EMAILPlug.pm

Download in other formats: