Ignore:
Timestamp:
2005-11-02T17:33:24+13:00 (18 years ago)
Author:
jrm21
Message:

Moved the UTF-8 checking code into a separate function (maybe it should be
moved to perllib/unicode.pm). Replaced '.'s in document ids with '-' so that
Greenstone doesn't mistake part of an id for a document section.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r10827 r10834  
    547547        {
    548548            # Use the HTML version
    549             $part_text= $self->text_from_part($message_part);
     549            $part_text = $self->text_from_part($message_part);
    550550            $mimetype="text/html";
    551551            last;
     
    646646    $id =~ s![<>\[\]]!!g; # remove [ ] < and >
    647647    $id =~ s![_&]!-!g; # replace symbols that might cause problems
     648    $id =~ s!\.!-!g; # . means section to greenstone doc ids!
    648649    $id =~ s!@!-!g; # replace @ symbol, to avoid spambots
    649650    return $id;
     
    864865    my $part_header = shift;
    865866
     867
    866868    my $type="text/plain"; # default, overridden from part header
    867869    my $charset=undef;     # convert2unicode() will guess if necessary
     
    901903    }
    902904    }
     905
    903906    if ($type eq "text/html") {
    904907    # only get stuff between <body> tags, or <html> tags.
     
    914917    # convert to unicode
    915918    $self->convert2unicode($charset, \$text);
    916    
    917919    $text =~ s@_@\\_@g; # protect against GS macro language
    918920    return $text;
     
    974976}
    975977
     978# returns 0 if valid utf-8, 1 if invalid
     979sub is_utf8 {
     980    my $self = shift;
     981    my $textref = shift;
     982
     983    $$textref =~ m/^/g; # to set \G
     984    my $badbytesfound=0;
     985    while ($$textref =~ m!\G.*?([\x80-\xff]+)!sg) {
     986    my $highbytes=$1;
     987    my $highbyteslength=length($highbytes);
     988    # look for any bytes that are not utf8 compliant
     989    $highbytes =~ /^/g; # set pos()
     990    while ($highbytes =~
     991           m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
     992             [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
     993             [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
     994             [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
     995             [\xfc-\xfd][\x80-\xbf]{5}   # 6 byte
     996             )*([\x80-\xff])? !xg
     997           ) {
     998        my $badbyte=$1;
     999        if (!defined $badbyte) {next} # hit end of string
     1000        return 1;
     1001    }
     1002    }
     1003    return 0;
     1004}
     1005
     1006
     1007
    9761008sub convert2unicode {
    9771009  my $self = shift(@_);
     
    9851017  if (! defined $charset) {
    9861018      # check if we have valid utf-8
    987       if ($$textref =~ /^(?: [\0-\x7f]          | # ascii
    988               [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
    989               [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
    990               [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
    991               [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
    992               [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
    993               )+ /x) {
    994       $charset = "utf8";
    995       }
    996 
     1019      if ($self->is_utf8($textref)) { $charset = "utf8" }
    9971020
    9981021      # default to latin
Note: See TracChangeset for help on using the changeset viewer.