Changeset 3181

Show
Ignore:
Timestamp:
25.06.2002 20:15:26 (17 years ago)
Author:
sjboddie
Message:

Altered the getcharequiv() function so it now converts entities to raw
UTF-8 characters. This should now work for entities like &prod; (i.e.
characters outside of the Latin-1 character set).

Location:
trunk/gsdl/perllib
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r2803 r3181  
    461461    # 2. Split the remaining text into space-delimited tokens 
    462462 
    463     # Convert any HTML special characters (like &quot;) to their UTF8 equivalent 
    464     s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse; 
     463    # Convert entities to their UTF8 equivalents 
     464    s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse; 
    465465 
    466466    # Split text at word boundaries 
  • trunk/gsdl/perllib/ghtml.pm

    r2994 r3181  
    2727 
    2828package ghtml; 
     29 
     30use unicode; 
    2931 
    3032# htmlsafe(TEXT) 
     
    142144 
    143145 
    144 # returns the character in the standard html font. It assumes that the 
     146# returns the character as a raw utf-8 character. It assumes that the 
    145147# & and ; have been stripped off the string. 
    146148sub getcharequiv { 
    147149    my ($entity, $convertsymbols) = @_; 
    148150 
    149     # replace &#8218 with comma (",") 
    150     $entity =~ s/8218/044/; 
    151  
    152151    # a numeric entity 
    153152    if ($entity =~ /^\#0*(\d+)/) { 
    154     return pack("c", $1); 
     153    return &unicode::unicode2utf8([$1]); 
    155154    } 
    156155     
    157156    # a named character entity 
    158157    if (defined $charnetosf{$entity}) { 
    159     return pack("c", $charnetosf{$entity}); 
     158    return &unicode::unicode2utf8([$charnetosf{$entity}]); 
    160159    } 
    161160 
    162161    # a named symbol entity 
    163162    if ($convertsymbols && defined $symnetosf{$entity}) { 
    164     return pack("c", $symnetosf{$entity}); 
     163    return &unicode::unicode2utf8([$symnetosf{$entity}]); 
    165164    } 
    166165 
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r3148 r3181  
    676676    &BasPlug::read_file($self, $filename, $encoding, $language, $textref); 
    677677 
    678  
    679     # Convert things like &eacute; to their UTF8 equivalents 
     678    # Convert entities to their UTF8 equivalents 
    680679    $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go; 
    681     $$textref =~ s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gseo; 
     680    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo; 
    682681    $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go; 
    683682}