Changeset 3181 for trunk


Ignore:
Timestamp:
2002-06-25T20:15:26+12:00 (22 years ago)
Author:
sjboddie
Message:

Altered the getcharequiv() function so that it now converts entities to raw
UTF-8 characters. This should now work for entities like &prod; (i.e.
characters outside of the Latin-1 character set).

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/classify/phind.pm

    r2803 r3181  
    461461    # 2. Split the remaining text into space-delimited tokens
    462462
    463     # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
    464     s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
     463    # Convert entities to their UTF8 equivalents
     464    s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
    465465
    466466    # Split text at word boundaries
  • trunk/gsdl/perllib/ghtml.pm

    r2994 r3181  
    2727
    2828package ghtml;
     29
     30use unicode;
    2931
    3032# htmlsafe(TEXT)
     
    142144
    143145
    144 # returns the character in the standard html font. It assumes that the
     146# returns the character as a raw utf-8 character. It assumes that the
    145147# & and ; have been stripped off the string.
    146148sub getcharequiv {
    147149    my ($entity, $convertsymbols) = @_;
    148150
    149     # replace &#8218 with comma (",")
    150     $entity =~ s/8218/044/;
    151 
    152151    # a numeric entity
    153152    if ($entity =~ /^\#0*(\d+)/) {
    154     return pack("c", $1);
     153    return &unicode::unicode2utf8([$1]);
    155154    }
    156155   
    157156    # a named character entity
    158157    if (defined $charnetosf{$entity}) {
    159     return pack("c", $charnetosf{$entity});
     158    return &unicode::unicode2utf8([$charnetosf{$entity}]);
    160159    }
    161160
    162161    # a named symbol entity
    163162    if ($convertsymbols && defined $symnetosf{$entity}) {
    164     return pack("c", $symnetosf{$entity});
     163    return &unicode::unicode2utf8([$symnetosf{$entity}]);
    165164    }
    166165
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r3148 r3181  
    676676    &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
    677677
    678 
    679     # Convert things like &eacute; to their UTF8 equivalents
     678    # Convert entities to their UTF8 equivalents
    680679    $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go;
    681     $$textref =~ s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gseo;
     680    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
    682681    $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go;
    683682}
Note: See TracChangeset for help on using the changeset viewer.