- Timestamp:
- 2002-06-25T20:15:26+12:00 (22 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r2803 → r3181 (columns: old line, new line, change marker)
461 461     # 2. Split the remaining text into space-delimited tokens
462 462
463     -   # Convert any HTML special characters (like &quot;) to their UTF8 equivalent
464     -   s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse;
    463 +   # Convert entities to their UTF8 equivalents
    464 +   s/&([^;]+);/&ghtml::getcharequiv($1,1)/gse;
465 465
466 466     # Split text at word boundaries
trunk/gsdl/perllib/ghtml.pm
r2994 → r3181 (columns: old line, new line, change marker)
 27  27
 28  28     package ghtml;
     29 +
     30 +   use unicode;
 29  31
 30  32     # htmlsafe(TEXT)
  …   …
142 144
143 145
144     -   # returns the character in the standard html font. It assumes that the
    146 +   # returns the character as a raw utf-8 character. It assumes that the
145 147     # & and ; have been stripped off the string.
146 148     sub getcharequiv {
147 149         my ($entity, $convertsymbols) = @_;
148 150
149     -       # replace &#8218; with comma (",")
150     -       $entity =~ s/8218/044/;
151     -
152 151         # a numeric entity
153 152         if ($entity =~ /^\#0*(\d+)/) {
154     -           return pack("c", $1);
    153 +           return &unicode::unicode2utf8([$1]);
155 154         }
156 155
157 156         # a named character entity
158 157         if (defined $charnetosf{$entity}) {
159     -           return pack("c", $charnetosf{$entity});
    158 +           return &unicode::unicode2utf8([$charnetosf{$entity}]);
160 159         }
161 160
162 161         # a named symbol entity
163 162         if ($convertsymbols && defined $symnetosf{$entity}) {
164     -           return pack("c", $symnetosf{$entity});
    163 +           return &unicode::unicode2utf8([$symnetosf{$entity}]);
165 164         }
166 165
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r3148 → r3181 (columns: old line, new line, change marker)
676 676         &BasPlug::read_file($self, $filename, $encoding, $language, $textref);
677 677
678     -
679     -       # Convert things like &eacute; to their UTF8 equivalents
    678 +       # Convert entities to their UTF8 equivalents
680 679         $$textref =~ s/&(lt|gt|amp|quot);/&z$1;/go;
681     -       $$textref =~ s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gseo;
    680 +       $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1)/gseo;
682 681         $$textref =~ s/&z(lt|gt|amp|quot);/&$1;/go;
683 682     }
Note: See TracChangeset for help on using the changeset viewer.