Changeset 35167 for main/trunk/greenstone2/perllib/ghtml.pm
- Timestamp:
- 2021-05-17T12:34:22+12:00 (3 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/ghtml.pm
r30593 r35167 203 203 204 204 205 # This gets passed in 2 entities, with &# and ; stripped off. If they are a valid surrogate pair, 206 # it returns the character they represent 207 sub desurrogate { 208 my ($hi, $lo) = @_; 209 210 my $hi_code = undef; 211 my $lo_code = undef; 212 213 if ($hi =~ m/^0*(\d+)$/) { 214 $hi_code=$1; 215 } 216 elsif ($hi =~ m/^x([0-9A-F]+)$/i) { 217 $hi_code=hex($1); 218 } 219 if ($lo =~ m/^0*(\d+)$/) { 220 $lo_code=$1; 221 } 222 elsif ($lo =~ m/^x([0-9A-F]+)$/i) { 223 $lo_code=hex($1); 224 } 225 if (!defined $hi_code && !defined $lo_code) { 226 # wasn't proper surrogate pair 227 print STDERR "WARNING, &#$hi; &#$lo; is not a valid surrogate pair, returning '?'\n"; 228 return "?"; 229 230 } 231 #([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}]) 232 if($hi_code >= 0xD800 && $hi_code <= 0xDBFF && $lo_code >= 0xDC00 && $lo_code <= 0xDFFF) { 233 #print STDERR "Found surrogate pair $hi_code, $lo_code\n"; 234 my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00); 235 my $char_equiv = &unicode::unicode2utf8([$codepoint]); 236 $char_equiv = Encode::decode("utf8",$char_equiv); 237 return $char_equiv; 238 } else { 239 print STDERR "WARNING, &#$hi_code; &#$lo_code; is not a valid surrogate pair, returning '?'\n"; 240 return "?"; 241 } 242 } 243 244 #If you want to remove surrogate pairs before you process all the other entities, then you need more complicated lookahead system, to handle when the two entities you are looking at are not the pair. 245 # leaving this here for future reference, but its not used currently 246 # (?= is lookahead, can return capturing groups, but won't be consumed by a match 247 # (?:....)? non-capturing group that is optional 248 #$$textref =~ s/&\#([^;]+);(?=(?:&\#([^;]+);)?)/&ghtml::desurrogate($1,$2,1)/gseo; 249 250 # returns a surroage pair. assumes &# and ; have been stripped off the entity 251 # optional lookahead to get $lo 252 my $in_surrogate = 0; 253 sub preprocess_desurrogate_NOTUSED { 254 my ($hi, $lo, $and_decode) = @_; 255 print STDERR "in ghtml::desurrogate, $hi"; if (defined $lo) {print STDERR " $lo";} print STDERR "\n"; 256 my $hi_code = undef; 257 my $lo_code = undef; 258 259 if ($in_surrogate) { # consume the second entity of the surrogate 260 $in_surrogate = 0; 261 return ""; 262 } 263 if (!defined $lo) { # we are not part of a pair 264 return "&#$hi;"; 265 } 266 if ($hi =~ m/^0*(\d+)$/) { 267 $hi_code=$1; 268 } 269 elsif ($hi =~ m/^x([0-9A-F]+)$/i) { 270 $hi_code=hex($1); 271 } 272 # are we the first part of a surrogate? 273 if (!defined $hi_code || !($hi_code >= 0xD800 && $hi_code <= 0xDFFF)) { 274 # no, return the original 275 return "&#$hi;"; 276 } 277 # check the second part - is that a surrogate part? 278 if ($lo =~ m/^0*(\d+)$/) { 279 $lo_code=$1; 280 } 281 elsif ($lo =~ m/^x([0-9A-F]+)$/i) { 282 $lo_code=hex($1); 283 } 284 285 if (!defined $lo_code || !($lo_code>= 0xD800 && $lo_code <= 0xDFFF)) { 286 # not part of a surrogate 287 return "&#$hi;"; 288 } 289 290 my $char_equiv = undef; 291 my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00); 292 $char_equiv = &unicode::unicode2utf8([$codepoint]); 293 294 295 if (!defined $char_equiv) { 296 return "&#$hi;"; 297 } 298 else { 299 if ((defined $and_decode) && ($and_decode)) { 300 $char_equiv = Encode::decode("utf8",$char_equiv); 301 } 302 $in_surrogate=1; 303 print STDERR "found surrogate\n"; 304 return $char_equiv; 305 } 306 } 205 307 # returns the character as a raw utf-8 character. It assumes that the 206 308 # & and ; have been stripped off the string. 309 # If and_decode is true, it returns the codepoint instead of utf8 310 # If keep_surrogates is true, leave the surrogate entities as is - for later processing with desurrogate. 207 311 sub getcharequiv { 208 my ($entity, $convertsymbols, $and_decode) = @_; 209 312 my ($entity, $convertsymbols, $and_decode, $keep_surrogates) = @_; 313 314 $keep_surrogates = 0 unless defined $keep_surrogates; 210 315 my $char_equiv = undef; 211 316 … … 218 323 $code=hex($1); 219 324 } 220 221 325 222 326 if (defined $code) { 223 327 224 # malformed UTF-8 character used in UTF-16328 # UTF-16 surrogate pairs 225 329 if($code >= 0xD800 && $code <= 0xDFFF) { 226 print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). Replacing with '?'.\n"; 227 $code = ord("?"); 330 print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). "; 331 if ($keep_surrogates) { 332 print STDERR "Leaving as entity\n"; 333 return "&$entity;"; 334 } 335 else { 336 print STDERR "Replacing with '?'.\n"; 337 $code = ord("?"); 338 } 228 339 } 229 340
Note:
See TracChangeset
for help on using the changeset viewer.