Ignore:
Timestamp:
2021-05-17T12:28:53+12:00 (3 years ago)
Author:
kjdon
Message:

Added code that handles UTF-16 surrogate-pair entities.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r32325 r35166  
    200200    # Convert entities to their UTF8 equivalents
    201201    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    202     $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
     202    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0,0)/gseo; # on this occassion, want it left as utf8 and we throw away utf16 surrogates (is that the right thing to do here?)
    203203    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
    204204
     
    16261626    # Convert entities to their Unicode code-point equivalents
    16271627    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    1628     $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
     1628    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1,1)/gseo; # leaves surrogate pairs as entities
     1629    $$textref =~ s/&\#([^;]+);&\#([^;]+);/&ghtml::desurrogate($1, $2)/gseo; # convert surrogate pairs
    16291630    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
    16301631
Note: See TracChangeset for help on using the changeset viewer.