Ignore:
Timestamp:
2021-05-17T12:34:22+12:00 (3 years ago)
Author:
kjdon
Message:

getcharequiv, which replaces entities with utf8 or characters, can now be told to keep surrogate entities, which can then be processed by desurrogate, which replaces a surrogate pair with the character

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/ghtml.pm

    r30593 r35167  
    203203
    204204
     205# This gets passed in 2 entities, with &# and ; stripped off. If they are a valid surrogate pair,
     206# it returns the character they represent
     207sub desurrogate {
     208    my ($hi, $lo) = @_;
     209
     210    my $hi_code = undef;
     211    my $lo_code = undef;
     212   
     213    if ($hi =~ m/^0*(\d+)$/) {
     214    $hi_code=$1;
     215    }
     216    elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
     217    $hi_code=hex($1);
     218    }
     219    if ($lo =~ m/^0*(\d+)$/) {
     220    $lo_code=$1;
     221    }
     222    elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
     223    $lo_code=hex($1);
     224    }
     225    if (!defined $hi_code && !defined $lo_code) {
     226    # wasn't proper surrogate pair
     227    print STDERR "WARNING, &#$hi; &#$lo; is not a valid surrogate pair, returning '?'\n";
     228    return "?";
     229
     230    }
     231    #([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])
     232    if($hi_code >= 0xD800 && $hi_code <= 0xDBFF && $lo_code >= 0xDC00 && $lo_code <= 0xDFFF) {
     233    #print STDERR "Found surrogate pair $hi_code, $lo_code\n";
     234    my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
     235    my $char_equiv = &unicode::unicode2utf8([$codepoint]);
     236    $char_equiv = Encode::decode("utf8",$char_equiv);
     237    return $char_equiv;
     238    } else {
     239    print STDERR "WARNING, &#$hi_code; &#$lo_code; is not a valid surrogate pair, returning '?'\n";
     240    return "?";
     241    }
     242}
     243
     244#If you want to remove surrogate pairs before you process all the other entities, then you need more complicated lookahead system, to handle when the two entities you are looking at are not the pair.
     245# leaving this here for future reference, but its not used currently
     246# (?= is lookahead, can return capturing groups, but won't be consumed by a match
     247# (?:....)? non-capturing group that is optional
     248#$$textref =~ s/&\#([^;]+);(?=(?:&\#([^;]+);)?)/&ghtml::desurrogate($1,$2,1)/gseo;
     249
     250# returns a surroage pair. assumes &# and ; have been stripped off the entity
     251# optional lookahead to get $lo
     252my $in_surrogate = 0;
     253sub preprocess_desurrogate_NOTUSED {
     254    my ($hi, $lo, $and_decode) = @_;
     255    print STDERR "in ghtml::desurrogate, $hi"; if (defined $lo) {print STDERR " $lo";} print STDERR "\n";
     256    my $hi_code = undef;
     257    my $lo_code = undef;
     258
     259    if ($in_surrogate) { # consume the second entity of the surrogate
     260    $in_surrogate = 0;
     261    return "";
     262    }
     263    if (!defined $lo) { # we are not part of a pair
     264    return "&#$hi;";
     265    }
     266    if ($hi =~ m/^0*(\d+)$/) {
     267    $hi_code=$1;
     268    }
     269    elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
     270    $hi_code=hex($1);
     271    }
     272    # are we the first part of a surrogate?
     273    if (!defined $hi_code || !($hi_code >= 0xD800 && $hi_code <= 0xDFFF)) {
     274    # no, return the original
     275    return "&#$hi;";
     276    }
     277    # check the second part - is that a surrogate part?
     278    if ($lo =~ m/^0*(\d+)$/) {
     279    $lo_code=$1;
     280    }
     281    elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
     282    $lo_code=hex($1);
     283    }
     284
     285    if (!defined $lo_code || !($lo_code>= 0xD800 && $lo_code <= 0xDFFF)) {
     286    # not part of a surrogate
     287    return "&#$hi;";
     288    }
     289
     290    my $char_equiv = undef;
     291    my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
     292    $char_equiv = &unicode::unicode2utf8([$codepoint]);
     293   
     294   
     295    if (!defined $char_equiv) {
     296    return "&#$hi;";
     297    }
     298    else {
     299    if ((defined $and_decode) && ($and_decode)) {
     300        $char_equiv = Encode::decode("utf8",$char_equiv);
     301    }
     302    $in_surrogate=1;
     303    print STDERR "found surrogate\n";
     304    return $char_equiv;
     305    }
     306}
    205307# returns the character as a raw utf-8 character. It assumes that the
    206308# & and ; have been stripped off the string.
     309# If and_decode is true, it returns the codepoint instead of utf8
     310# If keep_surrogates is true, leave the surrogate entities as is - for later processing with desurrogate.
    207311sub getcharequiv {
    208     my ($entity, $convertsymbols, $and_decode) = @_;
    209 
     312    my ($entity, $convertsymbols, $and_decode, $keep_surrogates) = @_;
     313
     314    $keep_surrogates = 0 unless defined $keep_surrogates;
    210315    my $char_equiv = undef;
    211316
     
    218323    $code=hex($1);
    219324    }
    220 
    221 
     325   
    222326    if (defined $code) {
    223327   
    224     # malformed UTF-8 character used in UTF-16
     328    # UTF-16 surrogate pairs
    225329    if($code >= 0xD800 && $code <= 0xDFFF) {
    226         print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). Replacing with '?'.\n";
    227         $code = ord("?");
     330        print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). ";
     331        if ($keep_surrogates) {
     332        print STDERR "Leaving as entity\n";
     333        return  "&$entity;";
     334        }
     335        else {
     336        print STDERR "Replacing with '?'.\n";
     337        $code = ord("?");
     338        }
    228339    }
    229340
Note: See TracChangeset for help on using the changeset viewer.