Context Navigation

← Previous Change
Next Change →

ghtml.pm

Timestamp:

2021-05-17T12:34:22+12:00 (3 years ago)

Author:

kjdon

Message:

getcharequiv, which replaces entities with utf8 or characters, can now be told to keep surrogate entities, which can then be processed by desurrogate, which replaces a surrogate pair with the character

File:

: 1 edited

main/trunk/greenstone2/perllib/ghtml.pm (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/ghtml.pm

-              r30593
+              r35167
# Parses the body of a numeric character reference, i.e. the text between
# "&#" and ";". Accepts decimal ("55357", leading zeros allowed) and
# hexadecimal ("xD83D", case-insensitive) forms.
# Returns the value as a number, or undef (in scalar context) when the
# argument is undefined or is not a numeric reference.
sub _surrogate_entity_code {
    my ($entity) = @_;

    return unless defined $entity;
    return $1 + 0  if $entity =~ m/^0*(\d+)$/;
    return hex($1) if $entity =~ m/^x([0-9A-F]+)$/i;
    return;
}

# This gets passed in 2 entities, with &# and ; stripped off. If they are a
# valid UTF-16 surrogate pair (high half 0xD800-0xDBFF followed by low half
# 0xDC00-0xDFFF), it returns the single character they represent, as a
# decoded Perl character string.
# In every other case (either entity missing, not numeric, or out of range)
# it prints a warning to STDERR and returns "?".
sub desurrogate {
    my ($hi, $lo) = @_;

    my $hi_code = _surrogate_entity_code($hi);
    my $lo_code = _surrogate_entity_code($lo);

    # Reject as soon as EITHER half failed to parse; comparing an undefined
    # code numerically below would only produce "uninitialized" warnings.
    if (!defined $hi_code || !defined $lo_code) {
        my $hi_text = defined $hi ? $hi : "";
        my $lo_text = defined $lo ? $lo : "";
        print STDERR "WARNING, &#$hi_text; &#$lo_text; is not a valid surrogate pair, returning '?'\n";
        return "?";
    }

    # ([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])
    if ($hi_code < 0xD800 || $hi_code > 0xDBFF || $lo_code < 0xDC00 || $lo_code > 0xDFFF) {
        print STDERR "WARNING, &#$hi_code; &#$lo_code; is not a valid surrogate pair, returning '?'\n";
        return "?";
    }

    # Standard UTF-16 decoding: 10 bits from each half, offset by 0x10000.
    # The result always lies in U+10000..U+10FFFF, so chr() yields the
    # character directly, with no UTF-8 encode/decode round trip.
    my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
    return chr($codepoint);
}
# If you want to remove surrogate pairs before you process all the other
# entities, then you need a more complicated lookahead system, to handle the
# case where the two entities you are looking at are not a pair.
# Leaving this here for future reference, but it's not used currently.
#   (?=...)    is a lookahead: it can contain capturing groups, but the text
#              it matches is not consumed by the match
#   (?:....)?  is an optional non-capturing group
#$$textref =~ s/&\#([^;]+);(?=(?:&\#([^;]+);)?)/&ghtml::desurrogate($1,$2,1)/gseo;
# NOTE(review): the substitution above passes three arguments, which matches
# the signature of preprocess_desurrogate_NOTUSED below rather than the
# two-argument desurrogate - confirm the intended target before enabling it.

# Set to 1 once a surrogate pair has been converted. Because the lookahead
# does not consume the second entity, the sub is called again for it; this
# flag makes that next call return the empty string.
my $in_surrogate = 0;

# Replaces the first entity of a surrogate pair with the character the pair
# represents. Assumes &# and ; have been stripped off the entities.
#   $hi         - the entity currently being replaced
#   $lo         - the following entity, captured by the optional lookahead;
#                 undef when there is none
#   $and_decode - if true, the UTF-8 bytes from unicode::unicode2utf8 are
#                 decoded with Encode::decode before being returned
# Returns the converted character when $hi/$lo form a valid high/low surrogate
# pair, "" when called for the second entity of a pair that was just
# converted, and otherwise the original entity "&#$hi;" unchanged.
# Prints debugging output to STDERR on every call.
sub preprocess_desurrogate_NOTUSED {
    my ($hi, $lo, $and_decode) = @_;

    print STDERR "in ghtml::desurrogate, $hi";
    print STDERR " $lo" if defined $lo;
    print STDERR "\n";

    if ($in_surrogate) { # consume the second entity of the surrogate
        $in_surrogate = 0;
        return "";
    }

    # we are not part of a pair
    return "&#$hi;" unless defined $lo;

    my $hi_code = undef;
    if ($hi =~ m/^0*(\d+)$/) {
        $hi_code = $1;
    }
    elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
        $hi_code = hex($1);
    }

    # Are we the first part of a surrogate? Only a HIGH surrogate
    # (0xD800-0xDBFF) can start a pair; accepting the whole surrogate range
    # here would turn reversed or doubled halves into a wrong code point.
    if (!defined $hi_code || $hi_code < 0xD800 || $hi_code > 0xDBFF) {
        # no, return the original
        return "&#$hi;";
    }

    # check the second part - it must be a LOW surrogate (0xDC00-0xDFFF)
    my $lo_code = undef;
    if ($lo =~ m/^0*(\d+)$/) {
        $lo_code = $1;
    }
    elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
        $lo_code = hex($1);
    }
    if (!defined $lo_code || $lo_code < 0xDC00 || $lo_code > 0xDFFF) {
        # not part of a surrogate
        return "&#$hi;";
    }

    # Standard UTF-16 decoding: 10 bits from each half, offset by 0x10000.
    my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
    my $char_equiv = &unicode::unicode2utf8([$codepoint]);
    return "&#$hi;" unless defined $char_equiv;

    if ($and_decode) {
        $char_equiv = Encode::decode("utf8", $char_equiv);
    }
    $in_surrogate = 1;
    print STDERR "found surrogate\n";
    return $char_equiv;
}
 # returns the character as a raw utf-8 character. It assumes that the
 # & and ; have been stripped off the string.
+# If and_decode is true, it returns the codepoint instead of utf8
+# If keep_surrogates is true, leave the surrogate entities as is - for later processing with desurrogate.
 sub getcharequiv {
+    my ($entity, $convertsymbols, $and_decode) = @_;
+    my ($entity, $convertsymbols, $and_decode, $keep_surrogates) = @_;
+    $keep_surrogates = 0 unless defined $keep_surrogates;
     my $char_equiv = undef;
 …
     $code=hex($1);
+    }
     if (defined $code) {
     # malformed UTF-8 character used in UTF-16
+    # UTF-16 surrogate pairs
     if($code >= 0xD800 && $code <= 0xDFFF) {
+        print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). Replacing with '?'.\n";
+        $code = ord("?");
+        print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). ";
+        if ($keep_surrogates) {
+        print STDERR "Leaving as entity\n";
+        return  "&$entity;";
+        }
+        else {
+        print STDERR "Replacing with '?'.\n";
+        $code = ord("?");
+        }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 35167 for main/trunk/greenstone2/perllib/ghtml.pm

Legend:

main/trunk/greenstone2/perllib/ghtml.pm

Download in other formats: