Changeset 35165


Ignore:
Timestamp:
2021-05-17T12:26:40+12:00 (3 years ago)
Author:
kjdon
Message:

updated to and from utf8 methods to handle 4 byte utf8 characters - emojis are in this space.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/unicode.pm

    r34393 r35165  
    125125}
    126126
     127# utf-8 SCHEME:
     128#Char. number range  |     UTF-8 bytes/octets sequence
     129#   (hexadecimal)    |              (binary)
     130#--------------------+------------------------------------
     131#0000 0000 - 0000 007F | 0xxxxxxx
     132#0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
     133#0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
     134#0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     135
     136
     137# 4 byte code from a web page https://www.perl.com/article/building-a-utf-8-encoder-in-perl/
    127138# unicode2utf8 takes a unicode array as input and encodes it
    128139# using utf-8
     
    133144    foreach my $num (@$in) {
    134145    next unless defined $num;
    135     if ($num < 0x80) {
     146    if ($num < 0x80) {# 10 000 000
     147        # no transformation needed
    136148        $out .= chr ($num);
    137149
    138     } elsif ($num < 0x800) {
    139         $out .= chr (0xc0 + (($num >> 6) & 0x1f));
    140         $out .= chr (0x80 + ($num & 0x3f));
    141 
    142     } elsif ($num < 0xFFFF) {
    143         $out .= chr (0xe0 + (($num >> 12) & 0xf));
    144         $out .= chr (0x80 + (($num >> 6) & 0x3f));
    145         $out .= chr (0x80 + ($num & 0x3f));
    146 
    147     } else {
    148         # error, don't encode anything
    149         #die;
    150         # Diego's bugfix: instead of aborting the import process, it
    151         # is better to get a converted file with a few extra spaces
    152         print STDERR "strange char: $num\n";
    153         $out .= " ";
     150    } elsif ($num < 0x800) {# 100 000 000 000  (0x80 to 0x7ff)
     151        # populate bitmask 110xxxxx 10xxxxxx with the code point
     152        $out .= chr (0xc0 + (($num >> 6) & 0x1f)); # top 5 bits go in first byte ($num >> 6 | 0b11000000)
     153        $out .= chr (0x80 + ($num & 0x3f)); # bottom 6 bit go in second byte ($num & 0b00111111 | 0b10000000)
     154
     155    } elsif ($num < 0x10000) {# 10 000 000 000 000 000  (0x800 to 0xffff)
     156        # populate bitmask 1110xxxx 10xxxxxx 10xxxxxx
     157        $out .= chr (0xe0 + (($num >> 12) & 0xf)); # top 4 bits ($num >> 12 | 0b11100000)
     158        $out .= chr (0x80 + (($num >> 6) & 0x3f)); # next 6 bits ($num >> 6 & 0b00111111 | 0b10000000)
     159        $out .= chr (0x80 + ($num & 0x3f)); # last 6 bits ($num & 0b00111111 | 0b10000000)
     160
     161    } else { # (0x10000 to 0x10ffff)
     162        #populate bitmask 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     163        $out .= chr(($num >> 18 )|0b11110000);
     164        $out .= chr(($num >> 12) & 0b00111111 | 0b10000000);
     165        $out .= chr(($num >> 6 ) & 0b00111111 | 0b10000000);
     166        $out .= chr ($num & 0b00111111 | 0b10000000);
    154167
    155168    }
     
    157170    return $out;
    158171}
     172
    159173
    160174# utf82unicode takes a utf-8 string and produces a unicode
     
    169183
    170184    my $i = 0;
    171     my ($c1, $c2, $c3);
     185    my ($c1, $c2, $c3, $c4);
    172186    my $len = length($in);
    173187    while ($i < $len) {
     
    204218        }
    205219
     220    } elsif ($c1 >= 0xf0 && $i+3 < $len) {
     221        # an encoded character with four bytes
     222        $c2 = ord (substr ($in, $i+1, 1));
     223        $c3 = ord (substr ($in, $i+2, 1));
     224        $c4 = ord (substr ($in, $i+2, 1));
     225        if ($c2 >= 0x80 && $c2 < 0xc0 &&
     226        $c3 >= 0x80 && $c3 < 0xc0 &&
     227        $c4 >= 0x80 && $c4 < 0xc0) {
     228        # everything looks ok
     229        push (@$out, ((($c1 & 0x7) << 18) +
     230                  (($c2 & 0x3f) << 12) +
     231                  (($c3 & 0x3f) << 6) +
     232                  ($c4 & 0x3f)));
     233        $i+= 3; # gobbled an extra 3 bytes
     234        }
    206235    } else {
    207         # error, only decode Unicode characters not full UCS.
     236       
     237        # error
    208238        # Do nothing.
    209239    }
Note: See TracChangeset for help on using the changeset viewer.