Context Navigation

← Previous Changeset
Next Changeset →

Changeset 35165

Timestamp:

2021-05-17T12:26:40+12:00 (3 years ago)

Author:

kjdon

Message:

Updated the to-UTF-8 and from-UTF-8 conversion methods to handle 4-byte UTF-8 characters; emojis fall in this range.

File:

: 1 edited

main/trunk/greenstone2/perllib/unicode.pm (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/unicode.pm

-              r34393
+              r35165
+}
+# utf-8 SCHEME:
+#Char. number range  |     UTF-8 bytes/octets sequence
+#   (hexadecimal)    |              (binary)
+#--------------------+------------------------------------
+#0000 0000 - 0000 007F | 0xxxxxxx
+#0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+#0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+#0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+# The 4-byte encoding code is adapted from https://www.perl.com/article/building-a-utf-8-encoder-in-perl/
 # unicode2utf8 takes a unicode array as input and encodes it
 # using utf-8
 …
     foreach my $num (@$in) {
     next unless defined $num;
+    if ($num < 0x80) {
+    if ($num < 0x80) {# 10 000 000
+        # no transformation needed
         $out .= chr ($num);
+    } elsif ($num < 0x800) {
+        $out .= chr (0xc0 + (($num >> 6) & 0x1f));
+        $out .= chr (0x80 + ($num & 0x3f));
+    } elsif ($num < 0xFFFF) {
+        $out .= chr (0xe0 + (($num >> 12) & 0xf));
+        $out .= chr (0x80 + (($num >> 6) & 0x3f));
+        $out .= chr (0x80 + ($num & 0x3f));
+    } else {
+        # error, don't encode anything
+        #die;
+        # Diego's bugfix: instead of aborting the import process, it
+        # is better to get a converted file with a few extra spaces
+        print STDERR "strange char: $num\n";
+        $out .= " ";
+    } elsif ($num < 0x800) {# 100 000 000 000  (0x80 to 0x7ff)
+        # populate bitmask 110xxxxx 10xxxxxx with the code point
+        $out .= chr (0xc0 + (($num >> 6) & 0x1f)); # top 5 bits go in first byte ($num >> 6 | 0b11000000)
+        $out .= chr (0x80 + ($num & 0x3f)); # bottom 6 bits go in second byte ($num & 0b00111111 | 0b10000000)
+    } elsif ($num < 0x10000) {# 10 000 000 000 000 000  (0x800 to 0xffff)
+        # populate bitmask 1110xxxx 10xxxxxx 10xxxxxx
+        $out .= chr (0xe0 + (($num >> 12) & 0xf)); # top 4 bits ($num >> 12 | 0b11100000)
+        $out .= chr (0x80 + (($num >> 6) & 0x3f)); # next 6 bits ($num >> 6 & 0b00111111 | 0b10000000)
+        $out .= chr (0x80 + ($num & 0x3f)); # last 6 bits ($num & 0b00111111 | 0b10000000)
+    } else { # (0x10000 to 0x10ffff)
+        #populate bitmask 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        $out .= chr(($num >> 18 )|0b11110000);
+        $out .= chr(($num >> 12) & 0b00111111 | 0b10000000);
+        $out .= chr(($num >> 6 ) & 0b00111111 | 0b10000000);
+        $out .= chr ($num & 0b00111111 | 0b10000000);
+    }
 …
     return $out;
+}
 # utf82unicode takes a utf-8 string and produces a unicode
 …
     my $i = 0;
     my ($c1, $c2, $c3);
+    my ($c1, $c2, $c3, $c4);
     my $len = length($in);
     while ($i < $len) {
 …
+        }
+    } elsif ($c1 >= 0xf0 && $i+3 < $len) {
+        # an encoded character with four bytes
+        $c2 = ord (substr ($in, $i+1, 1));
+        $c3 = ord (substr ($in, $i+2, 1));
+        $c4 = ord (substr ($in, $i+2, 1));
+        if ($c2 >= 0x80 && $c2 < 0xc0 &&
+        $c3 >= 0x80 && $c3 < 0xc0 &&
+        $c4 >= 0x80 && $c4 < 0xc0) {
+        # everything looks ok
+        push (@$out, ((($c1 & 0x7) << 18) +
+                  (($c2 & 0x3f) << 12) +
+                  (($c3 & 0x3f) << 6) +
+                  ($c4 & 0x3f)));
+        $i+= 3; # gobbled an extra 3 bytes
+        }
     } else {
+        # error, only decode Unicode characters not full UCS.
+        # error
         # Do nothing.
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 35165

Legend:

main/trunk/greenstone2/perllib/unicode.pm

Download in other formats: