Changeset 35165
- Timestamp:
- 2021-05-17T12:26:40+12:00 (3 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/unicode.pm
r34393 r35165 125 125 } 126 126 127 # utf-8 SCHEME: 128 #Char. number range | UTF-8 bytes/octets sequence 129 # (hexadecimal) | (binary) 130 #--------------------+------------------------------------ 131 #0000 0000 - 0000 007F | 0xxxxxxx 132 #0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx 133 #0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 134 #0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 135 136 137 # 4 byte code from a web page https://www.perl.com/article/building-a-utf-8-encoder-in-perl/ 127 138 # unicode2utf8 takes a unicode array as input and encodes it 128 139 # using utf-8 … … 133 144 foreach my $num (@$in) { 134 145 next unless defined $num; 135 if ($num < 0x80) { 146 if ($num < 0x80) {# 10 000 000 147 # no transformation needed 136 148 $out .= chr ($num); 137 149 138 } elsif ($num < 0x800) { 139 $out .= chr (0xc0 + (($num >> 6) & 0x1f)); 140 $out .= chr (0x80 + ($num & 0x3f)); 141 142 } elsif ($num < 0xFFFF) { 143 $out .= chr (0xe0 + (($num >> 12) & 0xf)); 144 $out .= chr (0x80 + (($num >> 6) & 0x3f)); 145 $out .= chr (0x80 + ($num & 0x3f)); 146 147 } else { 148 # error, don't encode anything 149 #die; 150 # Diego's bugfix: instead of aborting the import process, it 151 # is better to get a converted file with a few extra spaces 152 print STDERR "strange char: $num\n"; 153 $out .= " "; 150 } elsif ($num < 0x800) {# 100 000 000 000 (0x80 to 0x7ff) 151 # populate bitmask 110xxxxx 10xxxxxx with the code point 152 $out .= chr (0xc0 + (($num >> 6) & 0x1f)); # top 5 bits go in first byte ($num >> 6 | 0b11000000) 153 $out .= chr (0x80 + ($num & 0x3f)); # bottom 6 bit go in second byte ($num & 0b00111111 | 0b10000000) 154 155 } elsif ($num < 0x10000) {# 10 000 000 000 000 000 (0x800 to 0xffff) 156 # populate bitmask 1110xxxx 10xxxxxx 10xxxxxx 157 $out .= chr (0xe0 + (($num >> 12) & 0xf)); # top 4 bits ($num >> 12 | 0b11100000) 158 $out .= chr (0x80 + (($num >> 6) & 0x3f)); # next 6 bits ($num >> 6 & 0b00111111 | 0b10000000) 159 $out .= chr (0x80 + ($num & 0x3f)); # last 6 bits ($num & 0b00111111 | 0b10000000) 160 161 } else { # (0x10000 to 0x10ffff) 162 #populate bitmask 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 163 $out .= chr(($num >> 18 )|0b11110000); 164 $out .= chr(($num >> 12) & 0b00111111 | 0b10000000); 165 $out .= chr(($num >> 6 ) & 0b00111111 | 0b10000000); 166 $out .= chr ($num & 0b00111111 | 0b10000000); 154 167 155 168 } … … 157 170 return $out; 158 171 } 172 159 173 160 174 # utf82unicode takes a utf-8 string and produces a unicode … … 169 183 170 184 my $i = 0; 171 my ($c1, $c2, $c3 );185 my ($c1, $c2, $c3, $c4); 172 186 my $len = length($in); 173 187 while ($i < $len) { … … 204 218 } 205 219 220 } elsif ($c1 >= 0xf0 && $i+3 < $len) { 221 # an encoded character with four bytes 222 $c2 = ord (substr ($in, $i+1, 1)); 223 $c3 = ord (substr ($in, $i+2, 1)); 224 $c4 = ord (substr ($in, $i+2, 1)); 225 if ($c2 >= 0x80 && $c2 < 0xc0 && 226 $c3 >= 0x80 && $c3 < 0xc0 && 227 $c4 >= 0x80 && $c4 < 0xc0) { 228 # everything looks ok 229 push (@$out, ((($c1 & 0x7) << 18) + 230 (($c2 & 0x3f) << 12) + 231 (($c3 & 0x3f) << 6) + 232 ($c4 & 0x3f))); 233 $i+= 3; # gobbled an extra 3 bytes 234 } 206 235 } else { 207 # error, only decode Unicode characters not full UCS. 236 237 # error 208 238 # Do nothing. 209 239 }
Note:
See TracChangeset
for help on using the changeset viewer.