1 | package Encode::Unicode;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 | use warnings;
|
---|
5 | no warnings 'redefine';
|
---|
6 |
|
---|
7 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
8 |
|
---|
9 | use XSLoader;
|
---|
10 | XSLoader::load(__PACKAGE__,$VERSION);
|
---|
11 |
|
---|
12 | #
|
---|
13 | # Object Generator 8 transcoders all at once!
|
---|
14 | #
|
---|
15 |
|
---|
16 | require Encode;
|
---|
17 |
|
---|
18 | our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32);
|
---|
19 |
|
---|
# Build and register one transcoder object per supported encoding name.
# Every object is stored in %Encode::Encoding under its own name and holds:
#   Name   - the encoding name exactly as listed below
#   size   - 2 for the UCS-2 and UTF-16 families, 4 for the UTF-32 family
#   endian - 'n' (BE) or 'v' (LE) for size 2, 'N' (BE) or 'V' (LE) for
#            size 4, and '' when the name has no BE/LE suffix
#            (NOTE(review): these look like pack() template letters; the
#            consumer is not visible here -- confirm)
#   ucs2   - true only for the UCS-2 variants
for my $name (qw(UTF-16 UTF-16BE UTF-16LE
                 UTF-32 UTF-32BE UTF-32LE
                 UCS-2BE UCS-2LE))
{
    # Capture into named lexicals and check the match.  Reading $1..$3
    # after an unchecked match would silently reuse the captures of an
    # earlier successful match if this one ever failed.
    my ($family, $bits, $order) = $name =~ /^(\w+)-(\d+)(\w*)$/
        or die "Cannot parse encoding name '$name'";

    my $ucs2 = ($family eq 'UCS');

    # The number in a UCS name is not a bit count, so the size is fixed
    # at 2 there; for the UTF names the number is divided by 8.
    my $size = $ucs2 ? 2 : $bits / 8;

    my $endian = ($order eq 'BE') ? 'n'
               : ($order eq 'LE') ? 'v'
               :                    '';
    # Size-4 encodings use the upper-case variant of the letter.
    $size == 4 and $endian = uc($endian);

    $Encode::Encoding{$name} =
        bless {
            Name   => $name,
            size   => $size,
            endian => $endian,
            ucs2   => $ucs2,
        } => __PACKAGE__;
}
|
---|
42 |
|
---|
43 | use base qw(Encode::Encoding);
|
---|
44 |
|
---|
# Return the encoding object to use for one encode/decode run.
#
# If the object's name is a key of %BOM_Unknown, the result is a shallow
# copy of the object, blessed into the same class, with its "renewed"
# counter incremented.  For any other name the object itself is returned
# unchanged.
sub renew {
    my $self = shift;
    return $self unless $BOM_Unknown{ $self->name };

    my $copy = bless { %{$self} }, ref $self;
    $copy->{renewed}++;    # lets the caller see that it received a copy
    return $copy;
}
|
---|
52 |
|
---|
53 | # There used to be a perl implementation of (en|de)code but with
|
---|
54 | # XS version is ripe, perl version is zapped for optimal speed
|
---|
55 |
|
---|
56 | *decode = \&decode_xs;
|
---|
57 | *encode = \&encode_xs;
|
---|
58 |
|
---|
59 | 1;
|
---|
60 | __END__
|
---|
61 |
|
---|
62 | =head1 NAME
|
---|
63 |
|
---|
64 | Encode::Unicode -- Various Unicode Transformation Formats
|
---|
65 |
|
---|
66 | =cut
|
---|
67 |
|
---|
68 | =head1 SYNOPSIS
|
---|
69 |
|
---|
70 | use Encode qw/encode decode/;
|
---|
71 | $ucs2 = encode("UCS-2BE", $utf8);
|
---|
72 | $utf8 = decode("UCS-2BE", $ucs2);
|
---|
73 |
|
---|
74 | =head1 ABSTRACT
|
---|
75 |
|
---|
76 | This module implements all Character Encoding Schemes of Unicode that
|
---|
77 | are officially documented by Unicode Consortium (except, of course,
|
---|
78 | for UTF-8, which is a native format in perl).
|
---|
79 |
|
---|
80 | =over 4
|
---|
81 |
|
---|
82 | =item L<http://www.unicode.org/glossary/> says:
|
---|
83 |
|
---|
84 | I<Character Encoding Scheme> A character encoding form plus byte
|
---|
85 | serialization. There are Seven character encoding schemes in Unicode:
|
---|
86 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
|
---|
87 | UTF-32LE (UCS-4LE), and UTF-7.
|
---|
88 |
|
---|
89 | Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of
|
---|
90 | Unicode's Character Encoding Scheme. It is separately implemented in
|
---|
91 | Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>.
|
---|
92 |
|
---|
93 | =item Quick Reference
|
---|
94 |
|
---|
95 | Decodes from ord(N) Encodes chr(N) to...
|
---|
96 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
|
---|
97 | ---------------+-----------------+------------------------------
|
---|
98 | UCS-2BE 2 N N is bogus Not Available
|
---|
99 | UCS-2LE 2 N N bogus Not Available
|
---|
100 | UTF-16 2/4 Y Y is S.P S.P BE/LE
|
---|
101 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
|
---|
102 | UTF-16LE 2/4 N Y S.P S.P 0x2ad8,0xcddf
|
---|
103 | UTF-32 4 Y - is bogus As is BE/LE
|
---|
104 | UTF-32BE 4 N - bogus As is 0x0001abcd
|
---|
105 | UTF-32LE 4 N - bogus As is 0xcdab0100
|
---|
106 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\xaf\x8d
|
---|
107 | ---------------+-----------------+------------------------------
|
---|
108 |
|
---|
109 | =back
|
---|
110 |
|
---|
111 | =head1 Size, Endianness, and BOM
|
---|
112 |
|
---|
113 | You can categorize these CES by 3 criteria: size of each character,
|
---|
114 | endianness, and Byte Order Mark.
|
---|
115 |
|
---|
116 | =head2 by size
|
---|
117 |
|
---|
118 | UCS-2 is a fixed-length encoding with each character taking 16 bits.
|
---|
119 | It B<does not> support I<surrogate pairs>. When a surrogate pair
|
---|
120 | is encountered during decode(), its place is filled with \x{FFFD}
|
---|
121 | if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
|
---|
122 | character whose ord value is larger than 0xFFFF is encountered,
|
---|
123 | its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
|
---|
124 | croaks if I<CHECK> is 1.
|
---|
125 |
|
---|
126 | UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
|
---|
127 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
|
---|
128 | following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
|
---|
129 | form a character. Bogus surrogates result in death. When \x{10000}
|
---|
130 | or above is encountered during encode(), it C<ensurrogate>s them and
|
---|
131 | pushes the surrogate pair to the output stream.
|
---|
132 |
|
---|
133 | UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
|
---|
134 | Since it is 32-bit, there is no need for I<surrogate pairs>.
|
---|
135 |
|
---|
136 | =head2 by endianness
|
---|
137 |
|
---|
138 | The first (and now failed) goal of Unicode was to map all character
|
---|
139 | repertoires into a fixed-length integer so that programmers are happy.
|
---|
140 | Since each character is either a I<short> or I<long> in C, you have to
|
---|
141 | pay attention to the endianness of each platform when you pass data
|
---|
142 | to one another.
|
---|
143 |
|
---|
144 | Anything marked as BE is Big Endian (or network byte order) and LE is
|
---|
145 | Little Endian (aka VAX byte order). For anything not marked either
|
---|
146 | BE or LE, a character called Byte Order Mark (BOM) indicating the
|
---|
147 | endianness is prepended to the string.
|
---|
148 |
|
---|
149 | CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless
|
---|
150 | and as of this writing Encode suite just leaves it as is (\x{FeFF}).
|
---|
151 |
|
---|
152 | =over 4
|
---|
153 |
|
---|
154 | =item BOM as integer when fetched in network byte order
|
---|
155 |
|
---|
156 | 16 32 bits/char
|
---|
157 | -------------------------
|
---|
158 | BE 0xFeFF 0x0000FeFF
|
---|
159 | LE 0xFFFe 0xFFFe0000
|
---|
160 | -------------------------
|
---|
161 |
|
---|
162 | =back
|
---|
163 |
|
---|
164 | This module handles the BOM as follows.
|
---|
165 |
|
---|
166 | =over 4
|
---|
167 |
|
---|
168 | =item *
|
---|
169 |
|
---|
170 | When BE or LE is explicitly stated as the name of encoding, BOM is
|
---|
171 | simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
|
---|
172 |
|
---|
173 | =item *
|
---|
174 |
|
---|
175 | When BE or LE is omitted during decode(), it checks if BOM is at the
|
---|
176 | beginning of the string; if one is found, the endianness is set to
|
---|
177 | what the BOM says. If no BOM is found, the routine dies.
|
---|
178 |
|
---|
179 | =item *
|
---|
180 |
|
---|
181 | When BE or LE is omitted during encode(), it returns a BE-encoded
|
---|
182 | string with BOM prepended. So when you want to encode a whole text
|
---|
183 | file, make sure you encode() the whole text at once, not line by line
|
---|
184 | or each line, not file, will have a BOM prepended.
|
---|
185 |
|
---|
186 | =item *
|
---|
187 |
|
---|
188 | C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
|
---|
189 | UCS-2 is already registered by IANA and others that way.
|
---|
190 |
|
---|
191 | =back
|
---|
192 |
|
---|
193 | =head1 Surrogate Pairs
|
---|
194 |
|
---|
195 | To say the least, surrogate pairs were the biggest mistake of the
|
---|
196 | Unicode Consortium. But according to the late Douglas Adams in I<The
|
---|
197 | Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
|
---|
198 | Universe was created. This has made a lot of people very angry and
|
---|
199 | been widely regarded as a bad move>. Their mistake was not of this
|
---|
200 | magnitude so let's forgive them.
|
---|
201 |
|
---|
202 | (I don't dare make any comparison with Unicode Consortium and the
|
---|
203 | Vogons here ;) Or, comparing Encode to Babel Fish is completely
|
---|
204 | appropriate -- if you can only stick this into your ear :)
|
---|
205 |
|
---|
206 | Surrogate pairs were born when the Unicode Consortium finally
|
---|
207 | admitted that 16 bits were not big enough to hold all the world's
|
---|
208 | character repertoires. But they already made UCS-2 16-bit. What
|
---|
209 | do we do?
|
---|
210 |
|
---|
211 | Back then, the range 0xD800-0xDFFF was not allocated. Let's split
|
---|
212 | that range in half and use the first half to represent the C<upper
|
---|
213 | half of a character> and the second half to represent the C<lower
|
---|
214 | half of a character>. That way, you can represent 1024 * 1024 =
|
---|
215 | 1048576 more characters. Now we can store character ranges up to
|
---|
216 | \x{10ffff} even with 16-bit encodings. This pair of half-character is
|
---|
217 | now called a I<surrogate pair> and UTF-16 is the name of the encoding
|
---|
218 | that embraces them.
|
---|
219 |
|
---|
220 | Here is a formula to ensurrogate a Unicode character \x{10000} and
|
---|
221 | above;
|
---|
222 |
|
---|
223 | $hi = int(($uni - 0x10000) / 0x400) + 0xD800;
|
---|
224 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
|
---|
225 |
|
---|
226 | And to desurrogate;
|
---|
227 |
|
---|
228 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
|
---|
229 |
|
---|
230 | Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
|
---|
231 | perl does not prohibit the use of characters within this range. To perl,
|
---|
232 | every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
|
---|
233 |
|
---|
234 | (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
|
---|
235 | integer support!
|
---|
236 |
|
---|
237 | =head1 Error Checking
|
---|
238 |
|
---|
239 | Unlike most encodings which accept various ways to handle errors,
|
---|
240 | Unicode encodings simply croak.
|
---|
241 |
|
---|
242 | % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
|
---|
243 | -e 'Encode::from_to($_, "utf16","shift_jis", 0); print'
|
---|
244 | UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
|
---|
245 | % perl -MEncode -e '$a = "BOM missing"' \
|
---|
246 | -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print'
|
---|
247 | UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
|
---|
248 |
|
---|
249 | Unlike other encodings where mappings are not one-to-one against
|
---|
250 | Unicode, UTFs are supposed to map 100% against one another. So Encode
|
---|
251 | is more strict on UTFs.
|
---|
252 |
|
---|
253 | Consider that "division by zero" of Encode :)
|
---|
254 |
|
---|
255 | =head1 SEE ALSO
|
---|
256 |
|
---|
257 | L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
|
---|
258 | L<http://www.unicode.org/unicode/faq/utf_bom.html>,
|
---|
259 |
|
---|
260 | RFC 2781 L<http://rfc.net/rfc2781.html>,
|
---|
261 |
|
---|
262 | The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html>
|
---|
263 |
|
---|
264 | Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
|
---|
265 | by Larry Wall, Tom Christiansen, Jon Orwant;
|
---|
266 | O'Reilly & Associates; ISBN 0-596-00027-8
|
---|
267 |
|
---|
268 | =cut
|
---|