1 | # $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $
|
---|
2 | package encoding;
|
---|
3 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
4 |
|
---|
5 | use Encode;
|
---|
6 | use strict;
|
---|
7 |
|
---|
8 | sub DEBUG () { 0 }
|
---|
9 |
|
---|
10 | BEGIN {
|
---|
11 | if (ord("A") == 193) {
|
---|
12 | require Carp;
|
---|
13 | Carp::croak("encoding: pragma does not support EBCDIC platforms");
|
---|
14 | }
|
---|
15 | }
|
---|
16 |
|
---|
17 | our $HAS_PERLIO = 0;
|
---|
18 | eval { require PerlIO::encoding };
|
---|
19 | unless ($@){
|
---|
20 | $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02);
|
---|
21 | }
|
---|
22 |
|
---|
23 | sub _exception{
|
---|
24 | my $name = shift;
|
---|
25 | $] > 5.008 and return 0; # 5.8.1 or higher then no
|
---|
26 | my %utfs = map {$_=>1}
|
---|
27 | qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
|
---|
28 | UTF-32 UTF-32BE UTF-32LE);
|
---|
29 | $utfs{$name} or return 0; # UTFs or no
|
---|
30 | require Config; Config->import(); our %Config;
|
---|
31 | return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no
|
---|
32 | }
|
---|
33 |
|
---|
34 | sub in_locale { $^H & ($locale::hint_bits || 0)}
|
---|
35 |
|
---|
36 | sub _get_locale_encoding {
|
---|
37 | my $locale_encoding;
|
---|
38 |
|
---|
39 | # I18N::Langinfo isn't available everywhere
|
---|
40 | eval {
|
---|
41 | require I18N::Langinfo;
|
---|
42 | I18N::Langinfo->import(qw(langinfo CODESET));
|
---|
43 | $locale_encoding = langinfo(CODESET());
|
---|
44 | };
|
---|
45 |
|
---|
46 | my $country_language;
|
---|
47 |
|
---|
48 | no warnings 'uninitialized';
|
---|
49 |
|
---|
50 | if (not $locale_encoding && in_locale()) {
|
---|
51 | if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) {
|
---|
52 | ($country_language, $locale_encoding) = ($1, $2);
|
---|
53 | } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) {
|
---|
54 | ($country_language, $locale_encoding) = ($1, $2);
|
---|
55 | }
|
---|
56 | # LANGUAGE affects only LC_MESSAGES only on glibc
|
---|
57 | } elsif (not $locale_encoding) {
|
---|
58 | if ($ENV{LC_ALL} =~ /\butf-?8\b/i ||
|
---|
59 | $ENV{LANG} =~ /\butf-?8\b/i) {
|
---|
60 | $locale_encoding = 'utf8';
|
---|
61 | }
|
---|
62 | # Could do more heuristics based on the country and language
|
---|
63 | # parts of LC_ALL and LANG (the parts before the dot (if any)),
|
---|
64 | # since we have Locale::Country and Locale::Language available.
|
---|
65 | # TODO: get a database of Language -> Encoding mappings
|
---|
66 | # (the Estonian database at http://www.eki.ee/letter/
|
---|
67 | # would be excellent!) --jhi
|
---|
68 | }
|
---|
69 | if (defined $locale_encoding &&
|
---|
70 | lc($locale_encoding) eq 'euc' &&
|
---|
71 | defined $country_language) {
|
---|
72 | if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) {
|
---|
73 | $locale_encoding = 'euc-jp';
|
---|
74 | } elsif ($country_language =~ /^ko_KR|korean?$/i) {
|
---|
75 | $locale_encoding = 'euc-kr';
|
---|
76 | } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) {
|
---|
77 | $locale_encoding = 'euc-cn';
|
---|
78 | } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) {
|
---|
79 | $locale_encoding = 'euc-tw';
|
---|
80 | } else {
|
---|
81 | require Carp;
|
---|
82 | Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous");
|
---|
83 | }
|
---|
84 | }
|
---|
85 |
|
---|
86 | return $locale_encoding;
|
---|
87 | }
|
---|
88 |
|
---|
89 | sub import {
|
---|
90 | my $class = shift;
|
---|
91 | my $name = shift;
|
---|
92 | if ($name eq ':_get_locale_encoding') { # used by lib/open.pm
|
---|
93 | my $caller = caller();
|
---|
94 | {
|
---|
95 | no strict 'refs';
|
---|
96 | *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
|
---|
97 | }
|
---|
98 | return;
|
---|
99 | }
|
---|
100 | $name = _get_locale_encoding() if $name eq ':locale';
|
---|
101 | my %arg = @_;
|
---|
102 | $name = $ENV{PERL_ENCODING} unless defined $name;
|
---|
103 | my $enc = find_encoding($name);
|
---|
104 | unless (defined $enc) {
|
---|
105 | require Carp;
|
---|
106 | Carp::croak("encoding: Unknown encoding '$name'");
|
---|
107 | }
|
---|
108 | $name = $enc->name; # canonize
|
---|
109 | unless ($arg{Filter}) {
|
---|
110 | DEBUG and warn "_exception($name) = ", _exception($name);
|
---|
111 | _exception($name) or ${^ENCODING} = $enc;
|
---|
112 | $HAS_PERLIO or return 1;
|
---|
113 | }else{
|
---|
114 | defined(${^ENCODING}) and undef ${^ENCODING};
|
---|
115 | # implicitly 'use utf8'
|
---|
116 | require utf8; # to fetch $utf8::hint_bits;
|
---|
117 | $^H |= $utf8::hint_bits;
|
---|
118 | eval {
|
---|
119 | require Filter::Util::Call ;
|
---|
120 | Filter::Util::Call->import ;
|
---|
121 | filter_add(sub{
|
---|
122 | my $status = filter_read();
|
---|
123 | if ($status > 0){
|
---|
124 | $_ = $enc->decode($_, 1);
|
---|
125 | DEBUG and warn $_;
|
---|
126 | }
|
---|
127 | $status ;
|
---|
128 | });
|
---|
129 | };
|
---|
130 | $@ eq '' and DEBUG and warn "Filter installed";
|
---|
131 | }
|
---|
132 | defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
|
---|
133 | for my $h (qw(STDIN STDOUT)){
|
---|
134 | if ($arg{$h}){
|
---|
135 | unless (defined find_encoding($arg{$h})) {
|
---|
136 | require Carp;
|
---|
137 | Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'");
|
---|
138 | }
|
---|
139 | eval { binmode($h, ":raw :encoding($arg{$h})") };
|
---|
140 | }else{
|
---|
141 | unless (exists $arg{$h}){
|
---|
142 | eval {
|
---|
143 | no warnings 'uninitialized';
|
---|
144 | binmode($h, ":raw :encoding($name)");
|
---|
145 | };
|
---|
146 | }
|
---|
147 | }
|
---|
148 | if ($@){
|
---|
149 | require Carp;
|
---|
150 | Carp::croak($@);
|
---|
151 | }
|
---|
152 | }
|
---|
153 | return 1; # I doubt if we need it, though
|
---|
154 | }
|
---|
155 |
|
---|
156 | sub unimport{
|
---|
157 | no warnings;
|
---|
158 | undef ${^ENCODING};
|
---|
159 | if ($HAS_PERLIO){
|
---|
160 | binmode(STDIN, ":raw");
|
---|
161 | binmode(STDOUT, ":raw");
|
---|
162 | }else{
|
---|
163 | binmode(STDIN);
|
---|
164 | binmode(STDOUT);
|
---|
165 | }
|
---|
166 | if ($INC{"Filter/Util/Call.pm"}){
|
---|
167 | eval { filter_del() };
|
---|
168 | }
|
---|
169 | }
|
---|
170 |
|
---|
171 | 1;
|
---|
172 | __END__
|
---|
173 |
|
---|
174 | =pod
|
---|
175 |
|
---|
176 | =head1 NAME
|
---|
177 |
|
---|
178 | encoding - allows you to write your script in non-ascii or non-utf8
|
---|
179 |
|
---|
180 | =head1 SYNOPSIS
|
---|
181 |
|
---|
182 | use encoding "greek"; # Perl like Greek to you?
|
---|
183 | use encoding "euc-jp"; # Jperl!
|
---|
184 |
|
---|
185 | # or you can even do this if your shell supports your native encoding
|
---|
186 |
|
---|
187 | perl -Mencoding=latin2 -e '...' # Feeling centrally European?
|
---|
188 | perl -Mencoding=euc-kr -e '...' # Or Korean?
|
---|
189 |
|
---|
190 | # more control
|
---|
191 |
|
---|
192 | # A simple euc-cn => utf-8 converter
|
---|
193 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
|
---|
194 |
|
---|
195 | # "no encoding;" supported (but not scoped!)
|
---|
196 | no encoding;
|
---|
197 |
|
---|
198 | # an alternate way, Filter
|
---|
199 | use encoding "euc-jp", Filter=>1;
|
---|
200 | # now you can use kanji identifiers -- in euc-jp!
|
---|
201 |
|
---|
202 | # switch on locale -
|
---|
203 | # note that this probably means that unless you have a complete control
|
---|
204 | # over the environments the application is ever going to be run, you should
|
---|
205 | # NOT use the feature of encoding pragma allowing you to write your script
|
---|
206 | # in any recognized encoding because changing locale settings will wreck
|
---|
207 | # the script; you can of course still use the other features of the pragma.
|
---|
208 | use encoding ':locale';
|
---|
209 |
|
---|
210 | =head1 ABSTRACT
|
---|
211 |
|
---|
212 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode
|
---|
213 | support. You could apply C<substr()> and regexes even to complex CJK
|
---|
214 | characters -- so long as the script was written in UTF-8. But back
|
---|
215 | then, text editors that supported UTF-8 were still rare and many users
|
---|
216 | instead chose to write scripts in legacy encodings, giving up a whole
|
---|
217 | new feature of Perl 5.6.
|
---|
218 |
|
---|
219 | Rewind to the future: starting from perl 5.8.0 with the B<encoding>
|
---|
220 | pragma, you can write your script in any encoding you like (so long
|
---|
221 | as the C<Encode> module supports it) and still enjoy Unicode support.
|
---|
222 | This pragma achieves that by doing the following:
|
---|
223 |
|
---|
224 | =over
|
---|
225 |
|
---|
226 | =item *
|
---|
227 |
|
---|
228 | Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
|
---|
229 | the encoding specified to utf8. In Perl 5.8.1 and later, literals in
|
---|
230 | C<tr///> and C<DATA> pseudo-filehandle are also converted.
|
---|
231 |
|
---|
232 | =item *
|
---|
233 |
|
---|
234 | Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
|
---|
235 | specified.
|
---|
236 |
|
---|
237 | =back
|
---|
238 |
|
---|
239 | =head2 Literal Conversions
|
---|
240 |
|
---|
241 | You can write code in EUC-JP as follows:
|
---|
242 |
|
---|
243 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
|
---|
244 | #<-char-><-char-> # 4 octets
|
---|
245 | s/\bCamel\b/$Rakuda/;
|
---|
246 |
|
---|
247 | And with C<use encoding "euc-jp"> in effect, it is the same thing as
|
---|
248 | the code in UTF-8:
|
---|
249 |
|
---|
250 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
|
---|
251 | s/\bCamel\b/$Rakuda/;
|
---|
252 |
|
---|
253 | =head2 PerlIO layers for C<STD(IN|OUT)>
|
---|
254 |
|
---|
255 | The B<encoding> pragma also modifies the filehandle layers of
|
---|
256 | STDIN and STDOUT to the specified encoding. Therefore,
|
---|
257 |
|
---|
258 | use encoding "euc-jp";
|
---|
259 | my $message = "Camel is the symbol of perl.\n";
|
---|
260 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
|
---|
261 | $message =~ s/\bCamel\b/$Rakuda/;
|
---|
262 | print $message;
|
---|
263 |
|
---|
264 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
|
---|
265 | not "\x{99F1}\x{99DD} is the symbol of perl.\n".
|
---|
266 |
|
---|
267 | You can override this by giving extra arguments; see below.
|
---|
268 |
|
---|
269 | =head2 Implicit upgrading for byte strings
|
---|
270 |
|
---|
271 | By default, if strings operating under byte semantics and strings
|
---|
272 | with Unicode character data are concatenated, the new string will
|
---|
273 | be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
|
---|
274 |
|
---|
275 | The B<encoding> pragma changes this to use the specified encoding
|
---|
276 | instead. For example:
|
---|
277 |
|
---|
278 | use encoding 'utf8';
|
---|
279 | my $string = chr(20000); # a Unicode string
|
---|
280 | utf8::encode($string); # now it's a UTF-8 encoded byte string
|
---|
281 | # concatenate with another Unicode string
|
---|
282 | print length($string . chr(20000));
|
---|
283 |
|
---|
284 | Will print C<2>, because C<$string> is upgraded as UTF-8. Without
|
---|
285 | C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
|
---|
286 | is three octets when interpreted as Latin-1.
|
---|
287 |
|
---|
288 | =head1 FEATURES THAT REQUIRE 5.8.1
|
---|
289 |
|
---|
290 | Some of the features offered by this pragma requires perl 5.8.1. Most
|
---|
291 | of these are done by Inaba Hiroto. Any other features and changes
|
---|
292 | are good for 5.8.0.
|
---|
293 |
|
---|
294 | =over
|
---|
295 |
|
---|
296 | =item "NON-EUC" doublebyte encodings
|
---|
297 |
|
---|
298 | Because perl needs to parse script before applying this pragma, such
|
---|
299 | encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
|
---|
300 | \x5c) in the second byte fails because the second byte may
|
---|
301 | accidentally escape the quoting character that follows. Perl 5.8.1
|
---|
302 | or later fixes this problem.
|
---|
303 |
|
---|
304 | =item tr//
|
---|
305 |
|
---|
306 | C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
|
---|
307 | See the section below for details.
|
---|
308 |
|
---|
309 | =item DATA pseudo-filehandle
|
---|
310 |
|
---|
311 | Another feature that was overlooked was C<DATA>.
|
---|
312 |
|
---|
313 | =back
|
---|
314 |
|
---|
315 | =head1 USAGE
|
---|
316 |
|
---|
317 | =over 4
|
---|
318 |
|
---|
319 | =item use encoding [I<ENCNAME>] ;
|
---|
320 |
|
---|
321 | Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE}
|
---|
322 | exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
|
---|
323 | ":encoding(I<ENCNAME>)".
|
---|
324 |
|
---|
325 | Note that STDERR WILL NOT be changed.
|
---|
326 |
|
---|
327 | Also note that non-STD file handles remain unaffected. Use C<use
|
---|
328 | open> or C<binmode> to change layers of those.
|
---|
329 |
|
---|
330 | If no encoding is specified, the environment variable L<PERL_ENCODING>
|
---|
331 | is consulted. If no encoding can be found, the error C<Unknown encoding
|
---|
332 | 'I<ENCNAME>'> will be thrown.
|
---|
333 |
|
---|
334 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
|
---|
335 |
|
---|
336 | You can also individually set encodings of STDIN and STDOUT via the
|
---|
337 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the
|
---|
338 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding
|
---|
339 | completely off.
|
---|
340 |
|
---|
341 | When ${^UNICODE} exists and non-zero, these options will completely
|
---|
342 | ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See
|
---|
343 | L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
|
---|
344 | details (perl 5.8.1 and later).
|
---|
345 |
|
---|
346 | =item use encoding I<ENCNAME> Filter=E<gt>1;
|
---|
347 |
|
---|
348 | This turns the encoding pragma into a source filter. While the
|
---|
349 | default approach just decodes interpolated literals (in qq() and
|
---|
350 | qr()), this will apply a source filter to the entire source code. See
|
---|
351 | L</"The Filter Option"> below for details.
|
---|
352 |
|
---|
353 | =item no encoding;
|
---|
354 |
|
---|
355 | Unsets the script encoding. The layers of STDIN, STDOUT are
|
---|
356 | reset to ":raw" (the default unprocessed raw stream of bytes).
|
---|
357 |
|
---|
358 | =back
|
---|
359 |
|
---|
360 | =head1 The Filter Option
|
---|
361 |
|
---|
362 | The magic of C<use encoding> is not applied to the names of
|
---|
363 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human
|
---|
364 | is a single Han ideograph) work, you still need to write your script
|
---|
365 | in UTF-8 -- or use a source filter. That's what 'Filter=>1' does.
|
---|
366 |
|
---|
367 | What does this mean? Your source code behaves as if it is written in
|
---|
368 | UTF-8 with 'use utf8' in effect. So even if your editor only supports
|
---|
369 | Shift_JIS, for example, you can still try examples in Chapter 15 of
|
---|
370 | C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8
|
---|
371 | identifiers.
|
---|
372 |
|
---|
373 | This option is significantly slower and (as of this writing) non-ASCII
|
---|
374 | identifiers are not very stable WITHOUT this option and with the
|
---|
375 | source code written in UTF-8.
|
---|
376 |
|
---|
377 | =head2 Filter-related changes at Encode version 1.87
|
---|
378 |
|
---|
379 | =over
|
---|
380 |
|
---|
381 | =item *
|
---|
382 |
|
---|
383 | The Filter option now sets STDIN and STDOUT like non-filter options.
|
---|
384 | And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
|
---|
385 | non-filter version.
|
---|
386 |
|
---|
387 | =item *
|
---|
388 |
|
---|
389 | C<use utf8> is implicitly declared so you no longer have to C<use
|
---|
390 | utf8> to C<${"\x{4eba}"}++>.
|
---|
391 |
|
---|
392 | =back
|
---|
393 |
|
---|
394 | =head1 CAVEATS
|
---|
395 |
|
---|
396 | =head2 NOT SCOPED
|
---|
397 |
|
---|
398 | The pragma is a per script, not a per block lexical. Only the last
|
---|
399 | C<use encoding> or C<no encoding> matters, and it affects
|
---|
400 | B<the whole script>. However, the <no encoding> pragma is supported and
|
---|
401 | B<use encoding> can appear as many times as you want in a given script.
|
---|
402 | The multiple use of this pragma is discouraged.
|
---|
403 |
|
---|
404 | By the same reason, the use this pragma inside modules is also
|
---|
405 | discouraged (though not as strongly discouraged as the case above.
|
---|
406 | See below).
|
---|
407 |
|
---|
408 | If you still have to write a module with this pragma, be very careful
|
---|
409 | of the load order. See the codes below;
|
---|
410 |
|
---|
411 | # called module
|
---|
412 | package Module_IN_BAR;
|
---|
413 | use encoding "bar";
|
---|
414 | # stuff in "bar" encoding here
|
---|
415 | 1;
|
---|
416 |
|
---|
417 | # caller script
|
---|
418 | use encoding "foo"
|
---|
419 | use Module_IN_BAR;
|
---|
420 | # surprise! use encoding "bar" is in effect.
|
---|
421 |
|
---|
422 | The best way to avoid this oddity is to use this pragma RIGHT AFTER
|
---|
423 | other modules are loaded. i.e.
|
---|
424 |
|
---|
425 | use Module_IN_BAR;
|
---|
426 | use encoding "foo";
|
---|
427 |
|
---|
428 | =head2 DO NOT MIX MULTIPLE ENCODINGS
|
---|
429 |
|
---|
430 | Notice that only literals (string or regular expression) having only
|
---|
431 | legacy code points are affected: if you mix data like this
|
---|
432 |
|
---|
433 | \xDF\x{100}
|
---|
434 |
|
---|
435 | the data is assumed to be in (Latin 1 and) Unicode, not in your native
|
---|
436 | encoding. In other words, this will match in "greek":
|
---|
437 |
|
---|
438 | "\xDF" =~ /\x{3af}/
|
---|
439 |
|
---|
440 | but this will not
|
---|
441 |
|
---|
442 | "\xDF\x{100}" =~ /\x{3af}\x{100}/
|
---|
443 |
|
---|
444 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
|
---|
445 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
|
---|
446 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You
|
---|
447 | should not be mixing your legacy data and Unicode in the same string.
|
---|
448 |
|
---|
449 | This pragma also affects encoding of the 0x80..0xFF code point range:
|
---|
450 | normally characters in that range are left as eight-bit bytes (unless
|
---|
451 | they are combined with characters with code points 0x100 or larger,
|
---|
452 | in which case all characters need to become UTF-8 encoded), but if
|
---|
453 | the C<encoding> pragma is present, even the 0x80..0xFF range always
|
---|
454 | gets UTF-8 encoded.
|
---|
455 |
|
---|
456 | After all, the best thing about this pragma is that you don't have to
|
---|
457 | resort to \x{....} just to spell your name in a native encoding.
|
---|
458 | So feel free to put your strings in your encoding in quotes and
|
---|
459 | regexes.
|
---|
460 |
|
---|
461 | =head2 tr/// with ranges
|
---|
462 |
|
---|
463 | The B<encoding> pragma works by decoding string literals in
|
---|
464 | C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this
|
---|
465 | does not apply to C<tr///>. Therefore,
|
---|
466 |
|
---|
467 | use encoding 'euc-jp';
|
---|
468 | #....
|
---|
469 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
|
---|
470 | # -------- -------- -------- --------
|
---|
471 |
|
---|
472 | Does not work as
|
---|
473 |
|
---|
474 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
|
---|
475 |
|
---|
476 | =over
|
---|
477 |
|
---|
478 | =item Legend of characters above
|
---|
479 |
|
---|
480 | utf8 euc-jp charnames::viacode()
|
---|
481 | -----------------------------------------
|
---|
482 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
|
---|
483 | \x{3093} \xA4\xF3 HIRAGANA LETTER N
|
---|
484 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
|
---|
485 | \x{30f3} \xA5\xF3 KATAKANA LETTER N
|
---|
486 |
|
---|
487 | =back
|
---|
488 |
|
---|
489 | This counterintuitive behavior has been fixed in perl 5.8.1.
|
---|
490 |
|
---|
491 | =head3 workaround to tr///;
|
---|
492 |
|
---|
493 | In perl 5.8.0, you can work around as follows;
|
---|
494 |
|
---|
495 | use encoding 'euc-jp';
|
---|
496 | # ....
|
---|
497 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
|
---|
498 |
|
---|
499 | Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind
|
---|
500 | is the same as classic idiom that makes C<tr///> 'interpolate'.
|
---|
501 |
|
---|
502 | tr/$from/$to/; # wrong!
|
---|
503 | eval qq{ tr/$from/$to/ }; # workaround.
|
---|
504 |
|
---|
505 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
|
---|
506 | C<tr///> not being decoded was obviously against the will of Perl5
|
---|
507 | Porters so it has been fixed in Perl 5.8.1 or later.
|
---|
508 |
|
---|
509 | =head1 EXAMPLE - Greekperl
|
---|
510 |
|
---|
511 | use encoding "iso 8859-7";
|
---|
512 |
|
---|
513 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
|
---|
514 |
|
---|
515 | $a = "\xDF";
|
---|
516 | $b = "\x{100}";
|
---|
517 |
|
---|
518 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
|
---|
519 |
|
---|
520 | $c = $a . $b;
|
---|
521 |
|
---|
522 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
|
---|
523 |
|
---|
524 | # chr() is affected, and ...
|
---|
525 |
|
---|
526 | print "mega\n" if ord(chr(0xdf)) == 0x3af;
|
---|
527 |
|
---|
528 | # ... ord() is affected by the encoding pragma ...
|
---|
529 |
|
---|
530 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
|
---|
531 |
|
---|
532 | # ... as are eq and cmp ...
|
---|
533 |
|
---|
534 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf);
|
---|
535 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0;
|
---|
536 |
|
---|
537 | # ... but pack/unpack C are not affected, in case you still
|
---|
538 | # want to go back to your native encoding
|
---|
539 |
|
---|
540 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
|
---|
541 |
|
---|
542 | =head1 KNOWN PROBLEMS
|
---|
543 |
|
---|
544 | =over
|
---|
545 |
|
---|
546 | =item literals in regex that are longer than 127 bytes
|
---|
547 |
|
---|
548 | For native multibyte encodings (either fixed or variable length),
|
---|
549 | the current implementation of the regular expressions may introduce
|
---|
550 | recoding errors for regular expression literals longer than 127 bytes.
|
---|
551 |
|
---|
552 | =item EBCDIC
|
---|
553 |
|
---|
554 | The encoding pragma is not supported on EBCDIC platforms.
|
---|
555 | (Porters who are willing and able to remove this limitation are
|
---|
556 | welcome.)
|
---|
557 |
|
---|
558 | =item format
|
---|
559 |
|
---|
560 | This pragma doesn't work well with format because PerlIO does not
|
---|
561 | get along very well with it. When format contains non-ascii
|
---|
562 | characters it prints funny or gets "wide character warnings".
|
---|
563 | To understand it, try the code below.
|
---|
564 |
|
---|
565 | # Save this one in utf8
|
---|
566 | # replace *non-ascii* with a non-ascii string
|
---|
567 | my $camel;
|
---|
568 | format STDOUT =
|
---|
569 | *non-ascii*@>>>>>>>
|
---|
570 | $camel
|
---|
571 | .
|
---|
572 | $camel = "*non-ascii*";
|
---|
573 | binmode(STDOUT=>':encoding(utf8)'); # bang!
|
---|
574 | write; # funny
|
---|
575 | print $camel, "\n"; # fine
|
---|
576 |
|
---|
577 | Without binmode this happens to work but without binmode, print()
|
---|
578 | fails instead of write().
|
---|
579 |
|
---|
580 | At any rate, the very use of format is questionable when it comes to
|
---|
581 | unicode characters since you have to consider such things as character
|
---|
582 | width (i.e. double-width for ideographs) and directions (i.e. BIDI for
|
---|
583 | Arabic and Hebrew).
|
---|
584 |
|
---|
585 | =back
|
---|
586 |
|
---|
587 | =head2 The Logic of :locale
|
---|
588 |
|
---|
589 | The logic of C<:locale> is as follows:
|
---|
590 |
|
---|
591 | =over 4
|
---|
592 |
|
---|
593 | =item 1.
|
---|
594 |
|
---|
595 | If the platform supports the langinfo(CODESET) interface, the codeset
|
---|
596 | returned is used as the default encoding for the open pragma.
|
---|
597 |
|
---|
598 | =item 2.
|
---|
599 |
|
---|
600 | If 1. didn't work but we are under the locale pragma, the environment
|
---|
601 | variables LC_ALL and LANG (in that order) are matched for encodings
|
---|
602 | (the part after C<.>, if any), and if any found, that is used
|
---|
603 | as the default encoding for the open pragma.
|
---|
604 |
|
---|
605 | =item 3.
|
---|
606 |
|
---|
607 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG
|
---|
608 | (in that order) are matched for anything looking like UTF-8, and if
|
---|
609 | any found, C<:utf8> is used as the default encoding for the open
|
---|
610 | pragma.
|
---|
611 |
|
---|
612 | =back
|
---|
613 |
|
---|
614 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
|
---|
615 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
|
---|
616 | the default encoding of your STDIN, STDOUT, and STDERR, and of
|
---|
617 | B<any subsequent file open>, is UTF-8.
|
---|
618 |
|
---|
619 | =head1 HISTORY
|
---|
620 |
|
---|
621 | This pragma first appeared in Perl 5.8.0. For features that require
|
---|
622 | 5.8.1 and better, see above.
|
---|
623 |
|
---|
624 | The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
|
---|
625 |
|
---|
626 | =head1 SEE ALSO
|
---|
627 |
|
---|
628 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
|
---|
629 |
|
---|
630 | Ch. 15 of C<Programming Perl (3rd Edition)>
|
---|
631 | by Larry Wall, Tom Christiansen, Jon Orwant;
|
---|
632 | O'Reilly & Associates; ISBN 0-596-00027-8
|
---|
633 |
|
---|
634 | =cut
|
---|