[27174] | 1 | package Encode::Locale;
|
---|
| 2 |
|
---|
| 3 | use strict;
|
---|
| 4 | our $VERSION = "1.03";
|
---|
| 5 |
|
---|
| 6 | use base 'Exporter';
|
---|
| 7 | our @EXPORT_OK = qw(
|
---|
| 8 | decode_argv env
|
---|
| 9 | $ENCODING_LOCALE $ENCODING_LOCALE_FS
|
---|
| 10 | $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
|
---|
| 11 | );
|
---|
| 12 |
|
---|
| 13 | use Encode ();
|
---|
| 14 | use Encode::Alias ();
|
---|
| 15 |
|
---|
| 16 | our $ENCODING_LOCALE;
|
---|
| 17 | our $ENCODING_LOCALE_FS;
|
---|
| 18 | our $ENCODING_CONSOLE_IN;
|
---|
| 19 | our $ENCODING_CONSOLE_OUT;
|
---|
| 20 |
|
---|
| 21 | sub DEBUG () { 0 }
|
---|
| 22 |
|
---|
| 23 | sub _init {
|
---|
| 24 | if ($^O eq "MSWin32") {
|
---|
| 25 | unless ($ENCODING_LOCALE) {
|
---|
| 26 | # Try to obtain what the Windows ANSI code page is
|
---|
| 27 | eval {
|
---|
| 28 | unless (defined &GetACP) {
|
---|
| 29 | require Win32::API;
|
---|
| 30 | Win32::API->Import('kernel32', 'int GetACP()');
|
---|
| 31 | };
|
---|
| 32 | if (defined &GetACP) {
|
---|
| 33 | my $cp = GetACP();
|
---|
| 34 | $ENCODING_LOCALE = "cp$cp" if $cp;
|
---|
| 35 | }
|
---|
| 36 | };
|
---|
| 37 | }
|
---|
| 38 |
|
---|
| 39 | unless ($ENCODING_CONSOLE_IN) {
|
---|
| 40 | # If we have the Win32::Console module installed we can ask
|
---|
| 41 | # it for the code set to use
|
---|
| 42 | eval {
|
---|
| 43 | require Win32::Console;
|
---|
| 44 | my $cp = Win32::Console::InputCP();
|
---|
| 45 | $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
|
---|
| 46 | $cp = Win32::Console::OutputCP();
|
---|
| 47 | $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
|
---|
| 48 | };
|
---|
| 49 | # Invoking the 'chcp' program might also work
|
---|
| 50 | if (!$ENCODING_CONSOLE_IN && (qx(chcp) || '') =~ /^Active code page: (\d+)/) {
|
---|
| 51 | $ENCODING_CONSOLE_IN = "cp$1";
|
---|
| 52 | }
|
---|
| 53 | }
|
---|
| 54 | }
|
---|
| 55 |
|
---|
| 56 | unless ($ENCODING_LOCALE) {
|
---|
| 57 | eval {
|
---|
| 58 | require I18N::Langinfo;
|
---|
| 59 | $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
|
---|
| 60 |
|
---|
| 61 | # Workaround of Encode < v2.25. The "646" encoding alias was
|
---|
| 62 | # introduced in Encode-2.25, but we don't want to require that version
|
---|
| 63 | # quite yet. Should avoid the CPAN testers failure reported from
|
---|
| 64 | # openbsd-4.7/perl-5.10.0 combo.
|
---|
| 65 | $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
|
---|
| 66 |
|
---|
| 67 | # https://rt.cpan.org/Ticket/Display.html?id=66373
|
---|
| 68 | $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
|
---|
| 69 | };
|
---|
| 70 | $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 | if ($^O eq "darwin") {
|
---|
| 74 | $ENCODING_LOCALE_FS ||= "UTF-8";
|
---|
| 75 | }
|
---|
| 76 |
|
---|
| 77 | # final fallback
|
---|
| 78 | $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
|
---|
| 79 | $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
|
---|
| 80 | $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
|
---|
| 81 | $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
|
---|
| 82 |
|
---|
| 83 | unless (Encode::find_encoding($ENCODING_LOCALE)) {
|
---|
| 84 | my $foundit;
|
---|
| 85 | if (lc($ENCODING_LOCALE) eq "gb18030") {
|
---|
| 86 | eval {
|
---|
| 87 | require Encode::HanExtra;
|
---|
| 88 | };
|
---|
| 89 | if ($@) {
|
---|
| 90 | die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
|
---|
| 91 | }
|
---|
| 92 | $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
|
---|
| 93 | }
|
---|
| 94 | die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
|
---|
| 95 | unless $foundit;
|
---|
| 96 |
|
---|
| 97 | }
|
---|
| 98 |
|
---|
| 99 | # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
|
---|
| 100 | }
|
---|
| 101 |
|
---|
| 102 | _init();
|
---|
| 103 | Encode::Alias::define_alias(sub {
|
---|
| 104 | no strict 'refs';
|
---|
| 105 | no warnings 'once';
|
---|
| 106 | return ${"ENCODING_" . uc(shift)};
|
---|
| 107 | }, "locale");
|
---|
| 108 |
|
---|
| 109 | sub _flush_aliases {
|
---|
| 110 | no strict 'refs';
|
---|
| 111 | for my $a (keys %Encode::Alias::Alias) {
|
---|
| 112 | if (defined ${"ENCODING_" . uc($a)}) {
|
---|
| 113 | delete $Encode::Alias::Alias{$a};
|
---|
| 114 | warn "Flushed alias cache for $a" if DEBUG;
|
---|
| 115 | }
|
---|
| 116 | }
|
---|
| 117 | }
|
---|
| 118 |
|
---|
| 119 | sub reinit {
|
---|
| 120 | $ENCODING_LOCALE = shift;
|
---|
| 121 | $ENCODING_LOCALE_FS = shift;
|
---|
| 122 | $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
|
---|
| 123 | $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
|
---|
| 124 | _init();
|
---|
| 125 | _flush_aliases();
|
---|
| 126 | }
|
---|
| 127 |
|
---|
| 128 | sub decode_argv {
|
---|
| 129 | die if defined wantarray;
|
---|
| 130 | for (@ARGV) {
|
---|
| 131 | $_ = Encode::decode(locale => $_, @_);
|
---|
| 132 | }
|
---|
| 133 | }
|
---|
| 134 |
|
---|
| 135 | sub env {
|
---|
| 136 | my $k = Encode::encode(locale => shift);
|
---|
| 137 | my $old = $ENV{$k};
|
---|
| 138 | if (@_) {
|
---|
| 139 | my $v = shift;
|
---|
| 140 | if (defined $v) {
|
---|
| 141 | $ENV{$k} = Encode::encode(locale => $v);
|
---|
| 142 | }
|
---|
| 143 | else {
|
---|
| 144 | delete $ENV{$k};
|
---|
| 145 | }
|
---|
| 146 | }
|
---|
| 147 | return Encode::decode(locale => $old) if defined wantarray;
|
---|
| 148 | }
|
---|
| 149 |
|
---|
| 150 | 1;
|
---|
| 151 |
|
---|
| 152 | __END__
|
---|
| 153 |
|
---|
| 154 | =head1 NAME
|
---|
| 155 |
|
---|
| 156 | Encode::Locale - Determine the locale encoding
|
---|
| 157 |
|
---|
| 158 | =head1 SYNOPSIS
|
---|
| 159 |
|
---|
| 160 | use Encode::Locale;
|
---|
| 161 | use Encode;
|
---|
| 162 |
|
---|
| 163 | $string = decode(locale => $bytes);
|
---|
| 164 | $bytes = encode(locale => $string);
|
---|
| 165 |
|
---|
| 166 | if (-t) {
|
---|
| 167 | binmode(STDIN, ":encoding(console_in)");
|
---|
| 168 | binmode(STDOUT, ":encoding(console_out)");
|
---|
| 169 | binmode(STDERR, ":encoding(console_out)");
|
---|
| 170 | }
|
---|
| 171 |
|
---|
| 172 | # Processing file names passed in as arguments
|
---|
| 173 | my $uni_filename = decode(locale => $ARGV[0]);
|
---|
| 174 | open(my $fh, "<", encode(locale_fs => $uni_filename))
|
---|
| 175 | || die "Can't open '$uni_filename': $!";
|
---|
| 176 | binmode($fh, ":encoding(locale)");
|
---|
| 177 | ...
|
---|
| 178 |
|
---|
| 179 | =head1 DESCRIPTION
|
---|
| 180 |
|
---|
| 181 | In many applications it's wise to let Perl use Unicode for the strings it
|
---|
| 182 | processes. Most of the interfaces Perl has to the outside world are still byte
|
---|
| 183 | based. Programs therefore need to decode byte strings that enter the program
|
---|
| 184 | from the outside and encode them again on the way out.
|
---|
| 185 |
|
---|
| 186 | The POSIX locale system is used to specify both the language conventions
|
---|
| 187 | requested by the user and the preferred character set to consume and
|
---|
| 188 | output. The C<Encode::Locale> module looks up the charset and encoding (called
|
---|
| 189 | a CODESET in the locale jargon) and arranges for the L<Encode> module to know
|
---|
| 190 | this encoding under the name "locale". It means bytes obtained from the
|
---|
| 191 | environment can be converted to Unicode strings by calling C<<
|
---|
| 192 | Encode::encode(locale => $bytes) >> and converted back again with C<<
|
---|
| 193 | Encode::decode(locale => $string) >>.
|
---|
| 194 |
|
---|
| 195 | Where file systems interfaces pass file names in and out of the program we also
|
---|
| 196 | need care. The trend is for operating systems to use a fixed file encoding
|
---|
| 197 | that don't actually depend on the locale; and this module determines the most
|
---|
| 198 | appropriate encoding for file names. The L<Encode> module will know this
|
---|
| 199 | encoding under the name "locale_fs". For traditional Unix systems this will
|
---|
| 200 | be an alias to the same encoding as "locale".
|
---|
| 201 |
|
---|
| 202 | For programs running in a terminal window (called a "Console" on some systems)
|
---|
| 203 | the "locale" encoding is usually a good choice for what to expect as input and
|
---|
| 204 | output. Some systems allows us to query the encoding set for the terminal and
|
---|
| 205 | C<Encode::Locale> will do that if available and make these encodings known
|
---|
| 206 | under the C<Encode> aliases "console_in" and "console_out". For systems where
|
---|
| 207 | we can't determine the terminal encoding these will be aliased as the same
|
---|
| 208 | encoding as "locale". The advice is to use "console_in" for input known to
|
---|
| 209 | come from the terminal and "console_out" for output known to go from the
|
---|
| 210 | terminal.
|
---|
| 211 |
|
---|
| 212 | In addition to arranging for various Encode aliases the following functions and
|
---|
| 213 | variables are provided:
|
---|
| 214 |
|
---|
| 215 | =over
|
---|
| 216 |
|
---|
| 217 | =item decode_argv( )
|
---|
| 218 |
|
---|
| 219 | =item decode_argv( Encode::FB_CROAK )
|
---|
| 220 |
|
---|
| 221 | This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
|
---|
| 222 |
|
---|
| 223 | The function will by default replace characters that can't be decoded by
|
---|
| 224 | "\x{FFFD}", the Unicode replacement character.
|
---|
| 225 |
|
---|
| 226 | Any argument provided is passed as CHECK to underlying Encode::decode() call.
|
---|
| 227 | Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
|
---|
| 228 | command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
|
---|
| 229 | for details on other options for CHECK.
|
---|
| 230 |
|
---|
| 231 | =item env( $uni_key )
|
---|
| 232 |
|
---|
| 233 | =item env( $uni_key => $uni_value )
|
---|
| 234 |
|
---|
| 235 | Interface to get/set environment variables. Returns the current value as a
|
---|
| 236 | Unicode string. The $uni_key and $uni_value arguments are expected to be
|
---|
| 237 | Unicode strings as well. Passing C<undef> as $uni_value deletes the
|
---|
| 238 | environment variable named $uni_key.
|
---|
| 239 |
|
---|
| 240 | The returned value will have the characters that can't be decoded replaced by
|
---|
| 241 | "\x{FFFD}", the Unicode replacement character.
|
---|
| 242 |
|
---|
| 243 | There is no interface to request alternative CHECK behavior as for
|
---|
| 244 | decode_argv(). If you need that you need to call encode/decode yourself.
|
---|
| 245 | For example:
|
---|
| 246 |
|
---|
| 247 | my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
|
---|
| 248 | my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
|
---|
| 249 |
|
---|
| 250 | =item reinit( )
|
---|
| 251 |
|
---|
| 252 | =item reinit( $encoding )
|
---|
| 253 |
|
---|
| 254 | Reinitialize the encodings from the locale. You want to call this function if
|
---|
| 255 | you changed anything in the environment that might influence the locale.
|
---|
| 256 |
|
---|
| 257 | This function will croak if the determined encoding isn't recognized by
|
---|
| 258 | the Encode module.
|
---|
| 259 |
|
---|
| 260 | With argument force $ENCODING_... variables to set to the given value.
|
---|
| 261 |
|
---|
| 262 | =item $ENCODING_LOCALE
|
---|
| 263 |
|
---|
| 264 | The encoding name determined to be suitable for the current locale.
|
---|
| 265 | L<Encode> know this encoding as "locale".
|
---|
| 266 |
|
---|
| 267 | =item $ENCODING_LOCALE_FS
|
---|
| 268 |
|
---|
| 269 | The encoding name determined to be suiteable for file system interfaces
|
---|
| 270 | involving file names.
|
---|
| 271 | L<Encode> know this encoding as "locale_fs".
|
---|
| 272 |
|
---|
| 273 | =item $ENCODING_CONSOLE_IN
|
---|
| 274 |
|
---|
| 275 | =item $ENCODING_CONSOLE_OUT
|
---|
| 276 |
|
---|
| 277 | The encodings to be used for reading and writing output to the a console.
|
---|
| 278 | L<Encode> know these encodings as "console_in" and "console_out".
|
---|
| 279 |
|
---|
| 280 | =back
|
---|
| 281 |
|
---|
| 282 | =head1 NOTES
|
---|
| 283 |
|
---|
| 284 | This table summarizes the mapping of the encodings set up
|
---|
| 285 | by the C<Encode::Locale> module:
|
---|
| 286 |
|
---|
| 287 | Encode | | |
|
---|
| 288 | Alias | Windows | Mac OS X | POSIX
|
---|
| 289 | ------------+---------+--------------+------------
|
---|
| 290 | locale | ANSI | nl_langinfo | nl_langinfo
|
---|
| 291 | locale_fs | ANSI | UTF-8 | nl_langinfo
|
---|
| 292 | console_in | OEM | nl_langinfo | nl_langinfo
|
---|
| 293 | console_out | OEM | nl_langinfo | nl_langinfo
|
---|
| 294 |
|
---|
| 295 | =head2 Windows
|
---|
| 296 |
|
---|
| 297 | Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
|
---|
| 298 | strings) and a byte based API based a character set called ANSI. The
|
---|
| 299 | regular Perl interfaces to the OS currently only uses the ANSI APIs.
|
---|
| 300 | Unfortunately ANSI is not a single character set.
|
---|
| 301 |
|
---|
| 302 | The encoding that corresponds to ANSI varies between different editions of
|
---|
| 303 | Windows. For many western editions of Windows ANSI corresponds to CP-1252
|
---|
| 304 | which is a character set similar to ISO-8859-1. Conceptually the ANSI
|
---|
| 305 | character set is a similar concept to the POSIX locale CODESET so this module
|
---|
| 306 | figures out what the ANSI code page is and make this available as
|
---|
| 307 | $ENCODING_LOCALE and the "locale" Encoding alias.
|
---|
| 308 |
|
---|
| 309 | Windows systems also operate with another byte based character set.
|
---|
| 310 | It's called the OEM code page. This is the encoding that the Console
|
---|
| 311 | takes as input and output. It's common for the OEM code page to
|
---|
| 312 | differ from the ANSI code page.
|
---|
| 313 |
|
---|
| 314 | =head2 Mac OS X
|
---|
| 315 |
|
---|
| 316 | On Mac OS X the file system encoding is always UTF-8 while the locale
|
---|
| 317 | can otherwise be set up as normal for POSIX systems.
|
---|
| 318 |
|
---|
| 319 | File names on Mac OS X will at the OS-level be converted to
|
---|
| 320 | NFD-form. A file created by passing a NFC-filename will come
|
---|
| 321 | in NFD-form from readdir(). See L<Unicode::Normalize> for details
|
---|
| 322 | of NFD/NFC.
|
---|
| 323 |
|
---|
| 324 | Actually, Apple does not follow the Unicode NFD standard since not all
|
---|
| 325 | character ranges are decomposed. The claim is that this avoids problems with
|
---|
| 326 | round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
|
---|
| 327 | details.
|
---|
| 328 |
|
---|
| 329 | =head2 POSIX (Linux and other Unixes)
|
---|
| 330 |
|
---|
| 331 | File systems might vary in what encoding is to be used for
|
---|
| 332 | filenames. Since this module has no way to actually figure out
|
---|
| 333 | what the is correct it goes with the best guess which is to
|
---|
| 334 | assume filenames are encoding according to the current locale.
|
---|
| 335 | Users are advised to always specify UTF-8 as the locale charset.
|
---|
| 336 |
|
---|
| 337 | =head1 SEE ALSO
|
---|
| 338 |
|
---|
| 339 | L<I18N::Langinfo>, L<Encode>
|
---|
| 340 |
|
---|
| 341 | =head1 AUTHOR
|
---|
| 342 |
|
---|
| 343 | Copyright 2010 Gisle Aas <[email protected]>.
|
---|
| 344 |
|
---|
| 345 | This library is free software; you can redistribute it and/or
|
---|
| 346 | modify it under the same terms as Perl itself.
|
---|
| 347 |
|
---|
| 348 | =cut
|
---|