[33721] | 1 | package Encode::Locale;
|
---|
| 2 |
|
---|
| 3 | use strict;
|
---|
| 4 | our $VERSION = "1.05";
|
---|
| 5 |
|
---|
| 6 | use base 'Exporter';
|
---|
| 7 | our @EXPORT_OK = qw(
|
---|
| 8 | decode_argv env
|
---|
| 9 | $ENCODING_LOCALE $ENCODING_LOCALE_FS
|
---|
| 10 | $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
|
---|
| 11 | );
|
---|
| 12 |
|
---|
| 13 | use Encode ();
|
---|
| 14 | use Encode::Alias ();
|
---|
| 15 |
|
---|
| 16 | our $ENCODING_LOCALE;
|
---|
| 17 | our $ENCODING_LOCALE_FS;
|
---|
| 18 | our $ENCODING_CONSOLE_IN;
|
---|
| 19 | our $ENCODING_CONSOLE_OUT;
|
---|
| 20 |
|
---|
| 21 | sub DEBUG () { 0 }
|
---|
| 22 |
|
---|
| 23 | sub _init {
|
---|
| 24 | if ($^O eq "MSWin32") {
|
---|
| 25 | unless ($ENCODING_LOCALE) {
|
---|
| 26 | # Try to obtain what the Windows ANSI code page is
|
---|
| 27 | eval {
|
---|
| 28 | unless (defined &GetACP) {
|
---|
| 29 | require Win32;
|
---|
| 30 | eval { Win32::GetACP() };
|
---|
| 31 | *GetACP = sub { &Win32::GetACP } unless $@;
|
---|
| 32 | }
|
---|
| 33 | unless (defined &GetACP) {
|
---|
| 34 | require Win32::API;
|
---|
| 35 | Win32::API->Import('kernel32', 'int GetACP()');
|
---|
| 36 | }
|
---|
| 37 | if (defined &GetACP) {
|
---|
| 38 | my $cp = GetACP();
|
---|
| 39 | $ENCODING_LOCALE = "cp$cp" if $cp;
|
---|
| 40 | }
|
---|
| 41 | };
|
---|
| 42 | }
|
---|
| 43 |
|
---|
| 44 | unless ($ENCODING_CONSOLE_IN) {
|
---|
| 45 | # only test one since set together
|
---|
| 46 | unless (defined &GetInputCP) {
|
---|
| 47 | eval {
|
---|
| 48 | require Win32;
|
---|
| 49 | eval { Win32::GetConsoleCP() };
|
---|
| 50 | # manually "import" it since Win32->import refuses
|
---|
| 51 | *GetInputCP = sub { &Win32::GetConsoleCP } unless $@;
|
---|
| 52 | *GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@;
|
---|
| 53 | };
|
---|
| 54 | unless (defined &GetInputCP) {
|
---|
| 55 | eval {
|
---|
| 56 | # try Win32::Console module for codepage to use
|
---|
| 57 | require Win32::Console;
|
---|
| 58 | eval { Win32::Console::InputCP() };
|
---|
| 59 | *GetInputCP = sub { &Win32::Console::InputCP }
|
---|
| 60 | unless $@;
|
---|
| 61 | *GetOutputCP = sub { &Win32::Console::OutputCP }
|
---|
| 62 | unless $@;
|
---|
| 63 | };
|
---|
| 64 | }
|
---|
| 65 | unless (defined &GetInputCP) {
|
---|
| 66 | # final fallback
|
---|
| 67 | *GetInputCP = *GetOutputCP = sub {
|
---|
| 68 | # another fallback that could work is:
|
---|
| 69 | # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
|
---|
| 70 | ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
|
---|
| 71 | ? $1 : ();
|
---|
| 72 | };
|
---|
| 73 | }
|
---|
| 74 | }
|
---|
| 75 | my $cp = GetInputCP();
|
---|
| 76 | $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
|
---|
| 77 | $cp = GetOutputCP();
|
---|
| 78 | $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
|
---|
| 79 | }
|
---|
| 80 | }
|
---|
| 81 |
|
---|
| 82 | unless ($ENCODING_LOCALE) {
|
---|
| 83 | eval {
|
---|
| 84 | require I18N::Langinfo;
|
---|
| 85 | $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
|
---|
| 86 |
|
---|
| 87 | # Workaround of Encode < v2.25. The "646" encoding alias was
|
---|
| 88 | # introduced in Encode-2.25, but we don't want to require that version
|
---|
| 89 | # quite yet. Should avoid the CPAN testers failure reported from
|
---|
| 90 | # openbsd-4.7/perl-5.10.0 combo.
|
---|
| 91 | $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
|
---|
| 92 |
|
---|
| 93 | # https://rt.cpan.org/Ticket/Display.html?id=66373
|
---|
| 94 | $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
|
---|
| 95 | };
|
---|
| 96 | $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
|
---|
| 97 | }
|
---|
| 98 |
|
---|
| 99 | if ($^O eq "darwin") {
|
---|
| 100 | $ENCODING_LOCALE_FS ||= "UTF-8";
|
---|
| 101 | }
|
---|
| 102 |
|
---|
| 103 | # final fallback
|
---|
| 104 | $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
|
---|
| 105 | $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
|
---|
| 106 | $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
|
---|
| 107 | $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
|
---|
| 108 |
|
---|
| 109 | unless (Encode::find_encoding($ENCODING_LOCALE)) {
|
---|
| 110 | my $foundit;
|
---|
| 111 | if (lc($ENCODING_LOCALE) eq "gb18030") {
|
---|
| 112 | eval {
|
---|
| 113 | require Encode::HanExtra;
|
---|
| 114 | };
|
---|
| 115 | if ($@) {
|
---|
| 116 | die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
|
---|
| 117 | }
|
---|
| 118 | $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
|
---|
| 119 | }
|
---|
| 120 | die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
|
---|
| 121 | unless $foundit;
|
---|
| 122 |
|
---|
| 123 | }
|
---|
| 124 |
|
---|
| 125 | # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
|
---|
| 126 | }
|
---|
| 127 |
|
---|
| 128 | _init();
|
---|
| 129 | Encode::Alias::define_alias(sub {
|
---|
| 130 | no strict 'refs';
|
---|
| 131 | no warnings 'once';
|
---|
| 132 | return ${"ENCODING_" . uc(shift)};
|
---|
| 133 | }, "locale");
|
---|
| 134 |
|
---|
| 135 | sub _flush_aliases {
|
---|
| 136 | no strict 'refs';
|
---|
| 137 | for my $a (keys %Encode::Alias::Alias) {
|
---|
| 138 | if (defined ${"ENCODING_" . uc($a)}) {
|
---|
| 139 | delete $Encode::Alias::Alias{$a};
|
---|
| 140 | warn "Flushed alias cache for $a" if DEBUG;
|
---|
| 141 | }
|
---|
| 142 | }
|
---|
| 143 | }
|
---|
| 144 |
|
---|
| 145 | sub reinit {
|
---|
| 146 | $ENCODING_LOCALE = shift;
|
---|
| 147 | $ENCODING_LOCALE_FS = shift;
|
---|
| 148 | $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
|
---|
| 149 | $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
|
---|
| 150 | _init();
|
---|
| 151 | _flush_aliases();
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 | sub decode_argv {
|
---|
| 155 | die if defined wantarray;
|
---|
| 156 | for (@ARGV) {
|
---|
| 157 | $_ = Encode::decode(locale => $_, @_);
|
---|
| 158 | }
|
---|
| 159 | }
|
---|
| 160 |
|
---|
| 161 | sub env {
|
---|
| 162 | my $k = Encode::encode(locale => shift);
|
---|
| 163 | my $old = $ENV{$k};
|
---|
| 164 | if (@_) {
|
---|
| 165 | my $v = shift;
|
---|
| 166 | if (defined $v) {
|
---|
| 167 | $ENV{$k} = Encode::encode(locale => $v);
|
---|
| 168 | }
|
---|
| 169 | else {
|
---|
| 170 | delete $ENV{$k};
|
---|
| 171 | }
|
---|
| 172 | }
|
---|
| 173 | return Encode::decode(locale => $old) if defined wantarray;
|
---|
| 174 | }
|
---|
| 175 |
|
---|
| 176 | 1;
|
---|
| 177 |
|
---|
| 178 | __END__
|
---|
| 179 |
|
---|
| 180 | =head1 NAME
|
---|
| 181 |
|
---|
| 182 | Encode::Locale - Determine the locale encoding
|
---|
| 183 |
|
---|
| 184 | =head1 SYNOPSIS
|
---|
| 185 |
|
---|
| 186 | use Encode::Locale;
|
---|
| 187 | use Encode;
|
---|
| 188 |
|
---|
| 189 | $string = decode(locale => $bytes);
|
---|
| 190 | $bytes = encode(locale => $string);
|
---|
| 191 |
|
---|
| 192 | if (-t) {
|
---|
| 193 | binmode(STDIN, ":encoding(console_in)");
|
---|
| 194 | binmode(STDOUT, ":encoding(console_out)");
|
---|
| 195 | binmode(STDERR, ":encoding(console_out)");
|
---|
| 196 | }
|
---|
| 197 |
|
---|
| 198 | # Processing file names passed in as arguments
|
---|
| 199 | my $uni_filename = decode(locale => $ARGV[0]);
|
---|
| 200 | open(my $fh, "<", encode(locale_fs => $uni_filename))
|
---|
| 201 | || die "Can't open '$uni_filename': $!";
|
---|
| 202 | binmode($fh, ":encoding(locale)");
|
---|
| 203 | ...
|
---|
| 204 |
|
---|
| 205 | =head1 DESCRIPTION
|
---|
| 206 |
|
---|
| 207 | In many applications it's wise to let Perl use Unicode for the strings it
|
---|
| 208 | processes. Most of the interfaces Perl has to the outside world are still byte
|
---|
| 209 | based. Programs therefore need to decode byte strings that enter the program
|
---|
| 210 | from the outside and encode them again on the way out.
|
---|
| 211 |
|
---|
| 212 | The POSIX locale system is used to specify both the language conventions
|
---|
| 213 | requested by the user and the preferred character set to consume and
|
---|
| 214 | output. The C<Encode::Locale> module looks up the charset and encoding (called
|
---|
| 215 | a CODESET in the locale jargon) and arranges for the L<Encode> module to know
|
---|
| 216 | this encoding under the name "locale". It means bytes obtained from the
|
---|
| 217 | environment can be converted to Unicode strings by calling C<<
|
---|
| 218 | Encode::encode(locale => $bytes) >> and converted back again with C<<
|
---|
| 219 | Encode::decode(locale => $string) >>.
|
---|
| 220 |
|
---|
| 221 | Where file systems interfaces pass file names in and out of the program we also
|
---|
| 222 | need care. The trend is for operating systems to use a fixed file encoding
|
---|
| 223 | that don't actually depend on the locale; and this module determines the most
|
---|
| 224 | appropriate encoding for file names. The L<Encode> module will know this
|
---|
| 225 | encoding under the name "locale_fs". For traditional Unix systems this will
|
---|
| 226 | be an alias to the same encoding as "locale".
|
---|
| 227 |
|
---|
| 228 | For programs running in a terminal window (called a "Console" on some systems)
|
---|
| 229 | the "locale" encoding is usually a good choice for what to expect as input and
|
---|
| 230 | output. Some systems allows us to query the encoding set for the terminal and
|
---|
| 231 | C<Encode::Locale> will do that if available and make these encodings known
|
---|
| 232 | under the C<Encode> aliases "console_in" and "console_out". For systems where
|
---|
| 233 | we can't determine the terminal encoding these will be aliased as the same
|
---|
| 234 | encoding as "locale". The advice is to use "console_in" for input known to
|
---|
| 235 | come from the terminal and "console_out" for output to the terminal.
|
---|
| 236 |
|
---|
| 237 | In addition to arranging for various Encode aliases the following functions and
|
---|
| 238 | variables are provided:
|
---|
| 239 |
|
---|
| 240 | =over
|
---|
| 241 |
|
---|
| 242 | =item decode_argv( )
|
---|
| 243 |
|
---|
| 244 | =item decode_argv( Encode::FB_CROAK )
|
---|
| 245 |
|
---|
| 246 | This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
|
---|
| 247 |
|
---|
| 248 | The function will by default replace characters that can't be decoded by
|
---|
| 249 | "\x{FFFD}", the Unicode replacement character.
|
---|
| 250 |
|
---|
| 251 | Any argument provided is passed as CHECK to underlying Encode::decode() call.
|
---|
| 252 | Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
|
---|
| 253 | command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
|
---|
| 254 | for details on other options for CHECK.
|
---|
| 255 |
|
---|
| 256 | =item env( $uni_key )
|
---|
| 257 |
|
---|
| 258 | =item env( $uni_key => $uni_value )
|
---|
| 259 |
|
---|
| 260 | Interface to get/set environment variables. Returns the current value as a
|
---|
| 261 | Unicode string. The $uni_key and $uni_value arguments are expected to be
|
---|
| 262 | Unicode strings as well. Passing C<undef> as $uni_value deletes the
|
---|
| 263 | environment variable named $uni_key.
|
---|
| 264 |
|
---|
| 265 | The returned value will have the characters that can't be decoded replaced by
|
---|
| 266 | "\x{FFFD}", the Unicode replacement character.
|
---|
| 267 |
|
---|
| 268 | There is no interface to request alternative CHECK behavior as for
|
---|
| 269 | decode_argv(). If you need that you need to call encode/decode yourself.
|
---|
| 270 | For example:
|
---|
| 271 |
|
---|
| 272 | my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
|
---|
| 273 | my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
|
---|
| 274 |
|
---|
| 275 | =item reinit( )
|
---|
| 276 |
|
---|
| 277 | =item reinit( $encoding )
|
---|
| 278 |
|
---|
| 279 | Reinitialize the encodings from the locale. You want to call this function if
|
---|
| 280 | you changed anything in the environment that might influence the locale.
|
---|
| 281 |
|
---|
| 282 | This function will croak if the determined encoding isn't recognized by
|
---|
| 283 | the Encode module.
|
---|
| 284 |
|
---|
| 285 | With argument force $ENCODING_... variables to set to the given value.
|
---|
| 286 |
|
---|
| 287 | =item $ENCODING_LOCALE
|
---|
| 288 |
|
---|
| 289 | The encoding name determined to be suitable for the current locale.
|
---|
| 290 | L<Encode> know this encoding as "locale".
|
---|
| 291 |
|
---|
| 292 | =item $ENCODING_LOCALE_FS
|
---|
| 293 |
|
---|
| 294 | The encoding name determined to be suitable for file system interfaces
|
---|
| 295 | involving file names.
|
---|
| 296 | L<Encode> know this encoding as "locale_fs".
|
---|
| 297 |
|
---|
| 298 | =item $ENCODING_CONSOLE_IN
|
---|
| 299 |
|
---|
| 300 | =item $ENCODING_CONSOLE_OUT
|
---|
| 301 |
|
---|
| 302 | The encodings to be used for reading and writing output to the a console.
|
---|
| 303 | L<Encode> know these encodings as "console_in" and "console_out".
|
---|
| 304 |
|
---|
| 305 | =back
|
---|
| 306 |
|
---|
| 307 | =head1 NOTES
|
---|
| 308 |
|
---|
| 309 | This table summarizes the mapping of the encodings set up
|
---|
| 310 | by the C<Encode::Locale> module:
|
---|
| 311 |
|
---|
| 312 | Encode | | |
|
---|
| 313 | Alias | Windows | Mac OS X | POSIX
|
---|
| 314 | ------------+---------+--------------+------------
|
---|
| 315 | locale | ANSI | nl_langinfo | nl_langinfo
|
---|
| 316 | locale_fs | ANSI | UTF-8 | nl_langinfo
|
---|
| 317 | console_in | OEM | nl_langinfo | nl_langinfo
|
---|
| 318 | console_out | OEM | nl_langinfo | nl_langinfo
|
---|
| 319 |
|
---|
| 320 | =head2 Windows
|
---|
| 321 |
|
---|
| 322 | Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
|
---|
| 323 | strings) and a byte based API based a character set called ANSI. The
|
---|
| 324 | regular Perl interfaces to the OS currently only uses the ANSI APIs.
|
---|
| 325 | Unfortunately ANSI is not a single character set.
|
---|
| 326 |
|
---|
| 327 | The encoding that corresponds to ANSI varies between different editions of
|
---|
| 328 | Windows. For many western editions of Windows ANSI corresponds to CP-1252
|
---|
| 329 | which is a character set similar to ISO-8859-1. Conceptually the ANSI
|
---|
| 330 | character set is a similar concept to the POSIX locale CODESET so this module
|
---|
| 331 | figures out what the ANSI code page is and make this available as
|
---|
| 332 | $ENCODING_LOCALE and the "locale" Encoding alias.
|
---|
| 333 |
|
---|
| 334 | Windows systems also operate with another byte based character set.
|
---|
| 335 | It's called the OEM code page. This is the encoding that the Console
|
---|
| 336 | takes as input and output. It's common for the OEM code page to
|
---|
| 337 | differ from the ANSI code page.
|
---|
| 338 |
|
---|
| 339 | =head2 Mac OS X
|
---|
| 340 |
|
---|
| 341 | On Mac OS X the file system encoding is always UTF-8 while the locale
|
---|
| 342 | can otherwise be set up as normal for POSIX systems.
|
---|
| 343 |
|
---|
| 344 | File names on Mac OS X will at the OS-level be converted to
|
---|
| 345 | NFD-form. A file created by passing a NFC-filename will come
|
---|
| 346 | in NFD-form from readdir(). See L<Unicode::Normalize> for details
|
---|
| 347 | of NFD/NFC.
|
---|
| 348 |
|
---|
| 349 | Actually, Apple does not follow the Unicode NFD standard since not all
|
---|
| 350 | character ranges are decomposed. The claim is that this avoids problems with
|
---|
| 351 | round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
|
---|
| 352 | details.
|
---|
| 353 |
|
---|
| 354 | =head2 POSIX (Linux and other Unixes)
|
---|
| 355 |
|
---|
| 356 | File systems might vary in what encoding is to be used for
|
---|
| 357 | filenames. Since this module has no way to actually figure out
|
---|
| 358 | what the is correct it goes with the best guess which is to
|
---|
| 359 | assume filenames are encoding according to the current locale.
|
---|
| 360 | Users are advised to always specify UTF-8 as the locale charset.
|
---|
| 361 |
|
---|
| 362 | =head1 SEE ALSO
|
---|
| 363 |
|
---|
| 364 | L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
|
---|
| 365 |
|
---|
| 366 | =head1 AUTHOR
|
---|
| 367 |
|
---|
| 368 | Copyright 2010 Gisle Aas <[email protected]>.
|
---|
| 369 |
|
---|
| 370 | This library is free software; you can redistribute it and/or
|
---|
| 371 | modify it under the same terms as Perl itself.
|
---|
| 372 |
|
---|
| 373 | =cut
|
---|