1 | package Encode::Locale;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 | our $VERSION = "1.05";
|
---|
5 |
|
---|
6 | use base 'Exporter';
|
---|
7 | our @EXPORT_OK = qw(
|
---|
8 | decode_argv env
|
---|
9 | $ENCODING_LOCALE $ENCODING_LOCALE_FS
|
---|
10 | $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
|
---|
11 | );
|
---|
12 |
|
---|
13 | use Encode ();
|
---|
14 | use Encode::Alias ();
|
---|
15 |
|
---|
16 | our $ENCODING_LOCALE;
|
---|
17 | our $ENCODING_LOCALE_FS;
|
---|
18 | our $ENCODING_CONSOLE_IN;
|
---|
19 | our $ENCODING_CONSOLE_OUT;
|
---|
20 |
|
---|
21 | sub DEBUG () { 0 }
|
---|
22 |
|
---|
23 | sub _init {
|
---|
24 | if ($^O eq "MSWin32") {
|
---|
25 | unless ($ENCODING_LOCALE) {
|
---|
26 | # Try to obtain what the Windows ANSI code page is
|
---|
27 | eval {
|
---|
28 | unless (defined &GetACP) {
|
---|
29 | require Win32;
|
---|
30 | eval { Win32::GetACP() };
|
---|
31 | *GetACP = sub { &Win32::GetACP } unless $@;
|
---|
32 | }
|
---|
33 | unless (defined &GetACP) {
|
---|
34 | require Win32::API;
|
---|
35 | Win32::API->Import('kernel32', 'int GetACP()');
|
---|
36 | }
|
---|
37 | if (defined &GetACP) {
|
---|
38 | my $cp = GetACP();
|
---|
39 | $ENCODING_LOCALE = "cp$cp" if $cp;
|
---|
40 | }
|
---|
41 | };
|
---|
42 | }
|
---|
43 |
|
---|
44 | unless ($ENCODING_CONSOLE_IN) {
|
---|
45 | # only test one since set together
|
---|
46 | unless (defined &GetInputCP) {
|
---|
47 | eval {
|
---|
48 | require Win32;
|
---|
49 | eval { Win32::GetConsoleCP() };
|
---|
50 | # manually "import" it since Win32->import refuses
|
---|
51 | *GetInputCP = sub { &Win32::GetConsoleCP } unless $@;
|
---|
52 | *GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@;
|
---|
53 | };
|
---|
54 | unless (defined &GetInputCP) {
|
---|
55 | eval {
|
---|
56 | # try Win32::Console module for codepage to use
|
---|
57 | require Win32::Console;
|
---|
58 | eval { Win32::Console::InputCP() };
|
---|
59 | *GetInputCP = sub { &Win32::Console::InputCP }
|
---|
60 | unless $@;
|
---|
61 | *GetOutputCP = sub { &Win32::Console::OutputCP }
|
---|
62 | unless $@;
|
---|
63 | };
|
---|
64 | }
|
---|
65 | unless (defined &GetInputCP) {
|
---|
66 | # final fallback
|
---|
67 | *GetInputCP = *GetOutputCP = sub {
|
---|
68 | # another fallback that could work is:
|
---|
69 | # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
|
---|
70 | ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
|
---|
71 | ? $1 : ();
|
---|
72 | };
|
---|
73 | }
|
---|
74 | }
|
---|
75 | my $cp = GetInputCP();
|
---|
76 | $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
|
---|
77 | $cp = GetOutputCP();
|
---|
78 | $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
|
---|
79 | }
|
---|
80 | }
|
---|
81 |
|
---|
82 | unless ($ENCODING_LOCALE) {
|
---|
83 | eval {
|
---|
84 | require I18N::Langinfo;
|
---|
85 | $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
|
---|
86 |
|
---|
87 | # Workaround of Encode < v2.25. The "646" encoding alias was
|
---|
88 | # introduced in Encode-2.25, but we don't want to require that version
|
---|
89 | # quite yet. Should avoid the CPAN testers failure reported from
|
---|
90 | # openbsd-4.7/perl-5.10.0 combo.
|
---|
91 | $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
|
---|
92 |
|
---|
93 | # https://rt.cpan.org/Ticket/Display.html?id=66373
|
---|
94 | $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
|
---|
95 | };
|
---|
96 | $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
|
---|
97 | }
|
---|
98 |
|
---|
99 | if ($^O eq "darwin") {
|
---|
100 | $ENCODING_LOCALE_FS ||= "UTF-8";
|
---|
101 | }
|
---|
102 |
|
---|
103 | # final fallback
|
---|
104 | $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
|
---|
105 | $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
|
---|
106 | $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
|
---|
107 | $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
|
---|
108 |
|
---|
109 | unless (Encode::find_encoding($ENCODING_LOCALE)) {
|
---|
110 | my $foundit;
|
---|
111 | if (lc($ENCODING_LOCALE) eq "gb18030") {
|
---|
112 | eval {
|
---|
113 | require Encode::HanExtra;
|
---|
114 | };
|
---|
115 | if ($@) {
|
---|
116 | die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
|
---|
117 | }
|
---|
118 | $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
|
---|
119 | }
|
---|
120 | die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
|
---|
121 | unless $foundit;
|
---|
122 |
|
---|
123 | }
|
---|
124 |
|
---|
125 | # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
|
---|
126 | }
|
---|
127 |
|
---|
128 | _init();
|
---|
129 | Encode::Alias::define_alias(sub {
|
---|
130 | no strict 'refs';
|
---|
131 | no warnings 'once';
|
---|
132 | return ${"ENCODING_" . uc(shift)};
|
---|
133 | }, "locale");
|
---|
134 |
|
---|
135 | sub _flush_aliases {
|
---|
136 | no strict 'refs';
|
---|
137 | for my $a (keys %Encode::Alias::Alias) {
|
---|
138 | if (defined ${"ENCODING_" . uc($a)}) {
|
---|
139 | delete $Encode::Alias::Alias{$a};
|
---|
140 | warn "Flushed alias cache for $a" if DEBUG;
|
---|
141 | }
|
---|
142 | }
|
---|
143 | }
|
---|
144 |
|
---|
145 | sub reinit {
|
---|
146 | $ENCODING_LOCALE = shift;
|
---|
147 | $ENCODING_LOCALE_FS = shift;
|
---|
148 | $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
|
---|
149 | $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
|
---|
150 | _init();
|
---|
151 | _flush_aliases();
|
---|
152 | }
|
---|
153 |
|
---|
154 | sub decode_argv {
|
---|
155 | die if defined wantarray;
|
---|
156 | for (@ARGV) {
|
---|
157 | $_ = Encode::decode(locale => $_, @_);
|
---|
158 | }
|
---|
159 | }
|
---|
160 |
|
---|
161 | sub env {
|
---|
162 | my $k = Encode::encode(locale => shift);
|
---|
163 | my $old = $ENV{$k};
|
---|
164 | if (@_) {
|
---|
165 | my $v = shift;
|
---|
166 | if (defined $v) {
|
---|
167 | $ENV{$k} = Encode::encode(locale => $v);
|
---|
168 | }
|
---|
169 | else {
|
---|
170 | delete $ENV{$k};
|
---|
171 | }
|
---|
172 | }
|
---|
173 | return Encode::decode(locale => $old) if defined wantarray;
|
---|
174 | }
|
---|
175 |
|
---|
176 | 1;
|
---|
177 |
|
---|
178 | __END__
|
---|
179 |
|
---|
180 | =head1 NAME
|
---|
181 |
|
---|
182 | Encode::Locale - Determine the locale encoding
|
---|
183 |
|
---|
184 | =head1 SYNOPSIS
|
---|
185 |
|
---|
186 | use Encode::Locale;
|
---|
187 | use Encode;
|
---|
188 |
|
---|
189 | $string = decode(locale => $bytes);
|
---|
190 | $bytes = encode(locale => $string);
|
---|
191 |
|
---|
192 | if (-t) {
|
---|
193 | binmode(STDIN, ":encoding(console_in)");
|
---|
194 | binmode(STDOUT, ":encoding(console_out)");
|
---|
195 | binmode(STDERR, ":encoding(console_out)");
|
---|
196 | }
|
---|
197 |
|
---|
198 | # Processing file names passed in as arguments
|
---|
199 | my $uni_filename = decode(locale => $ARGV[0]);
|
---|
200 | open(my $fh, "<", encode(locale_fs => $uni_filename))
|
---|
201 | || die "Can't open '$uni_filename': $!";
|
---|
202 | binmode($fh, ":encoding(locale)");
|
---|
203 | ...
|
---|
204 |
|
---|
205 | =head1 DESCRIPTION
|
---|
206 |
|
---|
207 | In many applications it's wise to let Perl use Unicode for the strings it
|
---|
208 | processes. Most of the interfaces Perl has to the outside world are still byte
|
---|
209 | based. Programs therefore need to decode byte strings that enter the program
|
---|
210 | from the outside and encode them again on the way out.
|
---|
211 |
|
---|
212 | The POSIX locale system is used to specify both the language conventions
|
---|
213 | requested by the user and the preferred character set to consume and
|
---|
214 | output. The C<Encode::Locale> module looks up the charset and encoding (called
|
---|
215 | a CODESET in the locale jargon) and arranges for the L<Encode> module to know
|
---|
216 | this encoding under the name "locale". It means bytes obtained from the
|
---|
217 | environment can be converted to Unicode strings by calling C<<
|
---|
218 | Encode::encode(locale => $bytes) >> and converted back again with C<<
|
---|
219 | Encode::decode(locale => $string) >>.
|
---|
220 |
|
---|
221 | Where file systems interfaces pass file names in and out of the program we also
|
---|
222 | need care. The trend is for operating systems to use a fixed file encoding
|
---|
223 | that don't actually depend on the locale; and this module determines the most
|
---|
224 | appropriate encoding for file names. The L<Encode> module will know this
|
---|
225 | encoding under the name "locale_fs". For traditional Unix systems this will
|
---|
226 | be an alias to the same encoding as "locale".
|
---|
227 |
|
---|
228 | For programs running in a terminal window (called a "Console" on some systems)
|
---|
229 | the "locale" encoding is usually a good choice for what to expect as input and
|
---|
230 | output. Some systems allows us to query the encoding set for the terminal and
|
---|
231 | C<Encode::Locale> will do that if available and make these encodings known
|
---|
232 | under the C<Encode> aliases "console_in" and "console_out". For systems where
|
---|
233 | we can't determine the terminal encoding these will be aliased as the same
|
---|
234 | encoding as "locale". The advice is to use "console_in" for input known to
|
---|
235 | come from the terminal and "console_out" for output to the terminal.
|
---|
236 |
|
---|
237 | In addition to arranging for various Encode aliases the following functions and
|
---|
238 | variables are provided:
|
---|
239 |
|
---|
240 | =over
|
---|
241 |
|
---|
242 | =item decode_argv( )
|
---|
243 |
|
---|
244 | =item decode_argv( Encode::FB_CROAK )
|
---|
245 |
|
---|
246 | This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
|
---|
247 |
|
---|
248 | The function will by default replace characters that can't be decoded by
|
---|
249 | "\x{FFFD}", the Unicode replacement character.
|
---|
250 |
|
---|
251 | Any argument provided is passed as CHECK to underlying Encode::decode() call.
|
---|
252 | Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
|
---|
253 | command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
|
---|
254 | for details on other options for CHECK.
|
---|
255 |
|
---|
256 | =item env( $uni_key )
|
---|
257 |
|
---|
258 | =item env( $uni_key => $uni_value )
|
---|
259 |
|
---|
260 | Interface to get/set environment variables. Returns the current value as a
|
---|
261 | Unicode string. The $uni_key and $uni_value arguments are expected to be
|
---|
262 | Unicode strings as well. Passing C<undef> as $uni_value deletes the
|
---|
263 | environment variable named $uni_key.
|
---|
264 |
|
---|
265 | The returned value will have the characters that can't be decoded replaced by
|
---|
266 | "\x{FFFD}", the Unicode replacement character.
|
---|
267 |
|
---|
268 | There is no interface to request alternative CHECK behavior as for
|
---|
269 | decode_argv(). If you need that you need to call encode/decode yourself.
|
---|
270 | For example:
|
---|
271 |
|
---|
272 | my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
|
---|
273 | my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
|
---|
274 |
|
---|
275 | =item reinit( )
|
---|
276 |
|
---|
277 | =item reinit( $encoding )
|
---|
278 |
|
---|
279 | Reinitialize the encodings from the locale. You want to call this function if
|
---|
280 | you changed anything in the environment that might influence the locale.
|
---|
281 |
|
---|
282 | This function will croak if the determined encoding isn't recognized by
|
---|
283 | the Encode module.
|
---|
284 |
|
---|
285 | With argument force $ENCODING_... variables to set to the given value.
|
---|
286 |
|
---|
287 | =item $ENCODING_LOCALE
|
---|
288 |
|
---|
289 | The encoding name determined to be suitable for the current locale.
|
---|
290 | L<Encode> know this encoding as "locale".
|
---|
291 |
|
---|
292 | =item $ENCODING_LOCALE_FS
|
---|
293 |
|
---|
294 | The encoding name determined to be suitable for file system interfaces
|
---|
295 | involving file names.
|
---|
296 | L<Encode> know this encoding as "locale_fs".
|
---|
297 |
|
---|
298 | =item $ENCODING_CONSOLE_IN
|
---|
299 |
|
---|
300 | =item $ENCODING_CONSOLE_OUT
|
---|
301 |
|
---|
302 | The encodings to be used for reading and writing output to the a console.
|
---|
303 | L<Encode> know these encodings as "console_in" and "console_out".
|
---|
304 |
|
---|
305 | =back
|
---|
306 |
|
---|
307 | =head1 NOTES
|
---|
308 |
|
---|
309 | This table summarizes the mapping of the encodings set up
|
---|
310 | by the C<Encode::Locale> module:
|
---|
311 |
|
---|
312 | Encode | | |
|
---|
313 | Alias | Windows | Mac OS X | POSIX
|
---|
314 | ------------+---------+--------------+------------
|
---|
315 | locale | ANSI | nl_langinfo | nl_langinfo
|
---|
316 | locale_fs | ANSI | UTF-8 | nl_langinfo
|
---|
317 | console_in | OEM | nl_langinfo | nl_langinfo
|
---|
318 | console_out | OEM | nl_langinfo | nl_langinfo
|
---|
319 |
|
---|
320 | =head2 Windows
|
---|
321 |
|
---|
322 | Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
|
---|
323 | strings) and a byte based API based a character set called ANSI. The
|
---|
324 | regular Perl interfaces to the OS currently only uses the ANSI APIs.
|
---|
325 | Unfortunately ANSI is not a single character set.
|
---|
326 |
|
---|
327 | The encoding that corresponds to ANSI varies between different editions of
|
---|
328 | Windows. For many western editions of Windows ANSI corresponds to CP-1252
|
---|
329 | which is a character set similar to ISO-8859-1. Conceptually the ANSI
|
---|
330 | character set is a similar concept to the POSIX locale CODESET so this module
|
---|
331 | figures out what the ANSI code page is and make this available as
|
---|
332 | $ENCODING_LOCALE and the "locale" Encoding alias.
|
---|
333 |
|
---|
334 | Windows systems also operate with another byte based character set.
|
---|
335 | It's called the OEM code page. This is the encoding that the Console
|
---|
336 | takes as input and output. It's common for the OEM code page to
|
---|
337 | differ from the ANSI code page.
|
---|
338 |
|
---|
339 | =head2 Mac OS X
|
---|
340 |
|
---|
341 | On Mac OS X the file system encoding is always UTF-8 while the locale
|
---|
342 | can otherwise be set up as normal for POSIX systems.
|
---|
343 |
|
---|
344 | File names on Mac OS X will at the OS-level be converted to
|
---|
345 | NFD-form. A file created by passing a NFC-filename will come
|
---|
346 | in NFD-form from readdir(). See L<Unicode::Normalize> for details
|
---|
347 | of NFD/NFC.
|
---|
348 |
|
---|
349 | Actually, Apple does not follow the Unicode NFD standard since not all
|
---|
350 | character ranges are decomposed. The claim is that this avoids problems with
|
---|
351 | round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
|
---|
352 | details.
|
---|
353 |
|
---|
354 | =head2 POSIX (Linux and other Unixes)
|
---|
355 |
|
---|
356 | File systems might vary in what encoding is to be used for
|
---|
357 | filenames. Since this module has no way to actually figure out
|
---|
358 | what the is correct it goes with the best guess which is to
|
---|
359 | assume filenames are encoding according to the current locale.
|
---|
360 | Users are advised to always specify UTF-8 as the locale charset.
|
---|
361 |
|
---|
362 | =head1 SEE ALSO
|
---|
363 |
|
---|
364 | L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
|
---|
365 |
|
---|
366 | =head1 AUTHOR
|
---|
367 |
|
---|
368 | Copyright 2010 Gisle Aas <[email protected]>.
|
---|
369 |
|
---|
370 | This library is free software; you can redistribute it and/or
|
---|
371 | modify it under the same terms as Perl itself.
|
---|
372 |
|
---|
373 | =cut
|
---|