1 | package Encode::Locale;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 | our $VERSION = "1.03";
|
---|
5 |
|
---|
6 | use base 'Exporter';
|
---|
7 | our @EXPORT_OK = qw(
|
---|
8 | decode_argv env
|
---|
9 | $ENCODING_LOCALE $ENCODING_LOCALE_FS
|
---|
10 | $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
|
---|
11 | );
|
---|
12 |
|
---|
13 | use Encode ();
|
---|
14 | use Encode::Alias ();
|
---|
15 |
|
---|
16 | our $ENCODING_LOCALE;
|
---|
17 | our $ENCODING_LOCALE_FS;
|
---|
18 | our $ENCODING_CONSOLE_IN;
|
---|
19 | our $ENCODING_CONSOLE_OUT;
|
---|
20 |
|
---|
21 | sub DEBUG () { 0 }
|
---|
22 |
|
---|
23 | sub _init {
|
---|
24 | if ($^O eq "MSWin32") {
|
---|
25 | unless ($ENCODING_LOCALE) {
|
---|
26 | # Try to obtain what the Windows ANSI code page is
|
---|
27 | eval {
|
---|
28 | unless (defined &GetACP) {
|
---|
29 | require Win32::API;
|
---|
30 | Win32::API->Import('kernel32', 'int GetACP()');
|
---|
31 | };
|
---|
32 | if (defined &GetACP) {
|
---|
33 | my $cp = GetACP();
|
---|
34 | $ENCODING_LOCALE = "cp$cp" if $cp;
|
---|
35 | }
|
---|
36 | };
|
---|
37 | }
|
---|
38 |
|
---|
39 | unless ($ENCODING_CONSOLE_IN) {
|
---|
40 | # If we have the Win32::Console module installed we can ask
|
---|
41 | # it for the code set to use
|
---|
42 | eval {
|
---|
43 | require Win32::Console;
|
---|
44 | my $cp = Win32::Console::InputCP();
|
---|
45 | $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
|
---|
46 | $cp = Win32::Console::OutputCP();
|
---|
47 | $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
|
---|
48 | };
|
---|
49 | # Invoking the 'chcp' program might also work
|
---|
50 | if (!$ENCODING_CONSOLE_IN && (qx(chcp) || '') =~ /^Active code page: (\d+)/) {
|
---|
51 | $ENCODING_CONSOLE_IN = "cp$1";
|
---|
52 | }
|
---|
53 | }
|
---|
54 | }
|
---|
55 |
|
---|
56 | unless ($ENCODING_LOCALE) {
|
---|
57 | eval {
|
---|
58 | require I18N::Langinfo;
|
---|
59 | $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
|
---|
60 |
|
---|
61 | # Workaround of Encode < v2.25. The "646" encoding alias was
|
---|
62 | # introduced in Encode-2.25, but we don't want to require that version
|
---|
63 | # quite yet. Should avoid the CPAN testers failure reported from
|
---|
64 | # openbsd-4.7/perl-5.10.0 combo.
|
---|
65 | $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
|
---|
66 |
|
---|
67 | # https://rt.cpan.org/Ticket/Display.html?id=66373
|
---|
68 | $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
|
---|
69 | };
|
---|
70 | $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
|
---|
71 | }
|
---|
72 |
|
---|
73 | if ($^O eq "darwin") {
|
---|
74 | $ENCODING_LOCALE_FS ||= "UTF-8";
|
---|
75 | }
|
---|
76 |
|
---|
77 | # final fallback
|
---|
78 | $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
|
---|
79 | $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
|
---|
80 | $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
|
---|
81 | $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
|
---|
82 |
|
---|
83 | unless (Encode::find_encoding($ENCODING_LOCALE)) {
|
---|
84 | my $foundit;
|
---|
85 | if (lc($ENCODING_LOCALE) eq "gb18030") {
|
---|
86 | eval {
|
---|
87 | require Encode::HanExtra;
|
---|
88 | };
|
---|
89 | if ($@) {
|
---|
90 | die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
|
---|
91 | }
|
---|
92 | $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
|
---|
93 | }
|
---|
94 | die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
|
---|
95 | unless $foundit;
|
---|
96 |
|
---|
97 | }
|
---|
98 |
|
---|
99 | # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
|
---|
100 | }
|
---|
101 |
|
---|
102 | _init();
|
---|
103 | Encode::Alias::define_alias(sub {
|
---|
104 | no strict 'refs';
|
---|
105 | no warnings 'once';
|
---|
106 | return ${"ENCODING_" . uc(shift)};
|
---|
107 | }, "locale");
|
---|
108 |
|
---|
109 | sub _flush_aliases {
|
---|
110 | no strict 'refs';
|
---|
111 | for my $a (keys %Encode::Alias::Alias) {
|
---|
112 | if (defined ${"ENCODING_" . uc($a)}) {
|
---|
113 | delete $Encode::Alias::Alias{$a};
|
---|
114 | warn "Flushed alias cache for $a" if DEBUG;
|
---|
115 | }
|
---|
116 | }
|
---|
117 | }
|
---|
118 |
|
---|
119 | sub reinit {
|
---|
120 | $ENCODING_LOCALE = shift;
|
---|
121 | $ENCODING_LOCALE_FS = shift;
|
---|
122 | $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
|
---|
123 | $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
|
---|
124 | _init();
|
---|
125 | _flush_aliases();
|
---|
126 | }
|
---|
127 |
|
---|
128 | sub decode_argv {
|
---|
129 | die if defined wantarray;
|
---|
130 | for (@ARGV) {
|
---|
131 | $_ = Encode::decode(locale => $_, @_);
|
---|
132 | }
|
---|
133 | }
|
---|
134 |
|
---|
135 | sub env {
|
---|
136 | my $k = Encode::encode(locale => shift);
|
---|
137 | my $old = $ENV{$k};
|
---|
138 | if (@_) {
|
---|
139 | my $v = shift;
|
---|
140 | if (defined $v) {
|
---|
141 | $ENV{$k} = Encode::encode(locale => $v);
|
---|
142 | }
|
---|
143 | else {
|
---|
144 | delete $ENV{$k};
|
---|
145 | }
|
---|
146 | }
|
---|
147 | return Encode::decode(locale => $old) if defined wantarray;
|
---|
148 | }
|
---|
149 |
|
---|
150 | 1;
|
---|
151 |
|
---|
152 | __END__
|
---|
153 |
|
---|
154 | =head1 NAME
|
---|
155 |
|
---|
156 | Encode::Locale - Determine the locale encoding
|
---|
157 |
|
---|
158 | =head1 SYNOPSIS
|
---|
159 |
|
---|
160 | use Encode::Locale;
|
---|
161 | use Encode;
|
---|
162 |
|
---|
163 | $string = decode(locale => $bytes);
|
---|
164 | $bytes = encode(locale => $string);
|
---|
165 |
|
---|
166 | if (-t) {
|
---|
167 | binmode(STDIN, ":encoding(console_in)");
|
---|
168 | binmode(STDOUT, ":encoding(console_out)");
|
---|
169 | binmode(STDERR, ":encoding(console_out)");
|
---|
170 | }
|
---|
171 |
|
---|
172 | # Processing file names passed in as arguments
|
---|
173 | my $uni_filename = decode(locale => $ARGV[0]);
|
---|
174 | open(my $fh, "<", encode(locale_fs => $uni_filename))
|
---|
175 | || die "Can't open '$uni_filename': $!";
|
---|
176 | binmode($fh, ":encoding(locale)");
|
---|
177 | ...
|
---|
178 |
|
---|
179 | =head1 DESCRIPTION
|
---|
180 |
|
---|
181 | In many applications it's wise to let Perl use Unicode for the strings it
|
---|
182 | processes. Most of the interfaces Perl has to the outside world are still byte
|
---|
183 | based. Programs therefore need to decode byte strings that enter the program
|
---|
184 | from the outside and encode them again on the way out.
|
---|
185 |
|
---|
186 | The POSIX locale system is used to specify both the language conventions
|
---|
187 | requested by the user and the preferred character set to consume and
|
---|
188 | output. The C<Encode::Locale> module looks up the charset and encoding (called
|
---|
189 | a CODESET in the locale jargon) and arranges for the L<Encode> module to know
|
---|
190 | this encoding under the name "locale". It means bytes obtained from the
|
---|
191 | environment can be converted to Unicode strings by calling C<<
|
---|
192 | Encode::encode(locale => $bytes) >> and converted back again with C<<
|
---|
193 | Encode::decode(locale => $string) >>.
|
---|
194 |
|
---|
195 | Where file systems interfaces pass file names in and out of the program we also
|
---|
196 | need care. The trend is for operating systems to use a fixed file encoding
|
---|
197 | that don't actually depend on the locale; and this module determines the most
|
---|
198 | appropriate encoding for file names. The L<Encode> module will know this
|
---|
199 | encoding under the name "locale_fs". For traditional Unix systems this will
|
---|
200 | be an alias to the same encoding as "locale".
|
---|
201 |
|
---|
202 | For programs running in a terminal window (called a "Console" on some systems)
|
---|
203 | the "locale" encoding is usually a good choice for what to expect as input and
|
---|
204 | output. Some systems allows us to query the encoding set for the terminal and
|
---|
205 | C<Encode::Locale> will do that if available and make these encodings known
|
---|
206 | under the C<Encode> aliases "console_in" and "console_out". For systems where
|
---|
207 | we can't determine the terminal encoding these will be aliased as the same
|
---|
208 | encoding as "locale". The advice is to use "console_in" for input known to
|
---|
209 | come from the terminal and "console_out" for output known to go from the
|
---|
210 | terminal.
|
---|
211 |
|
---|
212 | In addition to arranging for various Encode aliases the following functions and
|
---|
213 | variables are provided:
|
---|
214 |
|
---|
215 | =over
|
---|
216 |
|
---|
217 | =item decode_argv( )
|
---|
218 |
|
---|
219 | =item decode_argv( Encode::FB_CROAK )
|
---|
220 |
|
---|
221 | This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
|
---|
222 |
|
---|
223 | The function will by default replace characters that can't be decoded by
|
---|
224 | "\x{FFFD}", the Unicode replacement character.
|
---|
225 |
|
---|
226 | Any argument provided is passed as CHECK to underlying Encode::decode() call.
|
---|
227 | Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
|
---|
228 | command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
|
---|
229 | for details on other options for CHECK.
|
---|
230 |
|
---|
231 | =item env( $uni_key )
|
---|
232 |
|
---|
233 | =item env( $uni_key => $uni_value )
|
---|
234 |
|
---|
235 | Interface to get/set environment variables. Returns the current value as a
|
---|
236 | Unicode string. The $uni_key and $uni_value arguments are expected to be
|
---|
237 | Unicode strings as well. Passing C<undef> as $uni_value deletes the
|
---|
238 | environment variable named $uni_key.
|
---|
239 |
|
---|
240 | The returned value will have the characters that can't be decoded replaced by
|
---|
241 | "\x{FFFD}", the Unicode replacement character.
|
---|
242 |
|
---|
243 | There is no interface to request alternative CHECK behavior as for
|
---|
244 | decode_argv(). If you need that you need to call encode/decode yourself.
|
---|
245 | For example:
|
---|
246 |
|
---|
247 | my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
|
---|
248 | my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
|
---|
249 |
|
---|
250 | =item reinit( )
|
---|
251 |
|
---|
252 | =item reinit( $encoding )
|
---|
253 |
|
---|
254 | Reinitialize the encodings from the locale. You want to call this function if
|
---|
255 | you changed anything in the environment that might influence the locale.
|
---|
256 |
|
---|
257 | This function will croak if the determined encoding isn't recognized by
|
---|
258 | the Encode module.
|
---|
259 |
|
---|
260 | With argument force $ENCODING_... variables to set to the given value.
|
---|
261 |
|
---|
262 | =item $ENCODING_LOCALE
|
---|
263 |
|
---|
264 | The encoding name determined to be suitable for the current locale.
|
---|
265 | L<Encode> know this encoding as "locale".
|
---|
266 |
|
---|
267 | =item $ENCODING_LOCALE_FS
|
---|
268 |
|
---|
269 | The encoding name determined to be suiteable for file system interfaces
|
---|
270 | involving file names.
|
---|
271 | L<Encode> know this encoding as "locale_fs".
|
---|
272 |
|
---|
273 | =item $ENCODING_CONSOLE_IN
|
---|
274 |
|
---|
275 | =item $ENCODING_CONSOLE_OUT
|
---|
276 |
|
---|
277 | The encodings to be used for reading and writing output to the a console.
|
---|
278 | L<Encode> know these encodings as "console_in" and "console_out".
|
---|
279 |
|
---|
280 | =back
|
---|
281 |
|
---|
282 | =head1 NOTES
|
---|
283 |
|
---|
284 | This table summarizes the mapping of the encodings set up
|
---|
285 | by the C<Encode::Locale> module:
|
---|
286 |
|
---|
287 | Encode | | |
|
---|
288 | Alias | Windows | Mac OS X | POSIX
|
---|
289 | ------------+---------+--------------+------------
|
---|
290 | locale | ANSI | nl_langinfo | nl_langinfo
|
---|
291 | locale_fs | ANSI | UTF-8 | nl_langinfo
|
---|
292 | console_in | OEM | nl_langinfo | nl_langinfo
|
---|
293 | console_out | OEM | nl_langinfo | nl_langinfo
|
---|
294 |
|
---|
295 | =head2 Windows
|
---|
296 |
|
---|
297 | Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
|
---|
298 | strings) and a byte based API based a character set called ANSI. The
|
---|
299 | regular Perl interfaces to the OS currently only uses the ANSI APIs.
|
---|
300 | Unfortunately ANSI is not a single character set.
|
---|
301 |
|
---|
302 | The encoding that corresponds to ANSI varies between different editions of
|
---|
303 | Windows. For many western editions of Windows ANSI corresponds to CP-1252
|
---|
304 | which is a character set similar to ISO-8859-1. Conceptually the ANSI
|
---|
305 | character set is a similar concept to the POSIX locale CODESET so this module
|
---|
306 | figures out what the ANSI code page is and make this available as
|
---|
307 | $ENCODING_LOCALE and the "locale" Encoding alias.
|
---|
308 |
|
---|
309 | Windows systems also operate with another byte based character set.
|
---|
310 | It's called the OEM code page. This is the encoding that the Console
|
---|
311 | takes as input and output. It's common for the OEM code page to
|
---|
312 | differ from the ANSI code page.
|
---|
313 |
|
---|
314 | =head2 Mac OS X
|
---|
315 |
|
---|
316 | On Mac OS X the file system encoding is always UTF-8 while the locale
|
---|
317 | can otherwise be set up as normal for POSIX systems.
|
---|
318 |
|
---|
319 | File names on Mac OS X will at the OS-level be converted to
|
---|
320 | NFD-form. A file created by passing a NFC-filename will come
|
---|
321 | in NFD-form from readdir(). See L<Unicode::Normalize> for details
|
---|
322 | of NFD/NFC.
|
---|
323 |
|
---|
324 | Actually, Apple does not follow the Unicode NFD standard since not all
|
---|
325 | character ranges are decomposed. The claim is that this avoids problems with
|
---|
326 | round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
|
---|
327 | details.
|
---|
328 |
|
---|
329 | =head2 POSIX (Linux and other Unixes)
|
---|
330 |
|
---|
331 | File systems might vary in what encoding is to be used for
|
---|
332 | filenames. Since this module has no way to actually figure out
|
---|
333 | what the is correct it goes with the best guess which is to
|
---|
334 | assume filenames are encoding according to the current locale.
|
---|
335 | Users are advised to always specify UTF-8 as the locale charset.
|
---|
336 |
|
---|
337 | =head1 SEE ALSO
|
---|
338 |
|
---|
339 | L<I18N::Langinfo>, L<Encode>
|
---|
340 |
|
---|
341 | =head1 AUTHOR
|
---|
342 |
|
---|
343 | Copyright 2010 Gisle Aas <[email protected]>.
|
---|
344 |
|
---|
345 | This library is free software; you can redistribute it and/or
|
---|
346 | modify it under the same terms as Perl itself.
|
---|
347 |
|
---|
348 | =cut
|
---|