source: main/trunk/greenstone2/perllib/cpan/Encode/Locale.pm.newForLocalIssue@ 33721

Last change on this file since 33721 was 33721, checked in by ak19, 4 years ago

Inactive but committing to svn: Newer Locale.pm file, and introducing Alias.pm, and change to import Alias.pm in DirectoryPlugin.pm. These changes are for supporting perl to run in unicode (or perl to run perl code written in unicode), when using Windows in a non-English locale like Chinese. These changes were required and tested on Windows 10 in Chinese locale and other region (time) and display settings set for China.

File size: 12.1 KB
Line 
1package Encode::Locale;
2
3use strict;
4our $VERSION = "1.05";
5
6use base 'Exporter';
7our @EXPORT_OK = qw(
8 decode_argv env
9 $ENCODING_LOCALE $ENCODING_LOCALE_FS
10 $ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
11);
12
13use Encode ();
14use Encode::Alias ();
15
16our $ENCODING_LOCALE;
17our $ENCODING_LOCALE_FS;
18our $ENCODING_CONSOLE_IN;
19our $ENCODING_CONSOLE_OUT;
20
21sub DEBUG () { 0 }
22
23sub _init {
24 if ($^O eq "MSWin32") {
25 unless ($ENCODING_LOCALE) {
26 # Try to obtain what the Windows ANSI code page is
27 eval {
28 unless (defined &GetACP) {
29 require Win32;
30 eval { Win32::GetACP() };
31 *GetACP = sub { &Win32::GetACP } unless $@;
32 }
33 unless (defined &GetACP) {
34 require Win32::API;
35 Win32::API->Import('kernel32', 'int GetACP()');
36 }
37 if (defined &GetACP) {
38 my $cp = GetACP();
39 $ENCODING_LOCALE = "cp$cp" if $cp;
40 }
41 };
42 }
43
44 unless ($ENCODING_CONSOLE_IN) {
45 # only test one since set together
46 unless (defined &GetInputCP) {
47 eval {
48 require Win32;
49 eval { Win32::GetConsoleCP() };
50 # manually "import" it since Win32->import refuses
51 *GetInputCP = sub { &Win32::GetConsoleCP } unless $@;
52 *GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@;
53 };
54 unless (defined &GetInputCP) {
55 eval {
56 # try Win32::Console module for codepage to use
57 require Win32::Console;
58 eval { Win32::Console::InputCP() };
59 *GetInputCP = sub { &Win32::Console::InputCP }
60 unless $@;
61 *GetOutputCP = sub { &Win32::Console::OutputCP }
62 unless $@;
63 };
64 }
65 unless (defined &GetInputCP) {
66 # final fallback
67 *GetInputCP = *GetOutputCP = sub {
68 # another fallback that could work is:
69 # reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
70 ((qx(chcp) || '') =~ /^Active code page: (\d+)/)
71 ? $1 : ();
72 };
73 }
74 }
75 my $cp = GetInputCP();
76 $ENCODING_CONSOLE_IN = "cp$cp" if $cp;
77 $cp = GetOutputCP();
78 $ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
79 }
80 }
81
82 unless ($ENCODING_LOCALE) {
83 eval {
84 require I18N::Langinfo;
85 $ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
86
87 # Workaround of Encode < v2.25. The "646" encoding alias was
88 # introduced in Encode-2.25, but we don't want to require that version
89 # quite yet. Should avoid the CPAN testers failure reported from
90 # openbsd-4.7/perl-5.10.0 combo.
91 $ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
92
93 # https://rt.cpan.org/Ticket/Display.html?id=66373
94 $ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
95 };
96 $ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
97 }
98
99 if ($^O eq "darwin") {
100 $ENCODING_LOCALE_FS ||= "UTF-8";
101 }
102
103 # final fallback
104 $ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
105 $ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
106 $ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
107 $ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
108
109 unless (Encode::find_encoding($ENCODING_LOCALE)) {
110 my $foundit;
111 if (lc($ENCODING_LOCALE) eq "gb18030") {
112 eval {
113 require Encode::HanExtra;
114 };
115 if ($@) {
116 die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
117 }
118 $foundit++ if Encode::find_encoding($ENCODING_LOCALE);
119 }
120 die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
121 unless $foundit;
122
123 }
124
125 # use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
126}
127
128_init();
129Encode::Alias::define_alias(sub {
130 no strict 'refs';
131 no warnings 'once';
132 return ${"ENCODING_" . uc(shift)};
133}, "locale");
134
135sub _flush_aliases {
136 no strict 'refs';
137 for my $a (keys %Encode::Alias::Alias) {
138 if (defined ${"ENCODING_" . uc($a)}) {
139 delete $Encode::Alias::Alias{$a};
140 warn "Flushed alias cache for $a" if DEBUG;
141 }
142 }
143}
144
145sub reinit {
146 $ENCODING_LOCALE = shift;
147 $ENCODING_LOCALE_FS = shift;
148 $ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
149 $ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
150 _init();
151 _flush_aliases();
152}
153
154sub decode_argv {
155 die if defined wantarray;
156 for (@ARGV) {
157 $_ = Encode::decode(locale => $_, @_);
158 }
159}
160
161sub env {
162 my $k = Encode::encode(locale => shift);
163 my $old = $ENV{$k};
164 if (@_) {
165 my $v = shift;
166 if (defined $v) {
167 $ENV{$k} = Encode::encode(locale => $v);
168 }
169 else {
170 delete $ENV{$k};
171 }
172 }
173 return Encode::decode(locale => $old) if defined wantarray;
174}
175
1761;
177
178__END__
179
180=head1 NAME
181
182Encode::Locale - Determine the locale encoding
183
184=head1 SYNOPSIS
185
186 use Encode::Locale;
187 use Encode;
188
189 $string = decode(locale => $bytes);
190 $bytes = encode(locale => $string);
191
192 if (-t) {
193 binmode(STDIN, ":encoding(console_in)");
194 binmode(STDOUT, ":encoding(console_out)");
195 binmode(STDERR, ":encoding(console_out)");
196 }
197
198 # Processing file names passed in as arguments
199 my $uni_filename = decode(locale => $ARGV[0]);
200 open(my $fh, "<", encode(locale_fs => $uni_filename))
201 || die "Can't open '$uni_filename': $!";
202 binmode($fh, ":encoding(locale)");
203 ...
204
205=head1 DESCRIPTION
206
207In many applications it's wise to let Perl use Unicode for the strings it
208processes. Most of the interfaces Perl has to the outside world are still byte
209based. Programs therefore need to decode byte strings that enter the program
210from the outside and encode them again on the way out.
211
212The POSIX locale system is used to specify both the language conventions
213requested by the user and the preferred character set to consume and
214output. The C<Encode::Locale> module looks up the charset and encoding (called
215a CODESET in the locale jargon) and arranges for the L<Encode> module to know
216this encoding under the name "locale". It means bytes obtained from the
217environment can be converted to Unicode strings by calling C<<
218Encode::encode(locale => $bytes) >> and converted back again with C<<
219Encode::decode(locale => $string) >>.
220
221Where file systems interfaces pass file names in and out of the program we also
222need care. The trend is for operating systems to use a fixed file encoding
223that don't actually depend on the locale; and this module determines the most
224appropriate encoding for file names. The L<Encode> module will know this
225encoding under the name "locale_fs". For traditional Unix systems this will
226be an alias to the same encoding as "locale".
227
228For programs running in a terminal window (called a "Console" on some systems)
229the "locale" encoding is usually a good choice for what to expect as input and
230output. Some systems allows us to query the encoding set for the terminal and
231C<Encode::Locale> will do that if available and make these encodings known
232under the C<Encode> aliases "console_in" and "console_out". For systems where
233we can't determine the terminal encoding these will be aliased as the same
234encoding as "locale". The advice is to use "console_in" for input known to
235come from the terminal and "console_out" for output to the terminal.
236
237In addition to arranging for various Encode aliases the following functions and
238variables are provided:
239
240=over
241
242=item decode_argv( )
243
244=item decode_argv( Encode::FB_CROAK )
245
246This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
247
248The function will by default replace characters that can't be decoded by
249"\x{FFFD}", the Unicode replacement character.
250
251Any argument provided is passed as CHECK to underlying Encode::decode() call.
252Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
253command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
254for details on other options for CHECK.
255
256=item env( $uni_key )
257
258=item env( $uni_key => $uni_value )
259
260Interface to get/set environment variables. Returns the current value as a
261Unicode string. The $uni_key and $uni_value arguments are expected to be
262Unicode strings as well. Passing C<undef> as $uni_value deletes the
263environment variable named $uni_key.
264
265The returned value will have the characters that can't be decoded replaced by
266"\x{FFFD}", the Unicode replacement character.
267
268There is no interface to request alternative CHECK behavior as for
269decode_argv(). If you need that you need to call encode/decode yourself.
270For example:
271
272 my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
273 my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
274
275=item reinit( )
276
277=item reinit( $encoding )
278
279Reinitialize the encodings from the locale. You want to call this function if
280you changed anything in the environment that might influence the locale.
281
282This function will croak if the determined encoding isn't recognized by
283the Encode module.
284
285With argument force $ENCODING_... variables to set to the given value.
286
287=item $ENCODING_LOCALE
288
289The encoding name determined to be suitable for the current locale.
290L<Encode> know this encoding as "locale".
291
292=item $ENCODING_LOCALE_FS
293
294The encoding name determined to be suitable for file system interfaces
295involving file names.
296L<Encode> know this encoding as "locale_fs".
297
298=item $ENCODING_CONSOLE_IN
299
300=item $ENCODING_CONSOLE_OUT
301
302The encodings to be used for reading and writing output to the a console.
303L<Encode> know these encodings as "console_in" and "console_out".
304
305=back
306
307=head1 NOTES
308
309This table summarizes the mapping of the encodings set up
310by the C<Encode::Locale> module:
311
312 Encode | | |
313 Alias | Windows | Mac OS X | POSIX
314 ------------+---------+--------------+------------
315 locale | ANSI | nl_langinfo | nl_langinfo
316 locale_fs | ANSI | UTF-8 | nl_langinfo
317 console_in | OEM | nl_langinfo | nl_langinfo
318 console_out | OEM | nl_langinfo | nl_langinfo
319
320=head2 Windows
321
322Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
323strings) and a byte based API based a character set called ANSI. The
324regular Perl interfaces to the OS currently only uses the ANSI APIs.
325Unfortunately ANSI is not a single character set.
326
327The encoding that corresponds to ANSI varies between different editions of
328Windows. For many western editions of Windows ANSI corresponds to CP-1252
329which is a character set similar to ISO-8859-1. Conceptually the ANSI
330character set is a similar concept to the POSIX locale CODESET so this module
331figures out what the ANSI code page is and make this available as
332$ENCODING_LOCALE and the "locale" Encoding alias.
333
334Windows systems also operate with another byte based character set.
335It's called the OEM code page. This is the encoding that the Console
336takes as input and output. It's common for the OEM code page to
337differ from the ANSI code page.
338
339=head2 Mac OS X
340
341On Mac OS X the file system encoding is always UTF-8 while the locale
342can otherwise be set up as normal for POSIX systems.
343
344File names on Mac OS X will at the OS-level be converted to
345NFD-form. A file created by passing a NFC-filename will come
346in NFD-form from readdir(). See L<Unicode::Normalize> for details
347of NFD/NFC.
348
349Actually, Apple does not follow the Unicode NFD standard since not all
350character ranges are decomposed. The claim is that this avoids problems with
351round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
352details.
353
354=head2 POSIX (Linux and other Unixes)
355
356File systems might vary in what encoding is to be used for
357filenames. Since this module has no way to actually figure out
358what the is correct it goes with the best guess which is to
359assume filenames are encoding according to the current locale.
360Users are advised to always specify UTF-8 as the locale charset.
361
362=head1 SEE ALSO
363
364L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
365
366=head1 AUTHOR
367
368Copyright 2010 Gisle Aas <[email protected]>.
369
370This library is free software; you can redistribute it and/or
371modify it under the same terms as Perl itself.
372
373=cut
Note: See TracBrowser for help on using the repository browser.