source: for-distributions/trunk/bin/windows/perl/lib/Encode/Alias.pm@ 14489

Last change on this file since 14489 was 14489, checked in by oranfry, 17 years ago

upgrading to perl 5.8

File size: 10.3 KB
Line 
1package Encode::Alias;
2use strict;
3no warnings 'redefine';
4use Encode;
5our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
6sub DEBUG () { 0 }
7
8use base qw(Exporter);
9
10# Public, encouraged API is exported by default
11
12our @EXPORT =
13 qw (
14 define_alias
15 find_alias
16 );
17
18our @Alias; # ordered matching list
19our %Alias; # cached known aliases
20
21sub find_alias{
22 my $class = shift;
23 my $find = shift;
24 unless (exists $Alias{$find}) {
25 $Alias{$find} = undef; # Recursion guard
26 for (my $i=0; $i < @Alias; $i += 2){
27 my $alias = $Alias[$i];
28 my $val = $Alias[$i+1];
29 my $new;
30 if (ref($alias) eq 'Regexp' && $find =~ $alias){
31 DEBUG and warn "eval $val";
32 $new = eval $val;
33 DEBUG and $@ and warn "$val, $@";
34 }elsif (ref($alias) eq 'CODE'){
35 DEBUG and warn "$alias", "->", "($find)";
36 $new = $alias->($find);
37 }elsif (lc($find) eq lc($alias)){
38 $new = $val;
39 }
40 if (defined($new)){
41 next if $new eq $find; # avoid (direct) recursion on bugs
42 DEBUG and warn "$alias, $new";
43 my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
44 if ($enc){
45 $Alias{$find} = $enc;
46 last;
47 }
48 }
49 }
50 # case insensitive search when canonical is not in all lowercase
51 # RT ticket #7835
52 unless ($Alias{$find}){
53 my $lcfind = lc($find);
54 for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
55 $lcfind eq lc($name) or next;
56 $Alias{$find} = Encode::find_encoding($name);
57 DEBUG and warn "$find => $name";
58 }
59 }
60 }
61 if (DEBUG){
62 my $name;
63 if (my $e = $Alias{$find}){
64 $name = $e->name;
65 }else{
66 $name = "";
67 }
68 warn "find_alias($class, $find)->name = $name";
69 }
70 return $Alias{$find};
71}
72
73sub define_alias{
74 while (@_){
75 my ($alias,$name) = splice(@_,0,2);
76 unshift(@Alias, $alias => $name); # newer one has precedence
77 if (ref($alias)){
78 # clear %Alias cache to allow overrides
79 my @a = keys %Alias;
80 for my $k (@a){
81 if (ref($alias) eq 'Regexp' && $k =~ $alias){
82 DEBUG and warn "delete \$Alias\{$k\}";
83 delete $Alias{$k};
84 }
85 elsif (ref($alias) eq 'CODE'){
86 DEBUG and warn "delete \$Alias\{$k\}";
87 delete $Alias{$alias->($name)};
88 }
89 }
90 }else{
91 DEBUG and warn "delete \$Alias\{$alias\}";
92 delete $Alias{$alias};
93 }
94 }
95}
96
97# Allow latin-1 style names as well
98# 0 1 2 3 4 5 6 7 8 9 10
99our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
100# Allow winlatin1 style names as well
101our %Winlatin2cp = (
102 'latin1' => 1252,
103 'latin2' => 1250,
104 'cyrillic' => 1251,
105 'greek' => 1253,
106 'turkish' => 1254,
107 'hebrew' => 1255,
108 'arabic' => 1256,
109 'baltic' => 1257,
110 'vietnamese' => 1258,
111 );
112
113init_aliases();
114
115sub undef_aliases{
116 @Alias = ();
117 %Alias = ();
118}
119
120sub init_aliases
121{
122 undef_aliases();
123 # Try all-lower-case version should all else fails
124 define_alias( qr/^(.*)$/ => '"\L$1"' );
125
126 # UTF/UCS stuff
127 define_alias( qr/^UTF-?7$/i => '"UTF-7"');
128 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
129 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
130 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
131 qr/^iso-10646-1$/i => '"UCS-2BE"' );
132 define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
133 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
134 qr/^UTF-?(16|32)$/i => '"UTF-$1"',
135 );
136 # ASCII
137 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
138 define_alias('C' => 'ascii');
139 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
140 # Allow variants of iso-8859-1 etc.
141 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
142
143 # At least HP-UX has these.
144 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
145
146 # More HP stuff.
147 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
148
149 # The Official name of ASCII.
150 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
151
152 # This is a font issue, not an encoding issue.
153 # (The currency symbol of the Latin 1 upper half
154 # has been redefined as the euro symbol.)
155 define_alias( qr/^(.+)\@euro$/i => '"$1"' );
156
157 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
158 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
159
160 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
161 hebrew|arabic|baltic|vietnamese)$/ix =>
162 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
163
164 # Common names for non-latin preferred MIME names
165 define_alias( 'ascii' => 'US-ascii',
166 'cyrillic' => 'iso-8859-5',
167 'arabic' => 'iso-8859-6',
168 'greek' => 'iso-8859-7',
169 'hebrew' => 'iso-8859-8',
170 'thai' => 'iso-8859-11',
171 'tis620' => 'iso-8859-11',
172 );
173
174 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
175 # And Microsoft has their own naming (again, surprisingly).
176 # And windows-* is registered in IANA!
177 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
178
179 # Sometimes seen with a leading zero.
180 # define_alias( qr/\bcp037\b/i => '"cp37"');
181
182 # Mac Mappings
183 # predefined in *.ucm; unneeded
184 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
185 define_alias( qr/^mac_(.*)$/i => '"mac$1"');
186 # Ououououou. gone. They are differente!
187 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
188
189 # Standardize on the dashed versions.
190 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
191
192 unless ($Encode::ON_EBCDIC){
193 # for Encode::CN
194 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
195 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
196 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
197 # CP936 doesn't have vendor-addon for GBK, so they're identical.
198 define_alias( qr/^gbk$/i => '"cp936"');
199 # This fixes gb2312 vs. euc-cn confusion, practically
200 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
201 # for Encode::JP
202 define_alias( qr/\bjis$/i => '"7bit-jis"' );
203 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
204 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
205 define_alias( qr/\bujis$/i => '"euc-jp"' );
206 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
207 define_alias( qr/\bsjis$/i => '"shiftjis"' );
208 define_alias( qr/\bwindows-31j$/i => '"cp932"' );
209 # for Encode::KR
210 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
211 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
212 # This fixes ksc5601 vs. euc-kr confusion, practically
213 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
214 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
215 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
216 # for Encode::TW
217 define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
218 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
219 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
220 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
221 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
222 }
223 # utf8 is blessed :)
224 define_alias( qr/^UTF-8$/i => '"utf-8-strict"');
225 # At last, Map white space and _ to '-'
226 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
227}
228
2291;
230__END__
231
232# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
233# TODO: HP-UX '15' encodings japanese15 korean15 roi15
234# TODO: Cyrillic encoding ISO-IR-111 (useful?)
235# TODO: Armenian encoding ARMSCII-8
236# TODO: Hebrew encoding ISO-8859-8-1
237# TODO: Thai encoding TCVN
238# TODO: Vietnamese encodings VPS
239# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
240# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
241# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
242# Kannada Khmer Korean Laotian Malayalam Mongolian
243# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
244
245=head1 NAME
246
247Encode::Alias - alias definitions to encodings
248
249=head1 SYNOPSIS
250
251 use Encode;
252 use Encode::Alias;
253 define_alias( newName => ENCODING);
254
255=head1 DESCRIPTION
256
257Allows newName to be used as an alias for ENCODING. ENCODING may be
258either the name of an encoding or an encoding object (as described
259in L<Encode>).
260
261Currently I<newName> can be specified in the following ways:
262
263=over 4
264
265=item As a simple string.
266
267=item As a qr// compiled regular expression, e.g.:
268
269 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
270
271In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
272in order to allow C<$1> etc. to be substituted. The example is one
273way to alias names as used in X11 fonts to the MIME names for the
274iso-8859-* family. Note the double quotes inside the single quotes.
275
276(or, you don't have to do this yourself because this example is predefined)
277
278If you are using a regex here, you have to use the quotes as shown or
279it won't work. Also note that regex handling is tricky even for the
280experienced. Use this feature with caution.
281
282=item As a code reference, e.g.:
283
284 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
285
286The same effect as the example above in a different way. The coderef
287takes the alias name as an argument and returns a canonical name on
288success or undef if not. Note the second argument is not required.
289Use this with even more caution than the regex version.
290
291=back
292
293=head3 Changes in code reference aliasing
294
295As of Encode 1.87, the older form
296
297 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
298
299no longer works.
300
301Encode up to 1.86 internally used "local $_" to implement ths older
302form. But consider the code below;
303
304 use Encode;
305 $_ = "eeeee" ;
306 while (/(e)/g) {
307 my $utf = decode('aliased-encoding-name', $1);
308 print "position:",pos,"\n";
309 }
310
311Prior to Encode 1.86 this fails because of "local $_".
312
313=head2 Alias overloading
314
315You can override predefined aliases by simply applying define_alias().
316The new alias is always evaluated first, and when necessary,
317define_alias() flushes the internal cache to make the new definition
318available.
319
320 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
321 # superset of SHIFT_JIS
322
323 define_alias( qr/shift.*jis$/i => '"cp932"' );
324 define_alias( qr/sjis$/i => '"cp932"' );
325
326If you want to zap all predefined aliases, you can use
327
328 Encode::Alias->undef_aliases;
329
330to do so. And
331
332 Encode::Alias->init_aliases;
333
334gets the factory settings back.
335
336=head1 SEE ALSO
337
338L<Encode>, L<Encode::Supported>
339
340=cut
341
Note: See TracBrowser for help on using the repository browser.