1 | package Encode::Alias;
|
---|
2 | use strict;
|
---|
3 | no warnings 'redefine';
|
---|
4 | use Encode;
|
---|
5 | our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
6 | sub DEBUG () { 0 }
|
---|
7 |
|
---|
8 | use base qw(Exporter);
|
---|
9 |
|
---|
10 | # Public, encouraged API is exported by default
|
---|
11 |
|
---|
12 | our @EXPORT =
|
---|
13 | qw (
|
---|
14 | define_alias
|
---|
15 | find_alias
|
---|
16 | );
|
---|
17 |
|
---|
18 | our @Alias; # ordered matching list
|
---|
19 | our %Alias; # cached known aliases
|
---|
20 |
|
---|
21 | sub find_alias{
|
---|
22 | my $class = shift;
|
---|
23 | my $find = shift;
|
---|
24 | unless (exists $Alias{$find}) {
|
---|
25 | $Alias{$find} = undef; # Recursion guard
|
---|
26 | for (my $i=0; $i < @Alias; $i += 2){
|
---|
27 | my $alias = $Alias[$i];
|
---|
28 | my $val = $Alias[$i+1];
|
---|
29 | my $new;
|
---|
30 | if (ref($alias) eq 'Regexp' && $find =~ $alias){
|
---|
31 | DEBUG and warn "eval $val";
|
---|
32 | $new = eval $val;
|
---|
33 | DEBUG and $@ and warn "$val, $@";
|
---|
34 | }elsif (ref($alias) eq 'CODE'){
|
---|
35 | DEBUG and warn "$alias", "->", "($find)";
|
---|
36 | $new = $alias->($find);
|
---|
37 | }elsif (lc($find) eq lc($alias)){
|
---|
38 | $new = $val;
|
---|
39 | }
|
---|
40 | if (defined($new)){
|
---|
41 | next if $new eq $find; # avoid (direct) recursion on bugs
|
---|
42 | DEBUG and warn "$alias, $new";
|
---|
43 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
|
---|
44 | if ($enc){
|
---|
45 | $Alias{$find} = $enc;
|
---|
46 | last;
|
---|
47 | }
|
---|
48 | }
|
---|
49 | }
|
---|
50 | # case insensitive search when canonical is not in all lowercase
|
---|
51 | # RT ticket #7835
|
---|
52 | unless ($Alias{$find}){
|
---|
53 | my $lcfind = lc($find);
|
---|
54 | for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
|
---|
55 | $lcfind eq lc($name) or next;
|
---|
56 | $Alias{$find} = Encode::find_encoding($name);
|
---|
57 | DEBUG and warn "$find => $name";
|
---|
58 | }
|
---|
59 | }
|
---|
60 | }
|
---|
61 | if (DEBUG){
|
---|
62 | my $name;
|
---|
63 | if (my $e = $Alias{$find}){
|
---|
64 | $name = $e->name;
|
---|
65 | }else{
|
---|
66 | $name = "";
|
---|
67 | }
|
---|
68 | warn "find_alias($class, $find)->name = $name";
|
---|
69 | }
|
---|
70 | return $Alias{$find};
|
---|
71 | }
|
---|
72 |
|
---|
73 | sub define_alias{
|
---|
74 | while (@_){
|
---|
75 | my ($alias,$name) = splice(@_,0,2);
|
---|
76 | unshift(@Alias, $alias => $name); # newer one has precedence
|
---|
77 | if (ref($alias)){
|
---|
78 | # clear %Alias cache to allow overrides
|
---|
79 | my @a = keys %Alias;
|
---|
80 | for my $k (@a){
|
---|
81 | if (ref($alias) eq 'Regexp' && $k =~ $alias){
|
---|
82 | DEBUG and warn "delete \$Alias\{$k\}";
|
---|
83 | delete $Alias{$k};
|
---|
84 | }
|
---|
85 | elsif (ref($alias) eq 'CODE'){
|
---|
86 | DEBUG and warn "delete \$Alias\{$k\}";
|
---|
87 | delete $Alias{$alias->($name)};
|
---|
88 | }
|
---|
89 | }
|
---|
90 | }else{
|
---|
91 | DEBUG and warn "delete \$Alias\{$alias\}";
|
---|
92 | delete $Alias{$alias};
|
---|
93 | }
|
---|
94 | }
|
---|
95 | }
|
---|
96 |
|
---|
97 | # Allow latin-1 style names as well
|
---|
98 | # 0 1 2 3 4 5 6 7 8 9 10
|
---|
99 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
|
---|
100 | # Allow winlatin1 style names as well
|
---|
101 | our %Winlatin2cp = (
|
---|
102 | 'latin1' => 1252,
|
---|
103 | 'latin2' => 1250,
|
---|
104 | 'cyrillic' => 1251,
|
---|
105 | 'greek' => 1253,
|
---|
106 | 'turkish' => 1254,
|
---|
107 | 'hebrew' => 1255,
|
---|
108 | 'arabic' => 1256,
|
---|
109 | 'baltic' => 1257,
|
---|
110 | 'vietnamese' => 1258,
|
---|
111 | );
|
---|
112 |
|
---|
113 | init_aliases();
|
---|
114 |
|
---|
115 | sub undef_aliases{
|
---|
116 | @Alias = ();
|
---|
117 | %Alias = ();
|
---|
118 | }
|
---|
119 |
|
---|
120 | sub init_aliases
|
---|
121 | {
|
---|
122 | undef_aliases();
|
---|
123 | # Try all-lower-case version should all else fails
|
---|
124 | define_alias( qr/^(.*)$/ => '"\L$1"' );
|
---|
125 |
|
---|
126 | # UTF/UCS stuff
|
---|
127 | define_alias( qr/^UTF-?7$/i => '"UTF-7"');
|
---|
128 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
|
---|
129 | define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
|
---|
130 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
|
---|
131 | qr/^iso-10646-1$/i => '"UCS-2BE"' );
|
---|
132 | define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
|
---|
133 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
|
---|
134 | qr/^UTF-?(16|32)$/i => '"UTF-$1"',
|
---|
135 | );
|
---|
136 | # ASCII
|
---|
137 | define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
|
---|
138 | define_alias('C' => 'ascii');
|
---|
139 | define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
|
---|
140 | # Allow variants of iso-8859-1 etc.
|
---|
141 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
|
---|
142 |
|
---|
143 | # At least HP-UX has these.
|
---|
144 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
|
---|
145 |
|
---|
146 | # More HP stuff.
|
---|
147 | define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
|
---|
148 |
|
---|
149 | # The Official name of ASCII.
|
---|
150 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
|
---|
151 |
|
---|
152 | # This is a font issue, not an encoding issue.
|
---|
153 | # (The currency symbol of the Latin 1 upper half
|
---|
154 | # has been redefined as the euro symbol.)
|
---|
155 | define_alias( qr/^(.+)\@euro$/i => '"$1"' );
|
---|
156 |
|
---|
157 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
|
---|
158 | => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
|
---|
159 |
|
---|
160 | define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
|
---|
161 | hebrew|arabic|baltic|vietnamese)$/ix =>
|
---|
162 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
|
---|
163 |
|
---|
164 | # Common names for non-latin preferred MIME names
|
---|
165 | define_alias( 'ascii' => 'US-ascii',
|
---|
166 | 'cyrillic' => 'iso-8859-5',
|
---|
167 | 'arabic' => 'iso-8859-6',
|
---|
168 | 'greek' => 'iso-8859-7',
|
---|
169 | 'hebrew' => 'iso-8859-8',
|
---|
170 | 'thai' => 'iso-8859-11',
|
---|
171 | 'tis620' => 'iso-8859-11',
|
---|
172 | );
|
---|
173 |
|
---|
174 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
|
---|
175 | # And Microsoft has their own naming (again, surprisingly).
|
---|
176 | # And windows-* is registered in IANA!
|
---|
177 | define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
|
---|
178 |
|
---|
179 | # Sometimes seen with a leading zero.
|
---|
180 | # define_alias( qr/\bcp037\b/i => '"cp37"');
|
---|
181 |
|
---|
182 | # Mac Mappings
|
---|
183 | # predefined in *.ucm; unneeded
|
---|
184 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
|
---|
185 | define_alias( qr/^mac_(.*)$/i => '"mac$1"');
|
---|
186 | # Ououououou. gone. They are differente!
|
---|
187 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
|
---|
188 |
|
---|
189 | # Standardize on the dashed versions.
|
---|
190 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
|
---|
191 |
|
---|
192 | unless ($Encode::ON_EBCDIC){
|
---|
193 | # for Encode::CN
|
---|
194 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
|
---|
195 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
|
---|
196 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
|
---|
197 | # CP936 doesn't have vendor-addon for GBK, so they're identical.
|
---|
198 | define_alias( qr/^gbk$/i => '"cp936"');
|
---|
199 | # This fixes gb2312 vs. euc-cn confusion, practically
|
---|
200 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
|
---|
201 | # for Encode::JP
|
---|
202 | define_alias( qr/\bjis$/i => '"7bit-jis"' );
|
---|
203 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
|
---|
204 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
|
---|
205 | define_alias( qr/\bujis$/i => '"euc-jp"' );
|
---|
206 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
|
---|
207 | define_alias( qr/\bsjis$/i => '"shiftjis"' );
|
---|
208 | define_alias( qr/\bwindows-31j$/i => '"cp932"' );
|
---|
209 | # for Encode::KR
|
---|
210 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
|
---|
211 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
|
---|
212 | # This fixes ksc5601 vs. euc-kr confusion, practically
|
---|
213 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
|
---|
214 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
|
---|
215 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
|
---|
216 | # for Encode::TW
|
---|
217 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
|
---|
218 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
|
---|
219 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
|
---|
220 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
|
---|
221 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
|
---|
222 | }
|
---|
223 | # utf8 is blessed :)
|
---|
224 | define_alias( qr/^UTF-8$/i => '"utf-8-strict"');
|
---|
225 | # At last, Map white space and _ to '-'
|
---|
226 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
|
---|
227 | }
|
---|
228 |
|
---|
229 | 1;
|
---|
230 | __END__
|
---|
231 |
|
---|
232 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
|
---|
233 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15
|
---|
234 | # TODO: Cyrillic encoding ISO-IR-111 (useful?)
|
---|
235 | # TODO: Armenian encoding ARMSCII-8
|
---|
236 | # TODO: Hebrew encoding ISO-8859-8-1
|
---|
237 | # TODO: Thai encoding TCVN
|
---|
238 | # TODO: Vietnamese encodings VPS
|
---|
239 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
|
---|
240 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
|
---|
241 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
|
---|
242 | # Kannada Khmer Korean Laotian Malayalam Mongolian
|
---|
243 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
|
---|
244 |
|
---|
245 | =head1 NAME
|
---|
246 |
|
---|
247 | Encode::Alias - alias definitions to encodings
|
---|
248 |
|
---|
249 | =head1 SYNOPSIS
|
---|
250 |
|
---|
251 | use Encode;
|
---|
252 | use Encode::Alias;
|
---|
253 | define_alias( newName => ENCODING);
|
---|
254 |
|
---|
255 | =head1 DESCRIPTION
|
---|
256 |
|
---|
257 | Allows newName to be used as an alias for ENCODING. ENCODING may be
|
---|
258 | either the name of an encoding or an encoding object (as described
|
---|
259 | in L<Encode>).
|
---|
260 |
|
---|
261 | Currently I<newName> can be specified in the following ways:
|
---|
262 |
|
---|
263 | =over 4
|
---|
264 |
|
---|
265 | =item As a simple string.
|
---|
266 |
|
---|
267 | =item As a qr// compiled regular expression, e.g.:
|
---|
268 |
|
---|
269 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
|
---|
270 |
|
---|
271 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
|
---|
272 | in order to allow C<$1> etc. to be substituted. The example is one
|
---|
273 | way to alias names as used in X11 fonts to the MIME names for the
|
---|
274 | iso-8859-* family. Note the double quotes inside the single quotes.
|
---|
275 |
|
---|
276 | (or, you don't have to do this yourself because this example is predefined)
|
---|
277 |
|
---|
278 | If you are using a regex here, you have to use the quotes as shown or
|
---|
279 | it won't work. Also note that regex handling is tricky even for the
|
---|
280 | experienced. Use this feature with caution.
|
---|
281 |
|
---|
282 | =item As a code reference, e.g.:
|
---|
283 |
|
---|
284 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
---|
285 |
|
---|
286 | The same effect as the example above in a different way. The coderef
|
---|
287 | takes the alias name as an argument and returns a canonical name on
|
---|
288 | success or undef if not. Note the second argument is not required.
|
---|
289 | Use this with even more caution than the regex version.
|
---|
290 |
|
---|
291 | =back
|
---|
292 |
|
---|
293 | =head3 Changes in code reference aliasing
|
---|
294 |
|
---|
295 | As of Encode 1.87, the older form
|
---|
296 |
|
---|
297 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
---|
298 |
|
---|
299 | no longer works.
|
---|
300 |
|
---|
301 | Encode up to 1.86 internally used "local $_" to implement ths older
|
---|
302 | form. But consider the code below;
|
---|
303 |
|
---|
304 | use Encode;
|
---|
305 | $_ = "eeeee" ;
|
---|
306 | while (/(e)/g) {
|
---|
307 | my $utf = decode('aliased-encoding-name', $1);
|
---|
308 | print "position:",pos,"\n";
|
---|
309 | }
|
---|
310 |
|
---|
311 | Prior to Encode 1.86 this fails because of "local $_".
|
---|
312 |
|
---|
313 | =head2 Alias overloading
|
---|
314 |
|
---|
315 | You can override predefined aliases by simply applying define_alias().
|
---|
316 | The new alias is always evaluated first, and when necessary,
|
---|
317 | define_alias() flushes the internal cache to make the new definition
|
---|
318 | available.
|
---|
319 |
|
---|
320 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
|
---|
321 | # superset of SHIFT_JIS
|
---|
322 |
|
---|
323 | define_alias( qr/shift.*jis$/i => '"cp932"' );
|
---|
324 | define_alias( qr/sjis$/i => '"cp932"' );
|
---|
325 |
|
---|
326 | If you want to zap all predefined aliases, you can use
|
---|
327 |
|
---|
328 | Encode::Alias->undef_aliases;
|
---|
329 |
|
---|
330 | to do so. And
|
---|
331 |
|
---|
332 | Encode::Alias->init_aliases;
|
---|
333 |
|
---|
334 | gets the factory settings back.
|
---|
335 |
|
---|
336 | =head1 SEE ALSO
|
---|
337 |
|
---|
338 | L<Encode>, L<Encode::Supported>
|
---|
339 |
|
---|
340 | =cut
|
---|
341 |
|
---|