1 | package Encode::Alias;
|
---|
2 | use strict;
|
---|
3 | use warnings;
|
---|
4 | our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
---|
5 | use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
---|
6 |
|
---|
7 | use Exporter 'import';
|
---|
8 |
|
---|
9 | # Public, encouraged API is exported by default
|
---|
10 |
|
---|
11 | our @EXPORT =
|
---|
12 | qw (
|
---|
13 | define_alias
|
---|
14 | find_alias
|
---|
15 | );
|
---|
16 |
|
---|
17 | our @Alias; # ordered matching list
|
---|
18 | our %Alias; # cached known aliases
|
---|
19 |
|
---|
20 | sub find_alias {
|
---|
21 | my $class = shift;
|
---|
22 | my $find = shift;
|
---|
23 | unless ( exists $Alias{$find} ) {
|
---|
24 | $Alias{$find} = undef; # Recursion guard
|
---|
25 | for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
|
---|
26 | my $alias = $Alias[$i];
|
---|
27 | my $val = $Alias[ $i + 1 ];
|
---|
28 | my $new;
|
---|
29 | if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
|
---|
30 | DEBUG and warn "eval $val";
|
---|
31 | $new = eval $val;
|
---|
32 | DEBUG and $@ and warn "$val, $@";
|
---|
33 | }
|
---|
34 | elsif ( ref($alias) eq 'CODE' ) {
|
---|
35 | DEBUG and warn "$alias", "->", "($find)";
|
---|
36 | $new = $alias->($find);
|
---|
37 | }
|
---|
38 | elsif ( lc($find) eq lc($alias) ) {
|
---|
39 | $new = $val;
|
---|
40 | }
|
---|
41 | if ( defined($new) ) {
|
---|
42 | next if $new eq $find; # avoid (direct) recursion on bugs
|
---|
43 | DEBUG and warn "$alias, $new";
|
---|
44 | my $enc =
|
---|
45 | ( ref($new) ) ? $new : Encode::find_encoding($new);
|
---|
46 | if ($enc) {
|
---|
47 | $Alias{$find} = $enc;
|
---|
48 | last;
|
---|
49 | }
|
---|
50 | }
|
---|
51 | }
|
---|
52 |
|
---|
53 | # case insensitive search when canonical is not in all lowercase
|
---|
54 | # RT ticket #7835
|
---|
55 | unless ( $Alias{$find} ) {
|
---|
56 | my $lcfind = lc($find);
|
---|
57 | for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
|
---|
58 | {
|
---|
59 | $lcfind eq lc($name) or next;
|
---|
60 | $Alias{$find} = Encode::find_encoding($name);
|
---|
61 | DEBUG and warn "$find => $name";
|
---|
62 | }
|
---|
63 | }
|
---|
64 | }
|
---|
65 | if (DEBUG) {
|
---|
66 | my $name;
|
---|
67 | if ( my $e = $Alias{$find} ) {
|
---|
68 | $name = $e->name;
|
---|
69 | }
|
---|
70 | else {
|
---|
71 | $name = "";
|
---|
72 | }
|
---|
73 | warn "find_alias($class, $find)->name = $name";
|
---|
74 | }
|
---|
75 | return $Alias{$find};
|
---|
76 | }
|
---|
77 |
|
---|
78 | sub define_alias {
|
---|
79 | while (@_) {
|
---|
80 | my $alias = shift;
|
---|
81 | my $name = shift;
|
---|
82 | unshift( @Alias, $alias => $name ) # newer one has precedence
|
---|
83 | if defined $alias;
|
---|
84 | if ( ref($alias) ) {
|
---|
85 |
|
---|
86 | # clear %Alias cache to allow overrides
|
---|
87 | my @a = keys %Alias;
|
---|
88 | for my $k (@a) {
|
---|
89 | if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
|
---|
90 | DEBUG and warn "delete \$Alias\{$k\}";
|
---|
91 | delete $Alias{$k};
|
---|
92 | }
|
---|
93 | elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
|
---|
94 | DEBUG and warn "delete \$Alias\{$k\}";
|
---|
95 | delete $Alias{$k};
|
---|
96 | }
|
---|
97 | }
|
---|
98 | }
|
---|
99 | elsif (defined $alias) {
|
---|
100 | DEBUG and warn "delete \$Alias\{$alias\}";
|
---|
101 | delete $Alias{$alias};
|
---|
102 | }
|
---|
103 | elsif (DEBUG) {
|
---|
104 | require Carp;
|
---|
105 | Carp::croak("undef \$alias");
|
---|
106 | }
|
---|
107 | }
|
---|
108 | }
|
---|
109 |
|
---|
110 | # HACK: Encode must be used after define_alias is declarated as Encode calls define_alias
|
---|
111 | use Encode ();
|
---|
112 |
|
---|
113 | # Allow latin-1 style names as well
|
---|
114 | # 0 1 2 3 4 5 6 7 8 9 10
|
---|
115 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
|
---|
116 |
|
---|
117 | # Allow winlatin1 style names as well
|
---|
118 | our %Winlatin2cp = (
|
---|
119 | 'latin1' => 1252,
|
---|
120 | 'latin2' => 1250,
|
---|
121 | 'cyrillic' => 1251,
|
---|
122 | 'greek' => 1253,
|
---|
123 | 'turkish' => 1254,
|
---|
124 | 'hebrew' => 1255,
|
---|
125 | 'arabic' => 1256,
|
---|
126 | 'baltic' => 1257,
|
---|
127 | 'vietnamese' => 1258,
|
---|
128 | );
|
---|
129 |
|
---|
130 | init_aliases();
|
---|
131 |
|
---|
132 | sub undef_aliases {
|
---|
133 | @Alias = ();
|
---|
134 | %Alias = ();
|
---|
135 | }
|
---|
136 |
|
---|
137 | sub init_aliases {
|
---|
138 | undef_aliases();
|
---|
139 |
|
---|
140 | # Try all-lower-case version should all else fails
|
---|
141 | define_alias( qr/^(.*)$/ => '"\L$1"' );
|
---|
142 |
|
---|
143 | # UTF/UCS stuff
|
---|
144 | define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
|
---|
145 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
|
---|
146 | define_alias(
|
---|
147 | qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
|
---|
148 | qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")',
|
---|
149 | qr/^iso-10646-1$/i => '"UCS-2BE"'
|
---|
150 | );
|
---|
151 | define_alias(
|
---|
152 | qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
|
---|
153 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
|
---|
154 | qr/^UTF-?(16|32)$/i => '"UTF-$1"',
|
---|
155 | );
|
---|
156 |
|
---|
157 | # ASCII
|
---|
158 | define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
|
---|
159 | define_alias( 'C' => 'ascii' );
|
---|
160 | define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
|
---|
161 |
|
---|
162 | # Allow variants of iso-8859-1 etc.
|
---|
163 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
|
---|
164 |
|
---|
165 | # At least HP-UX has these.
|
---|
166 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
|
---|
167 |
|
---|
168 | # More HP stuff.
|
---|
169 | define_alias(
|
---|
170 | qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
|
---|
171 | '"${1}8"' );
|
---|
172 |
|
---|
173 | # The Official name of ASCII.
|
---|
174 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
|
---|
175 |
|
---|
176 | # This is a font issue, not an encoding issue.
|
---|
177 | # (The currency symbol of the Latin 1 upper half
|
---|
178 | # has been redefined as the euro symbol.)
|
---|
179 | define_alias( qr/^(.+)\@euro$/i => '"$1"' );
|
---|
180 |
|
---|
181 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
|
---|
182 | 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
|
---|
183 | );
|
---|
184 |
|
---|
185 | define_alias(
|
---|
186 | qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
|
---|
187 | hebrew|arabic|baltic|vietnamese)$/ix =>
|
---|
188 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
|
---|
189 | );
|
---|
190 |
|
---|
191 | # Common names for non-latin preferred MIME names
|
---|
192 | define_alias(
|
---|
193 | 'ascii' => 'US-ascii',
|
---|
194 | 'cyrillic' => 'iso-8859-5',
|
---|
195 | 'arabic' => 'iso-8859-6',
|
---|
196 | 'greek' => 'iso-8859-7',
|
---|
197 | 'hebrew' => 'iso-8859-8',
|
---|
198 | 'thai' => 'iso-8859-11',
|
---|
199 | );
|
---|
200 | # RT #20781
|
---|
201 | define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
|
---|
202 |
|
---|
203 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
|
---|
204 | # And Microsoft has their own naming (again, surprisingly).
|
---|
205 | # And windows-* is registered in IANA!
|
---|
206 | define_alias(
|
---|
207 | qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
|
---|
208 |
|
---|
209 | # Sometimes seen with a leading zero.
|
---|
210 | # define_alias( qr/\bcp037\b/i => '"cp37"');
|
---|
211 |
|
---|
212 | # Mac Mappings
|
---|
213 | # predefined in *.ucm; unneeded
|
---|
214 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
|
---|
215 | define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
|
---|
216 | # http://rt.cpan.org/Ticket/Display.html?id=36326
|
---|
217 | define_alias( qr/^macintosh$/i => '"MacRoman"' );
|
---|
218 | # https://rt.cpan.org/Ticket/Display.html?id=78125
|
---|
219 | define_alias( qr/^macce$/i => '"MacCentralEurRoman"' );
|
---|
220 | # Ououououou. gone. They are different!
|
---|
221 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
|
---|
222 |
|
---|
223 | # Standardize on the dashed versions.
|
---|
224 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
|
---|
225 |
|
---|
226 | unless ($Encode::ON_EBCDIC) {
|
---|
227 |
|
---|
228 | # for Encode::CN
|
---|
229 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
|
---|
230 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
|
---|
231 |
|
---|
232 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
|
---|
233 | # CP936 doesn't have vendor-addon for GBK, so they're identical.
|
---|
234 | define_alias( qr/^gbk$/i => '"cp936"' );
|
---|
235 |
|
---|
236 | # This fixes gb2312 vs. euc-cn confusion, practically
|
---|
237 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
|
---|
238 |
|
---|
239 | # for Encode::JP
|
---|
240 | define_alias( qr/\bjis$/i => '"7bit-jis"' );
|
---|
241 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
|
---|
242 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
|
---|
243 | define_alias( qr/\bujis$/i => '"euc-jp"' );
|
---|
244 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
|
---|
245 | define_alias( qr/\bsjis$/i => '"shiftjis"' );
|
---|
246 | define_alias( qr/\bwindows-31j$/i => '"cp932"' );
|
---|
247 |
|
---|
248 | # for Encode::KR
|
---|
249 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
|
---|
250 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
|
---|
251 |
|
---|
252 | # This fixes ksc5601 vs. euc-kr confusion, practically
|
---|
253 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
|
---|
254 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
|
---|
255 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
|
---|
256 |
|
---|
257 | # for Encode::TW
|
---|
258 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
|
---|
259 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
|
---|
260 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
|
---|
261 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
|
---|
262 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
|
---|
263 | }
|
---|
264 |
|
---|
265 | # https://github.com/dankogai/p5-encode/issues/37
|
---|
266 | define_alias(qr/cp65000/i => '"UTF-7"');
|
---|
267 | define_alias(qr/cp65001/i => '"utf-8-strict"');
|
---|
268 |
|
---|
269 | # utf8 is blessed :)
|
---|
270 | define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
|
---|
271 |
|
---|
272 | # At last, Map white space and _ to '-'
|
---|
273 | define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' );
|
---|
274 | }
|
---|
275 |
|
---|
276 | 1;
|
---|
277 | __END__
|
---|
278 |
|
---|
279 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
|
---|
280 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15
|
---|
281 | # TODO: Cyrillic encoding ISO-IR-111 (useful?)
|
---|
282 | # TODO: Armenian encoding ARMSCII-8
|
---|
283 | # TODO: Hebrew encoding ISO-8859-8-1
|
---|
284 | # TODO: Thai encoding TCVN
|
---|
285 | # TODO: Vietnamese encodings VPS
|
---|
286 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
|
---|
287 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
|
---|
288 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
|
---|
289 | # Kannada Khmer Korean Laotian Malayalam Mongolian
|
---|
290 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
|
---|
291 |
|
---|
292 | =head1 NAME
|
---|
293 |
|
---|
294 | Encode::Alias - alias definitions to encodings
|
---|
295 |
|
---|
296 | =head1 SYNOPSIS
|
---|
297 |
|
---|
298 | use Encode;
|
---|
299 | use Encode::Alias;
|
---|
300 | define_alias( "newName" => ENCODING);
|
---|
301 | define_alias( qr/.../ => ENCODING);
|
---|
302 | define_alias( sub { return ENCODING if ...; } );
|
---|
303 |
|
---|
304 | =head1 DESCRIPTION
|
---|
305 |
|
---|
306 | Allows newName to be used as an alias for ENCODING. ENCODING may be
|
---|
307 | either the name of an encoding or an encoding object (as described
|
---|
308 | in L<Encode>).
|
---|
309 |
|
---|
310 | Currently the first argument to define_alias() can be specified in the
|
---|
311 | following ways:
|
---|
312 |
|
---|
313 | =over 4
|
---|
314 |
|
---|
315 | =item As a simple string.
|
---|
316 |
|
---|
317 | =item As a qr// compiled regular expression, e.g.:
|
---|
318 |
|
---|
319 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
|
---|
320 |
|
---|
321 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
|
---|
322 | in order to allow C<$1> etc. to be substituted. The example is one
|
---|
323 | way to alias names as used in X11 fonts to the MIME names for the
|
---|
324 | iso-8859-* family. Note the double quotes inside the single quotes.
|
---|
325 |
|
---|
326 | (or, you don't have to do this yourself because this example is predefined)
|
---|
327 |
|
---|
328 | If you are using a regex here, you have to use the quotes as shown or
|
---|
329 | it won't work. Also note that regex handling is tricky even for the
|
---|
330 | experienced. Use this feature with caution.
|
---|
331 |
|
---|
332 | =item As a code reference, e.g.:
|
---|
333 |
|
---|
334 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
---|
335 |
|
---|
336 | The same effect as the example above in a different way. The coderef
|
---|
337 | takes the alias name as an argument and returns a canonical name on
|
---|
338 | success or undef if not. Note the second argument is ignored if provided.
|
---|
339 | Use this with even more caution than the regex version.
|
---|
340 |
|
---|
341 | =back
|
---|
342 |
|
---|
343 | =head3 Changes in code reference aliasing
|
---|
344 |
|
---|
345 | As of Encode 1.87, the older form
|
---|
346 |
|
---|
347 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
---|
348 |
|
---|
349 | no longer works.
|
---|
350 |
|
---|
351 | Encode up to 1.86 internally used "local $_" to implement this older
|
---|
352 | form. But consider the code below;
|
---|
353 |
|
---|
354 | use Encode;
|
---|
355 | $_ = "eeeee" ;
|
---|
356 | while (/(e)/g) {
|
---|
357 | my $utf = decode('aliased-encoding-name', $1);
|
---|
358 | print "position:",pos,"\n";
|
---|
359 | }
|
---|
360 |
|
---|
361 | Prior to Encode 1.86 this fails because of "local $_".
|
---|
362 |
|
---|
363 | =head2 Alias overloading
|
---|
364 |
|
---|
365 | You can override predefined aliases by simply applying define_alias().
|
---|
366 | The new alias is always evaluated first, and when necessary,
|
---|
367 | define_alias() flushes the internal cache to make the new definition
|
---|
368 | available.
|
---|
369 |
|
---|
370 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
|
---|
371 | # superset of SHIFT_JIS
|
---|
372 |
|
---|
373 | define_alias( qr/shift.*jis$/i => '"cp932"' );
|
---|
374 | define_alias( qr/sjis$/i => '"cp932"' );
|
---|
375 |
|
---|
376 | If you want to zap all predefined aliases, you can use
|
---|
377 |
|
---|
378 | Encode::Alias->undef_aliases;
|
---|
379 |
|
---|
380 | to do so. And
|
---|
381 |
|
---|
382 | Encode::Alias->init_aliases;
|
---|
383 |
|
---|
384 | gets the factory settings back.
|
---|
385 |
|
---|
386 | Note that define_alias() will not be able to override the canonical name
|
---|
387 | of encodings. Encodings are first looked up by canonical name before
|
---|
388 | potential aliases are tried.
|
---|
389 |
|
---|
390 | =head1 SEE ALSO
|
---|
391 |
|
---|
392 | L<Encode>, L<Encode::Supported>
|
---|
393 |
|
---|
394 | =cut
|
---|
395 |
|
---|