1 |
|
---|
2 | # Time-stamp: "2004-10-06 23:26:33 ADT"
|
---|
3 | # Sean M. Burke <[email protected]>
|
---|
4 |
|
---|
5 | require 5.000;
|
---|
6 | package I18N::LangTags;
|
---|
7 | use strict;
|
---|
8 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
|
---|
9 | require Exporter;
|
---|
10 | @ISA = qw(Exporter);
|
---|
11 | @EXPORT = qw();
|
---|
12 | @EXPORT_OK = qw(is_language_tag same_language_tag
|
---|
13 | extract_language_tags super_languages
|
---|
14 | similarity_language_tag is_dialect_of
|
---|
15 | locale2language_tag alternate_language_tags
|
---|
16 | encode_language_tag panic_languages
|
---|
17 | implicate_supers
|
---|
18 | implicate_supers_strictly
|
---|
19 | );
|
---|
20 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
|
---|
21 |
|
---|
22 | $VERSION = "0.35";
|
---|
23 |
|
---|
24 | sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
|
---|
25 |
|
---|
26 |
|
---|
27 | =head1 NAME
|
---|
28 |
|
---|
29 | I18N::LangTags - functions for dealing with RFC3066-style language tags
|
---|
30 |
|
---|
31 | =head1 SYNOPSIS
|
---|
32 |
|
---|
33 | use I18N::LangTags();
|
---|
34 |
|
---|
35 | ...or specify whichever of those functions you want to import, like so:
|
---|
36 |
|
---|
37 | use I18N::LangTags qw(implicate_supers similarity_language_tag);
|
---|
38 |
|
---|
39 | All the exportable functions are listed below -- you're free to import
|
---|
40 | only some, or none at all. By default, none are imported. If you
|
---|
41 | say:
|
---|
42 |
|
---|
43 | use I18N::LangTags qw(:ALL)
|
---|
44 |
|
---|
45 | ...then all are exported. (This saves you from having to use
|
---|
46 | something less obvious like C<use I18N::LangTags qw(/./)>.)
|
---|
47 |
|
---|
48 | If you don't import any of these functions, assume a C<&I18N::LangTags::>
|
---|
49 | in front of all the function names in the following examples.
|
---|
50 |
|
---|
51 | =head1 DESCRIPTION
|
---|
52 |
|
---|
53 | Language tags are a formalism, described in RFC 3066 (obsoleting
|
---|
54 | 1766), for declaring what language form (language and possibly
|
---|
55 | dialect) a given chunk of information is in.
|
---|
56 |
|
---|
57 | This library provides functions for common tasks involving language
|
---|
58 | tags as they are needed in a variety of protocols and applications.
|
---|
59 |
|
---|
60 | Please see the "See Also" references for a thorough explanation
|
---|
61 | of how to correctly use language tags.
|
---|
62 |
|
---|
63 | =over
|
---|
64 |
|
---|
65 | =cut
|
---|
66 |
|
---|
67 | ###########################################################################
|
---|
68 |
|
---|
69 | =item * the function is_language_tag($lang1)
|
---|
70 |
|
---|
71 | Returns true iff $lang1 is a formally valid language tag.
|
---|
72 |
|
---|
73 | is_language_tag("fr") is TRUE
|
---|
74 | is_language_tag("x-jicarilla") is FALSE
|
---|
75 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
|
---|
76 |
|
---|
77 | is_language_tag("sgn-US") is TRUE
|
---|
78 | (That's American Sign Language)
|
---|
79 |
|
---|
80 | is_language_tag("i-Klikitat") is TRUE
|
---|
81 | (True without regard to the fact noone has actually
|
---|
82 | registered Klikitat -- it's a formally valid tag)
|
---|
83 |
|
---|
84 | is_language_tag("fr-patois") is TRUE
|
---|
85 | (Formally valid -- altho descriptively weak!)
|
---|
86 |
|
---|
87 | is_language_tag("Spanish") is FALSE
|
---|
88 | is_language_tag("french-patois") is FALSE
|
---|
89 | (No good -- first subtag has to match
|
---|
90 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
|
---|
91 |
|
---|
92 | is_language_tag("x-borg-prot2532") is TRUE
|
---|
93 | (Yes, subtags can contain digits, as of RFC3066)
|
---|
94 |
|
---|
95 | =cut
|
---|
96 |
|
---|
97 | sub is_language_tag {
|
---|
98 |
|
---|
99 | ## Changes in the language tagging standards may have to be reflected here.
|
---|
100 |
|
---|
101 | my($tag) = lc($_[0]);
|
---|
102 |
|
---|
103 | return 0 if $tag eq "i" or $tag eq "x";
|
---|
104 | # Bad degenerate cases that the following
|
---|
105 | # regexp would erroneously let pass
|
---|
106 |
|
---|
107 | return $tag =~
|
---|
108 | /^(?: # First subtag
|
---|
109 | [xi] | [a-z]{2,3}
|
---|
110 | )
|
---|
111 | (?: # Subtags thereafter
|
---|
112 | - # separator
|
---|
113 | [a-z0-9]{1,8} # subtag
|
---|
114 | )*
|
---|
115 | $/xs ? 1 : 0;
|
---|
116 | }
|
---|
117 |
|
---|
118 | ###########################################################################
|
---|
119 |
|
---|
120 | =item * the function extract_language_tags($whatever)
|
---|
121 |
|
---|
122 | Returns a list of whatever looks like formally valid language tags
|
---|
123 | in $whatever. Not very smart, so don't get too creative with
|
---|
124 | what you want to feed it.
|
---|
125 |
|
---|
126 | extract_language_tags("fr, fr-ca, i-mingo")
|
---|
127 | returns: ('fr', 'fr-ca', 'i-mingo')
|
---|
128 |
|
---|
129 | extract_language_tags("It's like this: I'm in fr -- French!")
|
---|
130 | returns: ('It', 'in', 'fr')
|
---|
131 | (So don't just feed it any old thing.)
|
---|
132 |
|
---|
133 | The output is untainted. If you don't know what tainting is,
|
---|
134 | don't worry about it.
|
---|
135 |
|
---|
136 | =cut
|
---|
137 |
|
---|
138 | sub extract_language_tags {
|
---|
139 |
|
---|
140 | ## Changes in the language tagging standards may have to be reflected here.
|
---|
141 |
|
---|
142 | my($text) =
|
---|
143 | $_[0] =~ m/(.+)/ # to make for an untainted result
|
---|
144 | ? $1 : ''
|
---|
145 | ;
|
---|
146 |
|
---|
147 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
|
---|
148 | $text =~
|
---|
149 | m/
|
---|
150 | \b
|
---|
151 | (?: # First subtag
|
---|
152 | [iIxX] | [a-zA-Z]{2,3}
|
---|
153 | )
|
---|
154 | (?: # Subtags thereafter
|
---|
155 | - # separator
|
---|
156 | [a-zA-Z0-9]{1,8} # subtag
|
---|
157 | )*
|
---|
158 | \b
|
---|
159 | /xsg
|
---|
160 | );
|
---|
161 | }
|
---|
162 |
|
---|
163 | ###########################################################################
|
---|
164 |
|
---|
165 | =item * the function same_language_tag($lang1, $lang2)
|
---|
166 |
|
---|
167 | Returns true iff $lang1 and $lang2 are acceptable variant tags
|
---|
168 | representing the same language-form.
|
---|
169 |
|
---|
170 | same_language_tag('x-kadara', 'i-kadara') is TRUE
|
---|
171 | (The x/i- alternation doesn't matter)
|
---|
172 | same_language_tag('X-KADARA', 'i-kadara') is TRUE
|
---|
173 | (...and neither does case)
|
---|
174 | same_language_tag('en', 'en-US') is FALSE
|
---|
175 | (all-English is not the SAME as US English)
|
---|
176 | same_language_tag('x-kadara', 'x-kadar') is FALSE
|
---|
177 | (these are totally unrelated tags)
|
---|
178 | same_language_tag('no-bok', 'nb') is TRUE
|
---|
179 | (no-bok is a legacy tag for nb (Norwegian Bokmal))
|
---|
180 |
|
---|
181 | C<same_language_tag> works by just seeing whether
|
---|
182 | C<encode_language_tag($lang1)> is the same as
|
---|
183 | C<encode_language_tag($lang2)>.
|
---|
184 |
|
---|
185 | (Yes, I know this function is named a bit oddly. Call it historic
|
---|
186 | reasons.)
|
---|
187 |
|
---|
188 | =cut
|
---|
189 |
|
---|
190 | sub same_language_tag {
|
---|
191 | my $el1 = &encode_language_tag($_[0]);
|
---|
192 | return 0 unless defined $el1;
|
---|
193 | # this avoids the problem of
|
---|
194 | # encode_language_tag($lang1) eq and encode_language_tag($lang2)
|
---|
195 | # being true if $lang1 and $lang2 are both undef
|
---|
196 |
|
---|
197 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
|
---|
198 | }
|
---|
199 |
|
---|
200 | ###########################################################################
|
---|
201 |
|
---|
202 | =item * the function similarity_language_tag($lang1, $lang2)
|
---|
203 |
|
---|
204 | Returns an integer representing the degree of similarity between
|
---|
205 | tags $lang1 and $lang2 (the order of which does not matter), where
|
---|
206 | similarity is the number of common elements on the left,
|
---|
207 | without regard to case and to x/i- alternation.
|
---|
208 |
|
---|
209 | similarity_language_tag('fr', 'fr-ca') is 1
|
---|
210 | (one element in common)
|
---|
211 | similarity_language_tag('fr-ca', 'fr-FR') is 1
|
---|
212 | (one element in common)
|
---|
213 |
|
---|
214 | similarity_language_tag('fr-CA-joual',
|
---|
215 | 'fr-CA-PEI') is 2
|
---|
216 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
|
---|
217 | (two elements in common)
|
---|
218 |
|
---|
219 | similarity_language_tag('x-kadara', 'i-kadara') is 1
|
---|
220 | (x/i- doesn't matter)
|
---|
221 |
|
---|
222 | similarity_language_tag('en', 'x-kadar') is 0
|
---|
223 | similarity_language_tag('x-kadara', 'x-kadar') is 0
|
---|
224 | (unrelated tags -- no similarity)
|
---|
225 |
|
---|
226 | similarity_language_tag('i-cree-syllabic',
|
---|
227 | 'i-cherokee-syllabic') is 0
|
---|
228 | (no B<leftmost> elements in common!)
|
---|
229 |
|
---|
230 | =cut
|
---|
231 |
|
---|
232 | sub similarity_language_tag {
|
---|
233 | my $lang1 = &encode_language_tag($_[0]);
|
---|
234 | my $lang2 = &encode_language_tag($_[1]);
|
---|
235 | # And encode_language_tag takes care of the whole
|
---|
236 | # no-nyn==nn, i-hakka==zh-hakka, etc, things
|
---|
237 |
|
---|
238 | # NB: (i-sil-...)? (i-sgn-...)?
|
---|
239 |
|
---|
240 | return undef if !defined($lang1) and !defined($lang2);
|
---|
241 | return 0 if !defined($lang1) or !defined($lang2);
|
---|
242 |
|
---|
243 | my @l1_subtags = split('-', $lang1);
|
---|
244 | my @l2_subtags = split('-', $lang2);
|
---|
245 | my $similarity = 0;
|
---|
246 |
|
---|
247 | while(@l1_subtags and @l2_subtags) {
|
---|
248 | if(shift(@l1_subtags) eq shift(@l2_subtags)) {
|
---|
249 | ++$similarity;
|
---|
250 | } else {
|
---|
251 | last;
|
---|
252 | }
|
---|
253 | }
|
---|
254 | return $similarity;
|
---|
255 | }
|
---|
256 |
|
---|
257 | ###########################################################################
|
---|
258 |
|
---|
259 | =item * the function is_dialect_of($lang1, $lang2)
|
---|
260 |
|
---|
261 | Returns true iff language tag $lang1 represents a subform of
|
---|
262 | language tag $lang2.
|
---|
263 |
|
---|
264 | B<Get the order right! It doesn't work the other way around!>
|
---|
265 |
|
---|
266 | is_dialect_of('en-US', 'en') is TRUE
|
---|
267 | (American English IS a dialect of all-English)
|
---|
268 |
|
---|
269 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
|
---|
270 | is_dialect_of('fr-CA-joual', 'fr') is TRUE
|
---|
271 | (Joual is a dialect of (a dialect of) French)
|
---|
272 |
|
---|
273 | is_dialect_of('en', 'en-US') is FALSE
|
---|
274 | (all-English is a NOT dialect of American English)
|
---|
275 |
|
---|
276 | is_dialect_of('fr', 'en-CA') is FALSE
|
---|
277 |
|
---|
278 | is_dialect_of('en', 'en' ) is TRUE
|
---|
279 | is_dialect_of('en-US', 'en-US') is TRUE
|
---|
280 | (B<Note:> these are degenerate cases)
|
---|
281 |
|
---|
282 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
|
---|
283 | (the x/i thing doesn't matter, nor does case)
|
---|
284 |
|
---|
285 | is_dialect_of('nn', 'no') is TRUE
|
---|
286 | (because 'nn' (New Norse) is aliased to 'no-nyn',
|
---|
287 | as a special legacy case, and 'no-nyn' is a
|
---|
288 | subform of 'no' (Norwegian))
|
---|
289 |
|
---|
290 | =cut
|
---|
291 |
|
---|
292 | sub is_dialect_of {
|
---|
293 |
|
---|
294 | my $lang1 = &encode_language_tag($_[0]);
|
---|
295 | my $lang2 = &encode_language_tag($_[1]);
|
---|
296 |
|
---|
297 | return undef if !defined($lang1) and !defined($lang2);
|
---|
298 | return 0 if !defined($lang1) or !defined($lang2);
|
---|
299 |
|
---|
300 | return 1 if $lang1 eq $lang2;
|
---|
301 | return 0 if length($lang1) < length($lang2);
|
---|
302 |
|
---|
303 | $lang1 .= '-';
|
---|
304 | $lang2 .= '-';
|
---|
305 | return
|
---|
306 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
|
---|
307 | }
|
---|
308 |
|
---|
309 | ###########################################################################
|
---|
310 |
|
---|
311 | =item * the function super_languages($lang1)
|
---|
312 |
|
---|
313 | Returns a list of language tags that are superordinate tags to $lang1
|
---|
314 | -- it gets this by removing subtags from the end of $lang1 until
|
---|
315 | nothing (or just "i" or "x") is left.
|
---|
316 |
|
---|
317 | super_languages("fr-CA-joual") is ("fr-CA", "fr")
|
---|
318 |
|
---|
319 | super_languages("en-AU") is ("en")
|
---|
320 |
|
---|
321 | super_languages("en") is empty-list, ()
|
---|
322 |
|
---|
323 | super_languages("i-cherokee") is empty-list, ()
|
---|
324 | ...not ("i"), which would be illegal as well as pointless.
|
---|
325 |
|
---|
326 | If $lang1 is not a valid language tag, returns empty-list in
|
---|
327 | a list context, undef in a scalar context.
|
---|
328 |
|
---|
329 | A notable and rather unavoidable problem with this method:
|
---|
330 | "x-mingo-tom" has an "x" because the whole tag isn't an
|
---|
331 | IANA-registered tag -- but super_languages('x-mingo-tom') is
|
---|
332 | ('x-mingo') -- which isn't really right, since 'i-mingo' is
|
---|
333 | registered. But this module has no way of knowing that. (But note
|
---|
334 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
|
---|
335 |
|
---|
336 | More importantly, you assume I<at your peril> that superordinates of
|
---|
337 | $lang1 are mutually intelligible with $lang1. Consider this
|
---|
338 | carefully.
|
---|
339 |
|
---|
340 | =cut
|
---|
341 |
|
---|
342 | sub super_languages {
|
---|
343 | my $lang1 = $_[0];
|
---|
344 | return() unless defined($lang1) && &is_language_tag($lang1);
|
---|
345 |
|
---|
346 | # a hack for those annoying new (2001) tags:
|
---|
347 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
|
---|
348 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
|
---|
349 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
|
---|
350 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
|
---|
351 |
|
---|
352 | my @l1_subtags = split('-', $lang1);
|
---|
353 |
|
---|
354 | ## Changes in the language tagging standards may have to be reflected here.
|
---|
355 |
|
---|
356 | # NB: (i-sil-...)?
|
---|
357 |
|
---|
358 | my @supers = ();
|
---|
359 | foreach my $bit (@l1_subtags) {
|
---|
360 | push @supers,
|
---|
361 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
|
---|
362 | }
|
---|
363 | pop @supers if @supers;
|
---|
364 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
|
---|
365 | return reverse @supers;
|
---|
366 | }
|
---|
367 |
|
---|
368 | ###########################################################################
|
---|
369 |
|
---|
370 | =item * the function locale2language_tag($locale_identifier)
|
---|
371 |
|
---|
372 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
|
---|
373 | and maps it to a language tag. If it's not mappable (as with,
|
---|
374 | notably, "C" and "POSIX"), this returns empty-list in a list context,
|
---|
375 | or undef in a scalar context.
|
---|
376 |
|
---|
377 | locale2language_tag("en") is "en"
|
---|
378 |
|
---|
379 | locale2language_tag("en_US") is "en-US"
|
---|
380 |
|
---|
381 | locale2language_tag("en_US.ISO8859-1") is "en-US"
|
---|
382 |
|
---|
383 | locale2language_tag("C") is undef or ()
|
---|
384 |
|
---|
385 | locale2language_tag("POSIX") is undef or ()
|
---|
386 |
|
---|
387 | locale2language_tag("POSIX") is undef or ()
|
---|
388 |
|
---|
389 | I'm not totally sure that locale names map satisfactorily to language
|
---|
390 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
|
---|
391 |
|
---|
392 | The output is untainted. If you don't know what tainting is,
|
---|
393 | don't worry about it.
|
---|
394 |
|
---|
395 | =cut
|
---|
396 |
|
---|
397 | sub locale2language_tag {
|
---|
398 | my $lang =
|
---|
399 | $_[0] =~ m/(.+)/ # to make for an untainted result
|
---|
400 | ? $1 : ''
|
---|
401 | ;
|
---|
402 |
|
---|
403 | return $lang if &is_language_tag($lang); # like "en"
|
---|
404 |
|
---|
405 | $lang =~ tr<_><->; # "en_US" -> en-US
|
---|
406 | $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US
|
---|
407 | # it_IT.utf8@euro => it-IT
|
---|
408 |
|
---|
409 | return $lang if &is_language_tag($lang);
|
---|
410 |
|
---|
411 | return;
|
---|
412 | }
|
---|
413 |
|
---|
414 | ###########################################################################
|
---|
415 |
|
---|
416 | =item * the function encode_language_tag($lang1)
|
---|
417 |
|
---|
418 | This function, if given a language tag, returns an encoding of it such
|
---|
419 | that:
|
---|
420 |
|
---|
421 | * tags representing different languages never get the same encoding.
|
---|
422 |
|
---|
423 | * tags representing the same language always get the same encoding.
|
---|
424 |
|
---|
425 | * an encoding of a formally valid language tag always is a string
|
---|
426 | value that is defined, has length, and is true if considered as a
|
---|
427 | boolean.
|
---|
428 |
|
---|
429 | Note that the encoding itself is B<not> a formally valid language tag.
|
---|
430 | Note also that you cannot, currently, go from an encoding back to a
|
---|
431 | language tag that it's an encoding of.
|
---|
432 |
|
---|
433 | Note also that you B<must> consider the encoded value as atomic; i.e.,
|
---|
434 | you should not consider it as anything but an opaque, unanalysable
|
---|
435 | string value. (The internals of the encoding method may change in
|
---|
436 | future versions, as the language tagging standard changes over time.)
|
---|
437 |
|
---|
438 | C<encode_language_tag> returns undef if given anything other than a
|
---|
439 | formally valid language tag.
|
---|
440 |
|
---|
441 | The reason C<encode_language_tag> exists is because different language
|
---|
442 | tags may represent the same language; this is normally treatable with
|
---|
443 | C<same_language_tag>, but consider this situation:
|
---|
444 |
|
---|
445 | You have a data file that expresses greetings in different languages.
|
---|
446 | Its format is "[language tag]=[how to say 'Hello']", like:
|
---|
447 |
|
---|
448 | en-US=Hiho
|
---|
449 | fr=Bonjour
|
---|
450 | i-mingo=Hau'
|
---|
451 |
|
---|
452 | And suppose you write a program that reads that file and then runs as
|
---|
453 | a daemon, answering client requests that specify a language tag and
|
---|
454 | then expect the string that says how to greet in that language. So an
|
---|
455 | interaction looks like:
|
---|
456 |
|
---|
457 | greeting-client asks: fr
|
---|
458 | greeting-server answers: Bonjour
|
---|
459 |
|
---|
460 | So far so good. But suppose the way you're implementing this is:
|
---|
461 |
|
---|
462 | my %greetings;
|
---|
463 | die unless open(IN, "<in.dat");
|
---|
464 | while(<IN>) {
|
---|
465 | chomp;
|
---|
466 | next unless /^([^=]+)=(.+)/s;
|
---|
467 | my($lang, $expr) = ($1, $2);
|
---|
468 | $greetings{$lang} = $expr;
|
---|
469 | }
|
---|
470 | close(IN);
|
---|
471 |
|
---|
472 | at which point %greetings has the contents:
|
---|
473 |
|
---|
474 | "en-US" => "Hiho"
|
---|
475 | "fr" => "Bonjour"
|
---|
476 | "i-mingo" => "Hau'"
|
---|
477 |
|
---|
478 | And suppose then that you answer client requests for language $wanted
|
---|
479 | by just looking up $greetings{$wanted}.
|
---|
480 |
|
---|
481 | If the client asks for "fr", that will look up successfully in
|
---|
482 | %greetings, to the value "Bonjour". And if the client asks for
|
---|
483 | "i-mingo", that will look up successfully in %greetings, to the value
|
---|
484 | "Hau'".
|
---|
485 |
|
---|
486 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
|
---|
487 | lookup in %greetings fails. That's the Wrong Thing.
|
---|
488 |
|
---|
489 | You could instead do lookups on $wanted with:
|
---|
490 |
|
---|
491 | use I18N::LangTags qw(same_language_tag);
|
---|
492 | my $repsonse = '';
|
---|
493 | foreach my $l2 (keys %greetings) {
|
---|
494 | if(same_language_tag($wanted, $l2)) {
|
---|
495 | $response = $greetings{$l2};
|
---|
496 | last;
|
---|
497 | }
|
---|
498 | }
|
---|
499 |
|
---|
500 | But that's rather inefficient. A better way to do it is to start your
|
---|
501 | program with:
|
---|
502 |
|
---|
503 | use I18N::LangTags qw(encode_language_tag);
|
---|
504 | my %greetings;
|
---|
505 | die unless open(IN, "<in.dat");
|
---|
506 | while(<IN>) {
|
---|
507 | chomp;
|
---|
508 | next unless /^([^=]+)=(.+)/s;
|
---|
509 | my($lang, $expr) = ($1, $2);
|
---|
510 | $greetings{
|
---|
511 | encode_language_tag($lang)
|
---|
512 | } = $expr;
|
---|
513 | }
|
---|
514 | close(IN);
|
---|
515 |
|
---|
516 | and then just answer client requests for language $wanted by just
|
---|
517 | looking up
|
---|
518 |
|
---|
519 | $greetings{encode_language_tag($wanted)}
|
---|
520 |
|
---|
521 | And that does the Right Thing.
|
---|
522 |
|
---|
523 | =cut
|
---|
524 |
|
---|
525 | sub encode_language_tag {
|
---|
526 | # Only similarity_language_tag() is allowed to analyse encodings!
|
---|
527 |
|
---|
528 | ## Changes in the language tagging standards may have to be reflected here.
|
---|
529 |
|
---|
530 | my($tag) = $_[0] || return undef;
|
---|
531 | return undef unless &is_language_tag($tag);
|
---|
532 |
|
---|
533 | # For the moment, these legacy variances are few enough that
|
---|
534 | # we can just handle them here with regexps.
|
---|
535 | $tag =~ s/^iw\b/he/i; # Hebrew
|
---|
536 | $tag =~ s/^in\b/id/i; # Indonesian
|
---|
537 | $tag =~ s/^cre\b/cr/i; # Cree
|
---|
538 | $tag =~ s/^jw\b/jv/i; # Javanese
|
---|
539 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
|
---|
540 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
|
---|
541 | $tag =~ s/^ji\b/yi/i; # Yiddish
|
---|
542 | # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
|
---|
543 | # but maybe they're all so obscure I can ignore them. "Obscure"
|
---|
544 | # meaning either that the language is obscure, and/or that the
|
---|
545 | # XXX form was extant so briefly that it's unlikely it was ever
|
---|
546 | # used. I hope.
|
---|
547 | #
|
---|
548 | # These go FROM the simplex to complex form, to get
|
---|
549 | # similarity-comparison right. And that's okay, since
|
---|
550 | # similarity_language_tag is the only thing that
|
---|
551 | # analyzes our output.
|
---|
552 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
|
---|
553 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
|
---|
554 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
|
---|
555 |
|
---|
556 | $tag =~ s/^[xiXI]-//s;
|
---|
557 | # Just lop off any leading "x/i-"
|
---|
558 |
|
---|
559 | return "~" . uc($tag);
|
---|
560 | }
|
---|
561 |
|
---|
562 | #--------------------------------------------------------------------------
|
---|
563 |
|
---|
564 | =item * the function alternate_language_tags($lang1)
|
---|
565 |
|
---|
566 | This function, if given a language tag, returns all language tags that
|
---|
567 | are alternate forms of this language tag. (I.e., tags which refer to
|
---|
568 | the same language.) This is meant to handle legacy tags caused by
|
---|
569 | the minor changes in language tag standards over the years; and
|
---|
570 | the x-/i- alternation is also dealt with.
|
---|
571 |
|
---|
572 | Note that this function does I<not> try to equate new (and never-used,
|
---|
573 | and unusable)
|
---|
574 | ISO639-2 three-letter tags to old (and still in use) ISO639-1
|
---|
575 | two-letter equivalents -- like "ara" -> "ar" -- because
|
---|
576 | "ara" has I<never> been in use as an Internet language tag,
|
---|
577 | and RFC 3066 stipulates that it never should be, since a shorter
|
---|
578 | tag ("ar") exists.
|
---|
579 |
|
---|
580 | Examples:
|
---|
581 |
|
---|
582 | alternate_language_tags('no-bok') is ('nb')
|
---|
583 | alternate_language_tags('nb') is ('no-bok')
|
---|
584 | alternate_language_tags('he') is ('iw')
|
---|
585 | alternate_language_tags('iw') is ('he')
|
---|
586 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
|
---|
587 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
|
---|
588 | alternate_language_tags('en') is ()
|
---|
589 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
|
---|
590 | alternate_language_tags('x-klikitat') is ('i-klikitat')
|
---|
591 | alternate_language_tags('i-klikitat') is ('x-klikitat')
|
---|
592 |
|
---|
593 | This function returns empty-list if given anything other than a formally
|
---|
594 | valid language tag.
|
---|
595 |
|
---|
596 | =cut
|
---|
597 |
|
---|
598 | my %alt = qw( i x x i I X X I );
|
---|
599 | sub alternate_language_tags {
|
---|
600 | my $tag = $_[0];
|
---|
601 | return() unless &is_language_tag($tag);
|
---|
602 |
|
---|
603 | my @em; # push 'em real goood!
|
---|
604 |
|
---|
605 | # For the moment, these legacy variances are few enough that
|
---|
606 | # we can just handle them here with regexps.
|
---|
607 |
|
---|
608 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
|
---|
609 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
|
---|
610 |
|
---|
611 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
|
---|
612 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
|
---|
613 |
|
---|
614 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
|
---|
615 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
|
---|
616 |
|
---|
617 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
|
---|
618 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
|
---|
619 |
|
---|
620 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
|
---|
621 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
|
---|
622 |
|
---|
623 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
|
---|
624 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
|
---|
625 |
|
---|
626 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
|
---|
627 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
|
---|
628 |
|
---|
629 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
|
---|
630 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
|
---|
631 | }
|
---|
632 |
|
---|
633 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
|
---|
634 | return @em;
|
---|
635 | }
|
---|
636 |
|
---|
637 | ###########################################################################
|
---|
638 |
|
---|
639 | {
|
---|
640 | # Init %Panic...
|
---|
641 |
|
---|
642 | my @panic = ( # MUST all be lowercase!
|
---|
643 | # Only large ("national") languages make it in this list.
|
---|
644 | # If you, as a user, are so bizarre that the /only/ language
|
---|
645 | # you claim to accept is Galician, then no, we won't do you
|
---|
646 | # the favor of providing Catalan as a panic-fallback for
|
---|
647 | # you. Because if I start trying to add "little languages" in
|
---|
648 | # here, I'll just go crazy.
|
---|
649 |
|
---|
650 | # Scandinavian lgs. All based on opinion and hearsay.
|
---|
651 | 'sv' => [qw(nb no da nn)],
|
---|
652 | 'da' => [qw(nb no sv nn)], # I guess
|
---|
653 | [qw(no nn nb)], [qw(no nn nb sv da)],
|
---|
654 | 'is' => [qw(da sv no nb nn)],
|
---|
655 | 'fo' => [qw(da is no nb nn sv)], # I guess
|
---|
656 |
|
---|
657 | # I think this is about the extent of tolerable intelligibility
|
---|
658 | # among large modern Romance languages.
|
---|
659 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
|
---|
660 | 'ca' => [qw(es pt it fr)],
|
---|
661 | 'es' => [qw(ca it fr pt)],
|
---|
662 | 'it' => [qw(es fr ca pt)],
|
---|
663 | 'fr' => [qw(es it ca pt)],
|
---|
664 |
|
---|
665 | # Also assume that speakers of the main Indian languages prefer
|
---|
666 | # to read/hear Hindi over English
|
---|
667 | [qw(
|
---|
668 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
|
---|
669 | )] => 'hi',
|
---|
670 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
|
---|
671 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
|
---|
672 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
|
---|
673 | 'hi' => [qw(bn pa as or)],
|
---|
674 | # I welcome finer data for the other Indian languages.
|
---|
675 | # E.g., what should Oriya's list be, besides just Hindi?
|
---|
676 |
|
---|
677 | # And the panic languages for English is, of course, nil!
|
---|
678 |
|
---|
679 | # My guesses at Slavic intelligibility:
|
---|
680 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
|
---|
681 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
|
---|
682 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
|
---|
683 |
|
---|
684 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
|
---|
685 |
|
---|
686 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
|
---|
687 |
|
---|
688 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
|
---|
689 |
|
---|
690 | );
|
---|
691 | my($k,$v);
|
---|
692 | while(@panic) {
|
---|
693 | ($k,$v) = splice(@panic,0,2);
|
---|
694 | foreach my $k (ref($k) ? @$k : $k) {
|
---|
695 | foreach my $v (ref($v) ? @$v : $v) {
|
---|
696 | push @{$Panic{$k} ||= []}, $v unless $k eq $v;
|
---|
697 | }
|
---|
698 | }
|
---|
699 | }
|
---|
700 | }
|
---|
701 |
|
---|
702 | =item * the function @langs = panic_languages(@accept_languages)
|
---|
703 |
|
---|
704 | This function takes a list of 0 or more language
|
---|
705 | tags that constitute a given user's Accept-Language list, and
|
---|
706 | returns a list of tags for I<other> (non-super)
|
---|
707 | languages that are probably acceptable to the user, to be
|
---|
708 | used I<if all else fails>.
|
---|
709 |
|
---|
710 | For example, if a user accepts only 'ca' (Catalan) and
|
---|
711 | 'es' (Spanish), and the documents/interfaces you have
|
---|
712 | available are just in German, Italian, and Chinese, then
|
---|
713 | the user will most likely want the Italian one (and not
|
---|
714 | the Chinese or German one!), instead of getting
|
---|
715 | nothing. So C<panic_languages('ca', 'es')> returns
|
---|
716 | a list containing 'it' (Italian).
|
---|
717 |
|
---|
718 | English ('en') is I<always> in the return list, but
|
---|
719 | whether it's at the very end or not depends
|
---|
720 | on the input languages. This function works by consulting
|
---|
721 | an internal table that stipulates what common
|
---|
722 | languages are "close" to each other.
|
---|
723 |
|
---|
724 | A useful construct you might consider using is:
|
---|
725 |
|
---|
726 | @fallbacks = super_languages(@accept_languages);
|
---|
727 | push @fallbacks, panic_languages(
|
---|
728 | @accept_languages, @fallbacks,
|
---|
729 | );
|
---|
730 |
|
---|
731 | =cut
|
---|
732 |
|
---|
733 | sub panic_languages {
|
---|
734 | # When in panic or in doubt, run in circles, scream, and shout!
|
---|
735 | my(@out, %seen);
|
---|
736 | foreach my $t (@_) {
|
---|
737 | next unless $t;
|
---|
738 | next if $seen{$t}++; # so we don't return it or hit it again
|
---|
739 | # push @out, super_languages($t); # nah, keep that separate
|
---|
740 | push @out, @{ $Panic{lc $t} || next };
|
---|
741 | }
|
---|
742 | return grep !$seen{$_}++, @out, 'en';
|
---|
743 | }
|
---|
744 |
|
---|
745 | #---------------------------------------------------------------------------
|
---|
746 | #---------------------------------------------------------------------------
|
---|
747 |
|
---|
748 | =item * the function implicate_supers( ...languages... )
|
---|
749 |
|
---|
750 | This takes a list of strings (which are presumed to be language-tags;
|
---|
751 | strings that aren't, are ignored); and after each one, this function
|
---|
752 | inserts super-ordinate forms that don't already appear in the list.
|
---|
753 | The original list, plus these insertions, is returned.
|
---|
754 |
|
---|
755 | In other words, it takes this:
|
---|
756 |
|
---|
757 | pt-br de-DE en-US fr pt-br-janeiro
|
---|
758 |
|
---|
759 | and returns this:
|
---|
760 |
|
---|
761 | pt-br pt de-DE de en-US en fr pt-br-janeiro
|
---|
762 |
|
---|
763 | This function is most useful in the idiom
|
---|
764 |
|
---|
765 | implicate_supers( I18N::LangTags::Detect::detect() );
|
---|
766 |
|
---|
767 | (See L<I18N::LangTags::Detect>.)
|
---|
768 |
|
---|
769 |
|
---|
770 | =item * the function implicate_supers_strictly( ...languages... )
|
---|
771 |
|
---|
772 | This works like C<implicate_supers> except that the implicated
|
---|
773 | forms are added to the end of the return list.
|
---|
774 |
|
---|
775 | In other words, implicate_supers_strictly takes a list of strings
|
---|
776 | (which are presumed to be language-tags; strings that aren't, are
|
---|
777 | ignored) and after the whole given list, it inserts the super-ordinate forms
|
---|
778 | of all given tags, minus any tags that already appear in the input list.
|
---|
779 |
|
---|
780 | In other words, it takes this:
|
---|
781 |
|
---|
782 | pt-br de-DE en-US fr pt-br-janeiro
|
---|
783 |
|
---|
784 | and returns this:
|
---|
785 |
|
---|
786 | pt-br de-DE en-US fr pt-br-janeiro pt de en
|
---|
787 |
|
---|
788 | The reason this function has "_strictly" in its name is that when
|
---|
789 | you're processing an Accept-Language list according to the RFCs, if
|
---|
790 | you interpret the RFCs quite strictly, then you would use
|
---|
791 | implicate_supers_strictly, but for normal use (i.e., common-sense use,
|
---|
792 | as far as I'm concerned) you'd use implicate_supers.
|
---|
793 |
|
---|
794 | =cut
|
---|
795 |
|
---|
796 | sub implicate_supers {
|
---|
797 | my @languages = grep is_language_tag($_), @_;
|
---|
798 | my %seen_encoded;
|
---|
799 | foreach my $lang (@languages) {
|
---|
800 | $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
|
---|
801 | }
|
---|
802 |
|
---|
803 | my(@output_languages);
|
---|
804 | foreach my $lang (@languages) {
|
---|
805 | push @output_languages, $lang;
|
---|
806 | foreach my $s ( I18N::LangTags::super_languages($lang) ) {
|
---|
807 | # Note that super_languages returns the longest first.
|
---|
808 | last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
|
---|
809 | push @output_languages, $s;
|
---|
810 | }
|
---|
811 | }
|
---|
812 | return uniq( @output_languages );
|
---|
813 |
|
---|
814 | }
|
---|
815 |
|
---|
816 | sub implicate_supers_strictly {
|
---|
817 | my @tags = grep is_language_tag($_), @_;
|
---|
818 | return uniq( @_, map super_languages($_), @_ );
|
---|
819 | }
|
---|
820 |
|
---|
821 |
|
---|
822 |
|
---|
823 | ###########################################################################
|
---|
824 | 1;
|
---|
825 | __END__
|
---|
826 |
|
---|
827 | =back
|
---|
828 |
|
---|
829 | =head1 ABOUT LOWERCASING
|
---|
830 |
|
---|
831 | I've considered making all the above functions that output language
|
---|
832 | tags return all those tags strictly in lowercase. Having all your
|
---|
833 | language tags in lowercase does make some things easier. But you
|
---|
834 | might as well just lowercase as you like, or call
|
---|
835 | C<encode_language_tag($lang1)> where appropriate.
|
---|
836 |
|
---|
837 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
|
---|
838 |
|
---|
839 | In some future version of I18N::LangTags, I plan to include support
|
---|
840 | for RFC2482-style language tags -- which are basically just normal
|
---|
841 | language tags with their ASCII characters shifted into Plane 14.
|
---|
842 |
|
---|
843 | =head1 SEE ALSO
|
---|
844 |
|
---|
845 | * L<I18N::LangTags::List|I18N::LangTags::List>
|
---|
846 |
|
---|
847 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
|
---|
848 | Identification of Languages". (Obsoletes RFC 1766)
|
---|
849 |
|
---|
850 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
|
---|
851 | Character Sets and Languages".
|
---|
852 |
|
---|
853 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
|
---|
854 | Value and Encoded Word Extensions: Character Sets, Languages, and
|
---|
855 | Continuations".
|
---|
856 |
|
---|
857 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
|
---|
858 | "Language Tagging in Unicode Plain Text".
|
---|
859 |
|
---|
860 | * Locale::Codes, in
|
---|
861 | C<http://www.perl.com/CPAN/modules/by-module/Locale/>
|
---|
862 |
|
---|
863 | * ISO 639-2, "Codes for the representation of names of languages",
|
---|
864 | including two-letter and three-letter codes,
|
---|
865 | C<http://www.loc.gov/standards/iso639-2/langcodes.html>
|
---|
866 |
|
---|
867 | * The IANA list of registered languages (hopefully up-to-date),
|
---|
868 | C<http://www.iana.org/assignments/language-tags>
|
---|
869 |
|
---|
870 | =head1 COPYRIGHT
|
---|
871 |
|
---|
872 | Copyright (c) 1998+ Sean M. Burke. All rights reserved.
|
---|
873 |
|
---|
874 | This library is free software; you can redistribute it and/or
|
---|
875 | modify it under the same terms as Perl itself.
|
---|
876 |
|
---|
877 | The programs and documentation in this dist are distributed in
|
---|
878 | the hope that they will be useful, but without any warranty; without
|
---|
879 | even the implied warranty of merchantability or fitness for a
|
---|
880 | particular purpose.
|
---|
881 |
|
---|
882 | =head1 AUTHOR
|
---|
883 |
|
---|
884 | Sean M. Burke C<[email protected]>
|
---|
885 |
|
---|
886 | =cut
|
---|
887 |
|
---|