Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: for-distributions/trunk/bin/windows/perl/lib/Unicode/UCD.pm@ 14489

Last change on this file since 14489 was 14489, checked in by oranfry, 17 years ago
upgrading to perl 5.8
File size: 22.3 KB

Line
1	package Unicode::UCD;
2
3	use strict;
4	use warnings;
5
6	our $VERSION = '0.24';
7
8	use Storable qw(dclone);
9
10	require Exporter;
11
12	our @ISA = qw(Exporter);
13
14	our @EXPORT_OK = qw(charinfo
15	charblock charscript
16	charblocks charscripts
17	charinrange
18	compexcl
19	casefold casespec
20	namedseq);
21
22	use Carp;
23
24	=head1 NAME
25
26	Unicode::UCD - Unicode character database
27
28	=head1 SYNOPSIS
29
30	use Unicode::UCD 'charinfo';
31	my $charinfo = charinfo($codepoint);
32
33	use Unicode::UCD 'charblock';
34	my $charblock = charblock($codepoint);
35
36	use Unicode::UCD 'charscript';
37	my $charscript = charscript($codepoint);
38
39	use Unicode::UCD 'charblocks';
40	my $charblocks = charblocks();
41
42	use Unicode::UCD 'charscripts';
43	my $charscripts = charscripts();
44
45	use Unicode::UCD qw(charscript charinrange);
46	my $range = charscript($script);
47	print "looks like $script\n" if charinrange($range, $codepoint);
48
49	use Unicode::UCD 'compexcl';
50	my $compexcl = compexcl($codepoint);
51
52	use Unicode::UCD 'namedseq';
53	my $namedseq = namedseq($named_sequence_name);
54
55	my $unicode_version = Unicode::UCD::UnicodeVersion();
56
57	=head1 DESCRIPTION
58
59	The Unicode::UCD module offers a simple interface to the Unicode
60	Character Database.
61
62	=cut
63
# Filehandles for the unicore data files.  Each one stays undef until
# openunicode() is asked to open the corresponding file.
my (
    $UNICODEFH,     # UnicodeData.txt
    $BLOCKSFH,      # Blocks.txt
    $SCRIPTSFH,     # Scripts.txt
    $VERSIONFH,     # version
    $COMPEXCLFH,    # CompositionExclusions.txt
    $CASEFOLDFH,    # CaseFolding.txt
    $CASESPECFH,    # SpecialCasing.txt
    $NAMEDSEQFH,    # NamedSequences.txt
);
72
# Open a data file that lives in a "unicore" directory somewhere on @INC.
#
#   $rfh  - reference to the scalar that holds (or will receive) the handle
#   @path - path components of the file, relative to the unicore directory
#
# If $$rfh is already defined nothing is opened and undef is returned.
# Otherwise every @INC entry is tried in order; on success $$rfh is a read
# handle on the file and the full path of the file is returned.
# Croaks when the file cannot be opened under any @INC entry.
sub openunicode {
    my ($rfh, @path) = @_;
    use File::Spec;

    my $found;
    unless (defined $$rfh) {
        for my $dir (@INC) {
            my $candidate = File::Spec->catfile($dir, "unicore", @path);

            # Three-argument open: the path is never interpreted as a mode.
            if (open($$rfh, '<', $candidate)) {
                $found = $candidate;
                last;
            }
        }
        unless (defined $found) {
            # A failed open still leaves a glob in $$rfh; clear it so that a
            # later call retries (and reports) instead of silently returning.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }
    return $found;
}
89
90	=head2 charinfo
91
92	use Unicode::UCD 'charinfo';
93
94	my $charinfo = charinfo(0x41);
95
96	charinfo() returns a reference to a hash that has the following fields
97	as defined by the Unicode standard:
98
99	key
100
101	code code point with at least four hexdigits
102	name name of the character IN UPPER CASE
103	category general category of the character
104	combining classes used in the Canonical Ordering Algorithm
105	bidi bidirectional category
106	decomposition character decomposition mapping
107	decimal if decimal digit this is the integer numeric value
108	digit if digit this is the numeric value
109	numeric if numeric is the integer or rational numeric value
110	mirrored if mirrored in bidirectional text
111	unicode10 Unicode 1.0 name if existed and different
112	comment ISO 10646 comment field
113	upper uppercase equivalent mapping
114	lower lowercase equivalent mapping
115	title titlecase equivalent mapping
116
117	block block the character belongs to (used in \p{In...})
118	script script the character belongs to
119
120	If no match is found, C<undef> (or an empty list in list context) is returned.
121
122	The C<block> property is the same as returned by charblock(). It is
123	not defined in the Unicode Character Database proper (Chapter 4 of the
124	Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
125	(Chapter 14 of TUS3). Similarly for the C<script> property.
126
127	Note that you cannot do (de)composition and casing based solely on the
128	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
129	you will need also the compexcl(), casefold(), and casespec() functions.
130
131	=cut
132
# NB: This function is duplicated in charnames.pm
#
# Turn a code point argument into its numeric value.
#
# Accepted forms:
#   - decimal digits that do not start with 0 (returned unchanged);
#   - hexadecimal digits, optionally prefixed with "U+"/"u+" or "0x"/"0X".
#
# Returns the number, or nothing (undef / empty list) when the argument is
# undefined or does not look like a code point.
sub _getcode {
    my $arg = shift;

    return unless defined $arg;

    if ($arg =~ /^[1-9]\d*$/) {
        return $arg;
    }
    elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
        # The prefix alternatives must be a real alternation; an escaped
        # pipe would demand the literal text "U+|0x".
        return hex($1);
    }

    return;
}
145
# Lingua::KO::Hangul::Util is an optional helper: it is not part of the
# standard distribution, but its functions are imported when it loads.

my $hasHangulUtil = do {
    eval { require Lingua::KO::Hangul::Util };
    !$@;
};
Lingua::KO::Hangul::Util->import() if $hasHangulUtil;
154
# internal: called from charinfo
#
# Decompose a Hangul syllable code point with decomposeHangul() and format
# the two or three resulting code points as space-separated hex numbers.
# Returns nothing when the helper module is unavailable or the result has
# any other number of elements.
sub hangul_decomp {
    return unless $hasHangulUtil;

    my $code  = shift;
    my @parts = decomposeHangul($code);
    my $count = @parts;

    return unless $count == 2 || $count == 3;
    return sprintf(join(' ', ('%04X') x $count), @parts);
}
163
# internal: called from charinfo
#
# Fallback name for a Hangul syllable: a fixed prefix followed by the code
# point in upper-case hex, at least four digits wide.
sub hangul_charname {
    my $code = shift;
    my $name = sprintf("HANGUL SYLLABLE-%04X", $code);
    return $name;
}
167
# internal: called from charinfo
#
# Name for a CJK ideograph: a fixed prefix followed by the code point in
# upper-case hex, at least four digits wide.
sub han_charname {
    my $code = shift;
    my $name = sprintf("CJK UNIFIED IDEOGRAPH-%04X", $code);
    return $name;
}
171
# Code point ranges that charinfo() handles specially.  Each row is
#   [ first, last, coderef to name, coderef to decompose ]
# where either coderef may be undef.
my @CharinfoRanges = do {
    my $han_name    = \&han_charname;
    my $hangul_name = $hasHangulUtil ? \&getHangulName : \&hangul_charname;
    my $hangul_dec  = \&hangul_decomp;

    (
        [ 0x3400,   0x4DB5,   $han_name,    undef       ],  # CJK Ideographs Extension A
        [ 0x4E00,   0x9FA5,   $han_name,    undef       ],  # CJK Ideographs
        [ 0xAC00,   0xD7A3,   $hangul_name, $hangul_dec ],  # Hangul Syllables
        [ 0xD800,   0xDB7F,   undef,        undef       ],  # Non-Private Use High Surrogates
        [ 0xDB80,   0xDBFF,   undef,        undef       ],  # Private Use High Surrogates
        [ 0xDC00,   0xDFFF,   undef,        undef       ],  # Low Surrogates
        [ 0xE000,   0xF8FF,   undef,        undef       ],  # The Private Use Area
        [ 0x20000,  0x2A6D6,  $han_name,    undef       ],  # CJK Ideographs Extension B
        [ 0xF0000,  0xFFFFD,  undef,        undef       ],  # Plane 15 Private Use Area
        [ 0x100000, 0x10FFFD, undef,        undef       ],  # Plane 16 Private Use Area
    );
};
196
# Look up a code point in UnicodeData.txt.
#
# Takes a code point argument (see _getcode) and croaks if it cannot be
# parsed.  Returns a reference to a hash of the UnicodeData.txt fields plus
# "block" and "script", or nothing when no matching line is found.
sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::charinfo: unknown code '$arg'";

    my $hexk = sprintf("%06X", $code);
    my ($rcode, $rname, $rdec);

    # Code points inside a special range are looked up via the first code
    # point of that range; code, name and decomposition are computed here.
    for my $range (@CharinfoRanges) {
        my ($first, $last, $namer, $decomposer) = @$range;
        next if $code < $first || $code > $last;

        ($rcode = $hexk) =~ s/^0+//;
        $rcode = sprintf("%04X", hex($rcode));
        $rname = $namer      ? $namer->($code)      : '';
        $rdec  = $decomposer ? $decomposer->($code) : '';
        $hexk  = sprintf("%06X", $first); # replace by the first
        last;
    }

    openunicode(\$UNICODEFH, "UnicodeData.txt");
    return unless defined $UNICODEFH;

    use Search::Dict 1.02;

    # Compare on a six-digit, zero-padded form of each line's code field.
    my $pad_code = sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) };
    return if look($UNICODEFH, "$hexk;", { xfrm => $pad_code }) < 0;

    my $line = <$UNICODEFH>;
    return unless defined $line;
    chomp $line;

    my @fields = qw(
        code name category
        combining bidi decomposition
        decimal digit numeric
        mirrored unicode10 comment
        upper lower title
    );
    my %prop;
    @prop{@fields} = split(/;/, $line, -1);

    # The file stores codes with at least four digits, not six.
    $hexk =~ s/^0+//;
    $hexk = sprintf("%04X", hex($hexk));
    return unless $prop{code} eq $hexk;

    $prop{block}  = charblock($code);
    $prop{script} = charscript($code);
    if (defined $rname) {
        $prop{code}          = $rcode;
        $prop{name}          = $rname;
        $prop{decomposition} = $rdec;
    }
    return \%prop;
}
246
# Binary search in a [[lo,hi,prop],[...],...] table.
#
#   $table    - reference to the table, ordered by range start
#   $lo, $hi  - first and last table index to consider
#   $code     - the code point to look for
#
# Returns the prop of the range containing $code, or nothing if there is
# no such range between the two indices.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid   = int(($lo + $hi) / 2);
        my $start = $table->[$mid]->[0];

        if ($start > $code) {
            # Range begins after the code point: continue in the lower half.
            $hi = $mid - 1;
            next;
        }

        return $table->[$mid]->[2]
            if $start == $code || $table->[$mid]->[1] >= $code;

        # Range ends before the code point: continue in the upper half.
        $lo = $mid + 1;
    }

    return;
}
266
# Test whether a code point argument falls inside a range, i.e. a table of
# [start, end, ...] rows as searched by _search().  Croaks if the argument
# cannot be parsed as a code point; otherwise returns whatever _search()
# returns for it.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::charinrange: unknown code '$arg'";

    my $last_index = $#$range;
    return _search($range, 0, $last_index, $code);
}
274
275	=head2 charblock
276
277	use Unicode::UCD 'charblock';
278
279	my $charblock = charblock(0x41);
280	my $charblock = charblock(1234);
281	my $charblock = charblock("0x263a");
282	my $charblock = charblock("U+263a");
283
284	my $range = charblock('Armenian');
285
286	With a B<code point argument> charblock() returns the I<block> the character
287	belongs to, e.g. C<Basic Latin>. Note that not all the character
288	positions within all blocks are defined.
289
290	See also L</Blocks versus Scripts>.
291
292	If supplied with an argument that can't be a code point, charblock() tries
293	to do the opposite and interpret the argument as a character block. The
294	return value is a I<range>: an anonymous list of lists that contain
295	I<start-of-range>, I<end-of-range> code point pairs. You can test whether
296	a code point is in a range using the L</charinrange> function. If the
297	argument is not a known character block, C<undef> is returned.
298
299	=cut
300
my @BLOCKS;    # every [lo, hi, name] row, in file order
my %BLOCKS;    # block name => list of its [lo, hi, name] rows

# Fill @BLOCKS and %BLOCKS from Blocks.txt.  Does nothing once @BLOCKS has
# content, or when openunicode() does not hand back a freshly opened file.
sub _charblocks {
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    while (my $line = <$BLOCKSFH>) {
        next unless $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;

        my $name     = $3;
        my $subrange = [ hex($1), hex($2), $name ];

        # The same row is shared by the flat list and the by-name index.
        push @BLOCKS, $subrange;
        push @{ $BLOCKS{$name} }, $subrange;
    }
    close($BLOCKSFH);
    return;
}
320
# With a code point argument, return the name of the block containing it
# (or nothing if no block does).  With any other argument, treat it as a
# block name and return a deep copy of that block's range, or nothing if
# the name is unknown.
sub charblock {
    my $arg = shift;

    @BLOCKS or _charblocks();

    my $code = _getcode($arg);
    return _search(\@BLOCKS, 0, $#BLOCKS, $code) if defined $code;

    return unless exists $BLOCKS{$arg};
    return dclone($BLOCKS{$arg});
}
338
339	=head2 charscript
340
341	use Unicode::UCD 'charscript';
342
343	my $charscript = charscript(0x41);
344	my $charscript = charscript(1234);
345	my $charscript = charscript("U+263a");
346
347	my $range = charscript('Thai');
348
349	With a B<code point argument> charscript() returns the I<script> the
350	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
351
352	See also L</Blocks versus Scripts>.
353
354	If supplied with an argument that can't be a code point, charscript() tries
355	to do the opposite and interpret the argument as a character script. The
356	return value is a I<range>: an anonymous list of lists that contain
357	I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
358	code point is in a range using the L</charinrange> function. If the
359	argument is not a known character script, C<undef> is returned.
360
361	=cut
362
my @SCRIPTS;    # every [lo, hi, script] row, sorted by lo after loading
my %SCRIPTS;    # script name => list of its [lo, hi, script] rows

# Fill @SCRIPTS and %SCRIPTS from Scripts.txt.  Does nothing once @SCRIPTS
# has content, or when openunicode() does not hand back a freshly opened
# file.
sub _charscripts {
    return if @SCRIPTS;
    return unless openunicode(\$SCRIPTSFH, "Scripts.txt");

    while (my $line = <$SCRIPTSFH>) {
        next
            unless $line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/;

        # A line may give a single code point instead of a lo..hi pair.
        my $lo = hex($1);
        my $hi = $2 ? hex($2) : $lo;

        # The name consists of word characters only, so capitalising its
        # first character is all the normalisation there is to do.
        my $script = ucfirst(lc($3));

        my $subrange = [ $lo, $hi, $script ];
        push @SCRIPTS, $subrange;
        push @{ $SCRIPTS{$script} }, $subrange;
    }
    close($SCRIPTSFH);

    # _search() needs the rows ordered by range start.
    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
    return;
}
385
# With a code point argument, return the name of the script containing it
# (or nothing if no script does).  With any other argument, treat it as a
# script name and return a deep copy of that script's range, or nothing if
# the name is unknown.
sub charscript {
    my $arg = shift;

    @SCRIPTS or _charscripts();

    my $code = _getcode($arg);
    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code) if defined $code;

    return unless exists $SCRIPTS{$arg};
    return dclone($SCRIPTS{$arg});
}
403
404	=head2 charblocks
405
406	use Unicode::UCD 'charblocks';
407
408	my $charblocks = charblocks();
409
410	charblocks() returns a reference to a hash with the known block names
411	as the keys, and the code point ranges (see L</charblock>) as the values.
412
413	See also L</Blocks versus Scripts>.
414
415	=cut
416
# Return a deep copy of the block-name => ranges table, loading it first
# if it is still empty.
sub charblocks {
    %BLOCKS or _charblocks();
    my $copy = dclone(\%BLOCKS);
    return $copy;
}
421
422	=head2 charscripts
423
424	use Unicode::UCD 'charscripts';
425
426	my $charscripts = charscripts();
427
428	charscripts() returns a reference to a hash with the known script names
429	as the keys, and the code point ranges (see L</charscript>) as the values.
430
431	See also L</Blocks versus Scripts>.
432
433	=cut
434
# Return a deep copy of the script-name => ranges table (as a hash
# reference), loading it first if it is still empty.
sub charscripts {
    %SCRIPTS or _charscripts();
    my $copy = dclone(\%SCRIPTS);
    return $copy;
}
439
440	=head2 Blocks versus Scripts
441
442	The difference between a block and a script is that scripts are closer
443	to the linguistic notion of a set of characters required to present
444	languages, while block is more of an artifact of the Unicode character
445	numbering and separation into blocks of (mostly) 256 characters.
446
447	For example the Latin B<script> is spread over several B<blocks>, such
448	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
449	C<Latin Extended-B>. On the other hand, the Latin script does not
450	contain all the characters of the C<Basic Latin> block (also known as
451	the ASCII): it includes only the letters, and not, for example, the digits
452	or the punctuation.
453
454	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
455
456	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
457
458	=head2 Matching Scripts and Blocks
459
460	Scripts are matched with the regular-expression construct
461	C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
462	while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
463	any of the 256 code points in the Tibetan block).
464
465	=head2 Code Point Arguments
466
467	A I<code point argument> is either a decimal or a hexadecimal scalar
468	designating a Unicode character, or C<U+> followed by hexadecimals
469	designating a Unicode character. In other words, if you want a code
470	point to be interpreted as a hexadecimal number, you must prefix it
471	with either C<0x> or C<U+>, because a string like e.g. C<123> will
472	be interpreted as a decimal code point. Also note that Unicode is
473	B<not> limited to 16 bits (the number of Unicode characters is
474	open-ended, in theory unlimited): you may have more than 4 hexdigits.
475
476	=head2 charinrange
477
478	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
479	can also test whether a code point is in the I<range> as returned by
480	L</charblock> and L</charscript> or as the values of the hash returned
481	by L</charblocks> and L</charscripts> by using charinrange():
482
483	use Unicode::UCD qw(charscript charinrange);
484
485	$range = charscript('Hiragana');
486	print "looks like hiragana\n" if charinrange($range, $codepoint);
487
488	=cut
489
490	=head2 compexcl
491
492	use Unicode::UCD 'compexcl';
493
494	my $compexcl = compexcl("09dc");
495
496	The compexcl() returns the composition exclusion (that is, if the
497	character should not be produced during a precomposition) of the
498	character specified by a B<code point argument>.
499
500	If there is a composition exclusion for the character, true is
501	returned. Otherwise, false is returned.
502
503	=cut
504
my %COMPEXCL;    # numeric code point => undef; only key existence matters

# Fill %COMPEXCL from CompositionExclusions.txt.  Does nothing once the
# table has content, or when openunicode() does not hand back a freshly
# opened file.
sub _compexcl {
    return if %COMPEXCL;
    return unless openunicode(\$COMPEXCLFH, "CompositionExclusions.txt");

    while (my $line = <$COMPEXCLFH>) {
        next unless $line =~ /^([0-9A-F]+)\s+\#\s+/;
        $COMPEXCL{ hex($1) } = undef;
    }
    close($COMPEXCLFH);
    return;
}
521
# Report whether a code point is listed in the composition exclusion
# table.  Croaks if the argument cannot be parsed as a code point.
sub compexcl {
    my $arg  = shift;
    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::compexcl: unknown code '$arg'";

    %COMPEXCL or _compexcl();

    my $excluded = exists $COMPEXCL{$code};
    return $excluded;
}
532
533	=head2 casefold
534
535	use Unicode::UCD 'casefold';
536
537	my $casefold = casefold("00DF");
538
539	The casefold() returns the locale-independent case folding of the
540	character specified by a B<code point argument>.
541
542	If there is a case folding for that character, a reference to a hash
543	with the following fields is returned:
544
545	key
546
547	code code point with at least four hexdigits
548	status "C", "F", "S", or "I"
549	mapping one or more codes separated by spaces
550
551	The meaning of the I<status> is as follows:
552
553	C common case folding, common mappings shared
554	by both simple and full mappings
555	F full case folding, mappings that cause strings
556	to grow in length. Multiple characters are separated
557	by spaces
558	S simple case folding, mappings to single characters
559	where different from F
560	I special case for dotted uppercase I and
561	dotless lowercase i
562	- If this mapping is included, the result is
563	case-insensitive, but dotless and dotted I's
564	are not distinguished
565	- If this mapping is excluded, the result is not
566	fully case-insensitive, but dotless and dotted
567	I's are distinguished
568
569	If there is no case folding for that character, C<undef> is returned.
570
571	For more information about case mappings see
572	http://www.unicode.org/unicode/reports/tr21/
573
574	=cut
575
my %CASEFOLD;    # numeric code point => { code, status, mapping }

# Fill %CASEFOLD from CaseFolding.txt.  Does nothing once the table has
# content, or when openunicode() does not hand back a freshly opened file.
# A later line for the same code point replaces the earlier entry.
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

    while (my $line = <$CASEFOLDFH>) {
        next
            unless $line =~ /^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/;

        my ($hexcode, $status, $mapping) = ($1, $2, $3);
        $CASEFOLD{ hex($hexcode) } = {
            code    => $hexcode,
            status  => $status,
            mapping => $mapping,
        };
    }
    close($CASEFOLDFH);
    return;
}
594
# Return the case folding entry for a code point argument.
#
# Croaks if the argument cannot be parsed as a code point.  Returns undef
# when there is no entry; otherwise a reference to a hash with the keys
# "code", "status" and "mapping".
sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
        unless defined $code;

    _casefold() unless %CASEFOLD;

    my $entry = $CASEFOLD{$code};
    return $entry unless ref $entry;

    # Hand out a copy so callers cannot modify the cached table, in line
    # with casespec(), charblock() and charscript().  The values are plain
    # strings, so a shallow copy is a complete one.
    return { %$entry };
}
605
606	=head2 casespec
607
608	use Unicode::UCD 'casespec';
609
610	my $casespec = casespec("FB00");
611
612	The casespec() returns the potentially locale-dependent case mapping
613	of the character specified by a B<code point argument>. The mapping
614	may change the length of the string (which the basic Unicode case
615	mappings as returned by charinfo() never do).
616
617	If there is a special case mapping for that character, a reference to a hash
618	with the following fields is returned:
619
620	key
621
622	code code point with at least four hexdigits
623	lower lowercase
624	title titlecase
625	upper uppercase
626	condition condition list (may be undef)
627
628	The C<condition> is optional. Where present, it consists of one or
629	more I<locales> or I<contexts>, separated by spaces (other than as
630	used to separate elements, spaces are to be ignored). A condition
631	list overrides the normal behavior if all of the listed conditions are
632	true. Case distinctions in the condition list are not significant.
633	Conditions preceded by "NON_" represent the negation of the condition.
634
635	Note that when there are multiple case folding definitions for a
636	single code point because of different locales, the value returned by
637	casespec() is a hash reference which has the locales as the keys and
638	hash references as described above as the values.
639
640	A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
641	followed by a "_" and a 2-letter ISO language code (possibly followed
642	by a "_" and a variant code). You can find the lists of those codes,
643	see L<Locale::Country> and L<Locale::Language>.
644
645	A I<context> is one of the following choices:
646
647	FINAL The letter is not followed by a letter of
648	general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
649	MODERN The mapping is only used for modern text
650	AFTER_i The last base character was "i" (U+0069)
651
652	For more information about case mappings see
653	http://www.unicode.org/unicode/reports/tr21/
654
655	=cut
656
my %CASESPEC;    # numeric code point => entry, or => { locale => entry }

# Fill %CASESPEC from SpecialCasing.txt.  Does nothing once the table has
# content, or when openunicode() does not hand back a freshly opened file.
#
# An entry is { code, lower, title, upper, condition }.  The first line
# seen for a code point is stored as a plain entry.  When a further line
# for the same code point turns up, the entries are keyed by the locale
# that starts their condition instead.
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecialCasing.txt");

    # Each mapping field holds ONE OR MORE space-separated code points (or
    # is empty), and the condition holds one or more words.  The "*"
    # quantifiers are essential: without them only fields with exactly two
    # items would be accepted.
    my $codes  = qr/[0-9A-F]+(?: [0-9A-F]+)*/;
    my $locale = qr/^([a-z][a-z](?:_\S+)?)/;

    while (my $line = <$CASESPECFH>) {
        next
            unless $line =~ /^([0-9A-F]+); ($codes)?; ($codes)?; ($codes)?; (\w+(?: \w+)*)?/;

        my ($hexcode, $lower, $title, $upper, $condition) =
            ($1, $2, $3, $4, $5);
        my $code  = hex($hexcode);
        my $entry = {
            code      => $hexcode,
            lower     => $lower,
            title     => $title,
            upper     => $upper,
            condition => $condition,
        };

        unless (exists $CASESPEC{$code}) {
            $CASESPEC{$code} = $entry;
            next;
        }

        # A plain entry is recognisable by its "code" key.  If it carries a
        # condition, move it under the locale named by that condition.
        if (exists $CASESPEC{$code}->{code}) {
            my ($oldlower, $oldtitle, $oldupper, $oldcondition) =
                @{ $CASESPEC{$code} }{qw(lower title upper condition)};
            if (defined $oldcondition) {
                my ($oldlocale) = ($oldcondition =~ $locale);
                delete $CASESPEC{$code};
                $CASESPEC{$code}->{$oldlocale} = {
                    code      => $hexcode,
                    lower     => $oldlower,
                    title     => $oldtitle,
                    upper     => $oldupper,
                    condition => $oldcondition,
                };
            }
        }

        my ($newlocale) = ($condition =~ $locale);
        $CASESPEC{$code}->{$newlocale} = $entry;
    }
    close($CASESPECFH);
    return;
}
712
# Return the special casing entry for a code point argument.
#
# Croaks if the argument cannot be parsed as a code point.  A stored
# reference is returned as a deep copy; anything else (i.e. no entry) is
# returned as is.
sub casespec {
    my $arg  = shift;
    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::casespec: unknown code '$arg'";

    %CASESPEC or _casespec();

    my $entry = $CASESPEC{$code};
    return $entry unless ref $entry;
    return dclone($entry);
}
723
724	=head2 namedseq()
725
726	use Unicode::UCD 'namedseq';
727
728	my $namedseq = namedseq("KATAKANA LETTER AINU P");
729	my @namedseq = namedseq("KATAKANA LETTER AINU P");
730	my %namedseq = namedseq();
731
732	If used with a single argument in a scalar context, returns the string
733	consisting of the code points of the named sequence, or C<undef> if no
734	named sequence by that name exists. If used with a single argument in
735	a list context, returns list of the code points. If used with no
736	arguments in a list context, returns a hash with the names of the
737	named sequences as the keys and the named sequences as strings as
738	the values. Otherwise, returns C<undef> or empty list depending
739	on the context.
740
741	(New from Unicode 4.1.0)
742
743	=cut
744
my %NAMEDSEQ;    # sequence name => string made of the sequence's characters

# Fill %NAMEDSEQ from NamedSequences.txt.  Does nothing once the table has
# content, or when openunicode() does not hand back a freshly opened file.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "NamedSequences.txt");

    while (my $line = <$NAMEDSEQFH>) {
        next unless $line =~ /^(.+)\s;\s([0-9A-F]+(?: [0-9A-F]+)*)$/;

        my ($name, $hexlist) = ($1, $2);

        # Turn every hex code point into its character and glue them up.
        my $sequence = '';
        $sequence .= chr(hex($_)) for split(' ', $hexlist);
        $NAMEDSEQ{$name} = $sequence;
    }
    close($NAMEDSEQFH);
    return;
}
762
# Look up named sequences.
#
#   one argument, scalar context - the sequence as a string (undef if the
#                                  name is unknown)
#   one argument, list context   - the code points of the sequence (empty
#                                  list if the name is unknown)
#   no arguments, list context   - the whole name => string table
#   anything else                - nothing
sub namedseq {
    %NAMEDSEQ or _namedseq();

    my $wantarray = wantarray();
    return unless defined $wantarray;    # void context

    if (@_ == 1) {
        my $s = $NAMEDSEQ{ $_[0] };
        return $s unless $wantarray;
        return defined $s ? map { ord($_) } split('', $s) : ();
    }

    return %NAMEDSEQ if $wantarray && @_ == 0;
    return;
}
780
781	=head2 Unicode::UCD::UnicodeVersion
782
783	Unicode::UCD::UnicodeVersion() returns the version of the Unicode
784	Character Database, in other words, the version of the Unicode
785	standard the database implements. The version is a string
786	of numbers delimited by dots (C<'.'>).
787
788	=cut
789
my $UNICODEVERSION;    # cached once a well-formed version has been read

# Return the version string stored in the unicore "version" file.
#
# The first line of the file is read on the first call and cached.
# Croaks if that line is not a dot-separated list of at least two numbers;
# a bad value is never cached, so every later call reports it again.
sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
        openunicode(\$VERSIONFH, "version");
        my $version = <$VERSIONFH>;
        close($VERSIONFH);

        # Drop the handle so that a call after a failure reopens the file
        # rather than reading from a closed handle.
        undef $VERSIONFH;

        # An empty file yields undef; treat it as an empty version string.
        $version = '' unless defined $version;
        chomp $version;

        croak __PACKAGE__, "::VERSION: strange version '$version'"
            unless $version =~ /^\d+(?:\.\d+)+$/;

        $UNICODEVERSION = $version;
    }
    return $UNICODEVERSION;
}
802
803	=head2 Implementation Note
804
805	The first use of charinfo() opens a read-only filehandle to the Unicode
806	Character Database (the database is included in the Perl distribution).
807	The filehandle is then kept open for further queries. In other words,
808	if you are wondering where one of your filehandles went, that's where.
809
810	=head1 BUGS
811
812	Does not yet support EBCDIC platforms.
813
814	=head1 AUTHOR
815
816	Jarkko Hietaniemi
817
818	=cut
819
820	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: