1 | package Unicode::Normalize;
|
---|
2 |
|
---|
BEGIN {
    # Compile-time sanity check: packing code point 0x41 with the 'U'
    # template must give the one-character string "A".  If it does not,
    # abort loading the module right away.
    if (pack('U', 0x41) ne "A") {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
|
---|
8 |
|
---|
9 | use 5.006;
|
---|
10 | use strict;
|
---|
11 | use warnings;
|
---|
12 | use Carp;
|
---|
13 |
|
---|
14 | no warnings 'utf8';
|
---|
15 |
|
---|
16 | our $VERSION = '0.32';
|
---|
17 | our $PACKAGE = __PACKAGE__;
|
---|
18 |
|
---|
19 | require Exporter;
|
---|
20 | require DynaLoader;
|
---|
21 |
|
---|
# Base classes: Exporter (symbol export) and DynaLoader (bootstrap).
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported only when explicitly requested.
our @EXPORT_OK = (
    # generic normalization, decomposition and composition
    qw( normalize decompose reorder compose ),
    # quick checks
    qw( checkNFD checkNFKD checkNFC checkNFKC check ),
    # character data
    qw( getCanon getCompat getComposite getCombinClass ),
    qw( isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex ),
    qw( isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE ),
    # FCD / FCC
    qw( FCD checkFCD FCC checkFCC composeContiguous ),
    qw( splitOnLastStarter ),
);

# Import tags (use Unicode::Normalize ':tag').
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, 'normalize', 'decompose', 'reorder', 'compose' ],
    check     => [ 'checkNFD', 'checkNFKD', 'checkNFC', 'checkNFKC', 'check' ],
    fast      => [ 'FCD', 'checkFCD', 'FCC', 'checkFCC', 'composeContiguous' ],
);
|
---|
39 |
|
---|
40 | ######
|
---|
41 |
|
---|
# Call the bootstrap class method (inherited through @ISA, which lists
# DynaLoader) on this package, passing the module version.
Unicode::Normalize->bootstrap($VERSION);
|
---|
43 |
|
---|
44 | ######
|
---|
45 |
|
---|
# pack_U(@codepoints)
# Returns the string built by packing every argument with the 'U*'
# template, i.e. one character per given code point.
sub pack_U {
    my @codepoints = @_;
    return pack('U*', @codepoints);
}
|
---|
49 |
|
---|
# unpack_U($string)
# Returns the list of values obtained by unpacking $string with the 'U*'
# template (its code points).
sub unpack_U {
    my $string = shift;

    # The argument is concatenated onto the result of an argument-less
    # pack('U*') before unpacking.
    # NOTE(review): presumably that empty prefix marks the combined string
    # as character data so unpack yields code points, not bytes - confirm
    # against perlfunc before touching this line.
    return unpack('U*', pack('U*') . $string);
}
|
---|
53 |
|
---|
54 |
|
---|
55 | ##
|
---|
56 | ## normalization forms
|
---|
57 | ##
|
---|
58 |
|
---|
# Second argument for decompose(): request the compatibility mapping.
use constant COMPAT => 1;

# NFD($string): decompose (canonical mapping), then reorder.
sub NFD ($) {
    return reorder(decompose($_[0]));
}

# NFKD($string): decompose (compatibility mapping), then reorder.
sub NFKD ($) {
    return reorder(decompose($_[0], COMPAT));
}

# NFC($string): same steps as NFD, followed by compose.
sub NFC ($) {
    return compose(reorder(decompose($_[0])));
}

# NFKC($string): same steps as NFKD, followed by compose.
sub NFKC ($) {
    return compose(reorder(decompose($_[0], COMPAT)));
}
|
---|
65 |
|
---|
# FCD($string)
# Returns $string unchanged when checkFCD() accepts it; otherwise
# returns NFD($string).
sub FCD ($) {
    my ($str) = @_;

    return $str if checkFCD($str);
    return NFD($str);
}
|
---|
# FCC($string): decompose (canonical mapping), reorder, then
# composeContiguous.
sub FCC ($) {
    return composeContiguous(reorder(decompose($_[0])));
}
|
---|
71 |
|
---|
# Dispatch table used by normalize(): every accepted form name (long and
# short spelling) mapped to the function that produces that form.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);

# normalize($form, $str)
# Looks $form up in %formNorm and returns the result of calling the
# registered function with $str.
# Croaks with "invalid form name" when $form is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::normalize: invalid form name: $form"
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
|
---|
88 |
|
---|
89 |
|
---|
90 | ##
|
---|
91 | ## quick check
|
---|
92 | ##
|
---|
93 |
|
---|
# Dispatch table used by check(): every accepted form name (long and
# short spelling) mapped to the corresponding quick-check function.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);

# check($form, $str)
# Looks $form up in %formCheck and returns the result of calling the
# registered quick-check function with $str.
# Croaks with "invalid form name" when $form is not a key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::check: invalid form name: $form"
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
|
---|
110 |
|
---|
111 | 1;
|
---|
112 | __END__
|
---|
113 |
|
---|
114 | =head1 NAME
|
---|
115 |
|
---|
116 | Unicode::Normalize - Unicode Normalization Forms
|
---|
117 |
|
---|
118 | =head1 SYNOPSIS
|
---|
119 |
|
---|
120 | (1) using function names exported by default:
|
---|
121 |
|
---|
122 | use Unicode::Normalize;
|
---|
123 |
|
---|
124 | $NFD_string = NFD($string); # Normalization Form D
|
---|
125 | $NFC_string = NFC($string); # Normalization Form C
|
---|
126 | $NFKD_string = NFKD($string); # Normalization Form KD
|
---|
127 | $NFKC_string = NFKC($string); # Normalization Form KC
|
---|
128 |
|
---|
129 | (2) using function names exported on request:
|
---|
130 |
|
---|
131 | use Unicode::Normalize 'normalize';
|
---|
132 |
|
---|
133 | $NFD_string = normalize('D', $string); # Normalization Form D
|
---|
134 | $NFC_string = normalize('C', $string); # Normalization Form C
|
---|
135 | $NFKD_string = normalize('KD', $string); # Normalization Form KD
|
---|
136 | $NFKC_string = normalize('KC', $string); # Normalization Form KC
|
---|
137 |
|
---|
138 | =head1 DESCRIPTION
|
---|
139 |
|
---|
140 | Parameters:
|
---|
141 |
|
---|
142 | C<$string> is used as a string under character semantics
|
---|
143 | (see F<perlunicode>).
|
---|
144 |
|
---|
145 | C<$codepoint> should be an unsigned integer
|
---|
146 | representing a Unicode code point.
|
---|
147 |
|
---|
148 | Note: Between XSUB and pure Perl, there is an incompatibility
|
---|
149 | about the interpretation of C<$codepoint> as a decimal number.
|
---|
150 | XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
|
---|
151 | Do not use a floating point nor a negative sign in C<$codepoint>.
|
---|
152 |
|
---|
153 | =head2 Normalization Forms
|
---|
154 |
|
---|
155 | =over 4
|
---|
156 |
|
---|
157 | =item C<$NFD_string = NFD($string)>
|
---|
158 |
|
---|
159 | returns the Normalization Form D (formed by canonical decomposition).
|
---|
160 |
|
---|
161 | =item C<$NFC_string = NFC($string)>
|
---|
162 |
|
---|
163 | returns the Normalization Form C (formed by canonical decomposition
|
---|
164 | followed by canonical composition).
|
---|
165 |
|
---|
166 | =item C<$NFKD_string = NFKD($string)>
|
---|
167 |
|
---|
168 | returns the Normalization Form KD (formed by compatibility decomposition).
|
---|
169 |
|
---|
170 | =item C<$NFKC_string = NFKC($string)>
|
---|
171 |
|
---|
172 | returns the Normalization Form KC (formed by compatibility decomposition
|
---|
173 | followed by B<canonical> composition).
|
---|
174 |
|
---|
175 | =item C<$FCD_string = FCD($string)>
|
---|
176 |
|
---|
177 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
|
---|
178 | returns it without modification; otherwise returns an FCD string.
|
---|
179 |
|
---|
180 | Note: FCD is not always unique, so multiple forms may be equivalent
|
---|
181 | to each other. C<FCD()> will return one of these equivalent forms.
|
---|
182 |
|
---|
183 | =item C<$FCC_string = FCC($string)>
|
---|
184 |
|
---|
185 | returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
|
---|
186 |
|
---|
187 | Note: FCC is unique, as are the four normalization forms (NF*).
|
---|
188 |
|
---|
189 | =item C<$normalized_string = normalize($form_name, $string)>
|
---|
190 |
|
---|
191 | As C<$form_name>, one of the following names must be given.
|
---|
192 |
|
---|
193 | 'C' or 'NFC' for Normalization Form C (UAX #15)
|
---|
194 | 'D' or 'NFD' for Normalization Form D (UAX #15)
|
---|
195 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
|
---|
196 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
|
---|
197 |
|
---|
198 | 'FCD' for "Fast C or D" Form (UTN #5)
|
---|
199 | 'FCC' for "Fast C Contiguous" (UTN #5)
|
---|
200 |
|
---|
201 | =back
|
---|
202 |
|
---|
203 | =head2 Decomposition and Composition
|
---|
204 |
|
---|
205 | =over 4
|
---|
206 |
|
---|
207 | =item C<$decomposed_string = decompose($string)>
|
---|
208 |
|
---|
209 | =item C<$decomposed_string = decompose($string, $useCompatMapping)>
|
---|
210 |
|
---|
211 | Decomposes the specified string and returns the result.
|
---|
212 |
|
---|
213 | If the second parameter (a boolean) is omitted or false, decomposes it
|
---|
214 | using the Canonical Decomposition Mapping.
|
---|
215 | If true, decomposes it using the Compatibility Decomposition Mapping.
|
---|
216 |
|
---|
217 | The string returned is not always in NFD/NFKD.
|
---|
218 | Reordering may be required.
|
---|
219 |
|
---|
220 | $NFD_string = reorder(decompose($string)); # eq. to NFD()
|
---|
221 | $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
|
---|
222 |
|
---|
223 | =item C<$reordered_string = reorder($string)>
|
---|
224 |
|
---|
225 | Reorders the combining characters and the like in the canonical ordering
|
---|
226 | and returns the result.
|
---|
227 |
|
---|
228 | E.g., when you have a list of NFD/NFKD strings,
|
---|
229 | you can get the concatenated NFD/NFKD string from them, saying
|
---|
230 |
|
---|
231 | $concat_NFD = reorder(join '', @NFD_strings);
|
---|
232 | $concat_NFKD = reorder(join '', @NFKD_strings);
|
---|
233 |
|
---|
234 | =item C<$composed_string = compose($string)>
|
---|
235 |
|
---|
236 | Returns the string where composable pairs are composed.
|
---|
237 |
|
---|
238 | E.g., when you have a NFD/NFKD string,
|
---|
239 | you can get its NFC/NFKC string, saying
|
---|
240 |
|
---|
241 | $NFC_string = compose($NFD_string);
|
---|
242 | $NFKC_string = compose($NFKD_string);
|
---|
243 |
|
---|
244 | =back
|
---|
245 |
|
---|
246 | =head2 Quick Check
|
---|
247 |
|
---|
248 | (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
|
---|
249 |
|
---|
250 | The following functions check whether the string is in that normalization form.
|
---|
251 |
|
---|
252 | The result returned will be:
|
---|
253 |
|
---|
254 | YES The string is in that normalization form.
|
---|
255 | NO The string is not in that normalization form.
|
---|
256 | MAYBE Dubious. Maybe yes, maybe no.
|
---|
257 |
|
---|
258 | =over 4
|
---|
259 |
|
---|
260 | =item C<$result = checkNFD($string)>
|
---|
261 |
|
---|
262 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
|
---|
263 |
|
---|
264 | =item C<$result = checkNFC($string)>
|
---|
265 |
|
---|
266 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
|
---|
267 | C<undef> if C<MAYBE>.
|
---|
268 |
|
---|
269 | =item C<$result = checkNFKD($string)>
|
---|
270 |
|
---|
271 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
|
---|
272 |
|
---|
273 | =item C<$result = checkNFKC($string)>
|
---|
274 |
|
---|
275 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
|
---|
276 | C<undef> if C<MAYBE>.
|
---|
277 |
|
---|
278 | =item C<$result = checkFCD($string)>
|
---|
279 |
|
---|
280 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
|
---|
281 |
|
---|
282 | =item C<$result = checkFCC($string)>
|
---|
283 |
|
---|
284 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
|
---|
285 | C<undef> if C<MAYBE>.
|
---|
286 |
|
---|
287 | If a string is not in FCD, it must not be in FCC.
|
---|
288 | So C<checkFCC($not_FCD_string)> should return C<NO>.
|
---|
289 |
|
---|
290 | =item C<$result = check($form_name, $string)>
|
---|
291 |
|
---|
292 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
|
---|
293 | C<undef> if C<MAYBE>.
|
---|
294 |
|
---|
295 | As C<$form_name>, one of the following names must be given.
|
---|
296 |
|
---|
297 | 'C' or 'NFC' for Normalization Form C (UAX #15)
|
---|
298 | 'D' or 'NFD' for Normalization Form D (UAX #15)
|
---|
299 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
|
---|
300 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
|
---|
301 |
|
---|
302 | 'FCD' for "Fast C or D" Form (UTN #5)
|
---|
303 | 'FCC' for "Fast C Contiguous" (UTN #5)
|
---|
304 |
|
---|
305 | =back
|
---|
306 |
|
---|
307 | B<Note>
|
---|
308 |
|
---|
309 | In the cases of NFD, NFKD, and FCD, the answer must be
|
---|
310 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned
|
---|
311 | in the cases of NFC, NFKC, and FCC.
|
---|
312 |
|
---|
313 | A C<MAYBE> string should contain at least one combining character
|
---|
314 | or the like. For example, C<COMBINING ACUTE ACCENT> has
|
---|
315 | the MAYBE_NFC/MAYBE_NFKC property.
|
---|
316 |
|
---|
317 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
|
---|
318 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
|
---|
319 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
|
---|
320 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
|
---|
321 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
|
---|
322 |
|
---|
323 | If you want to check exactly, compare the string with its NFC/NFKC/FCC.
|
---|
324 |
|
---|
325 | if ($string eq NFC($string)) {
|
---|
326 | # $string is exactly normalized in NFC;
|
---|
327 | } else {
|
---|
328 | # $string is not normalized in NFC;
|
---|
329 | }
|
---|
330 |
|
---|
331 | if ($string eq NFKC($string)) {
|
---|
332 | # $string is exactly normalized in NFKC;
|
---|
333 | } else {
|
---|
334 | # $string is not normalized in NFKC;
|
---|
335 | }
|
---|
336 |
|
---|
337 | =head2 Character Data
|
---|
338 |
|
---|
339 | These functions are an interface to the character data used internally.
|
---|
340 | If you only want to get Unicode normalization forms, you don't need
|
---|
341 | to call them yourself.
|
---|
342 |
|
---|
343 | =over 4
|
---|
344 |
|
---|
345 | =item C<$canonical_decomposed = getCanon($codepoint)>
|
---|
346 |
|
---|
347 | If the character of the specified codepoint is canonically
|
---|
348 | decomposable (including Hangul Syllables),
|
---|
349 | returns the B<completely decomposed> string canonically equivalent to it.
|
---|
350 |
|
---|
351 | If it is not decomposable, returns C<undef>.
|
---|
352 |
|
---|
353 | =item C<$compatibility_decomposed = getCompat($codepoint)>
|
---|
354 |
|
---|
355 | If the character of the specified codepoint is compatibility
|
---|
356 | decomposable (including Hangul Syllables),
|
---|
357 | returns the B<completely decomposed> string compatibility equivalent to it.
|
---|
358 |
|
---|
359 | If it is not decomposable, returns C<undef>.
|
---|
360 |
|
---|
361 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
|
---|
362 |
|
---|
363 | If two characters here and next (as codepoints) are composable
|
---|
364 | (including Hangul Jamo/Syllables and Composition Exclusions),
|
---|
365 | returns the codepoint of the composite.
|
---|
366 |
|
---|
367 | If they are not composable, returns C<undef>.
|
---|
368 |
|
---|
369 | =item C<$combining_class = getCombinClass($codepoint)>
|
---|
370 |
|
---|
371 | Returns the combining class of the character as an integer.
|
---|
372 |
|
---|
373 | =item C<$is_exclusion = isExclusion($codepoint)>
|
---|
374 |
|
---|
375 | Returns a boolean indicating whether the character of the specified codepoint
|
---|
376 | is a composition exclusion.
|
---|
377 |
|
---|
378 | =item C<$is_singleton = isSingleton($codepoint)>
|
---|
379 |
|
---|
380 | Returns a boolean indicating whether the character of the specified codepoint is
|
---|
381 | a singleton.
|
---|
382 |
|
---|
383 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
|
---|
384 |
|
---|
385 | Returns a boolean indicating whether the canonical decomposition
|
---|
386 | of the character of the specified codepoint
|
---|
387 | is a Non-Starter Decomposition.
|
---|
388 |
|
---|
389 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
|
---|
390 |
|
---|
391 | Returns a boolean indicating whether the character of the specified codepoint
|
---|
392 | may be composed with the previous one in a certain composition
|
---|
393 | (including Hangul Compositions, but excluding
|
---|
394 | Composition Exclusions and Non-Starter Decompositions).
|
---|
395 |
|
---|
396 | =back
|
---|
397 |
|
---|
398 | =head1 EXPORT
|
---|
399 |
|
---|
400 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
|
---|
401 |
|
---|
402 | C<normalize> and some other functions: on request.
|
---|
403 |
|
---|
404 | =head1 CAVEATS
|
---|
405 |
|
---|
406 | =over 4
|
---|
407 |
|
---|
408 | =item Perl's version vs. Unicode version
|
---|
409 |
|
---|
410 | Since this module refers to perl core's Unicode database in the directory
|
---|
411 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
|
---|
412 | normalization implemented by this module depends on your perl's version.
|
---|
413 |
|
---|
414 | perl's version implemented Unicode version
|
---|
415 | 5.6.1 3.0.1
|
---|
416 | 5.7.2 3.1.0
|
---|
417 | 5.7.3 3.1.1 (same normalized form as that of 3.1.0)
|
---|
418 | 5.8.0 3.2.0
|
---|
419 | 5.8.1-5.8.3 4.0.0
|
---|
420 | 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0)
|
---|
421 |
|
---|
422 | =item Correction of decomposition mapping
|
---|
423 |
|
---|
424 | In older Unicode versions, a small number of characters (all of which are
|
---|
425 | CJK compatibility ideographs as far as they have been found) may have
|
---|
426 | an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
|
---|
427 | Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
|
---|
428 | nor provide any specific version of normalization. Therefore this module
|
---|
429 | running on an older perl with an older Unicode database may use
|
---|
430 | the erroneous decomposition mapping blindly conforming to the Unicode database.
|
---|
431 |
|
---|
432 | =item Revised definition of canonical composition
|
---|
433 |
|
---|
434 | In Unicode 4.1.0, the definition D2 of canonical composition (which
|
---|
435 | affects NFC and NFKC) has been changed (see Public Review Issue #29
|
---|
436 | and recent UAX #15). This module has used the newer definition
|
---|
437 | since the version 0.07 (Oct 31, 2001).
|
---|
438 | This module does not support normalization according to the older
|
---|
439 | definition, even if the Unicode version implemented by perl is
|
---|
440 | lower than 4.1.0.
|
---|
441 |
|
---|
442 | =back
|
---|
443 |
|
---|
444 | =head1 AUTHOR
|
---|
445 |
|
---|
446 | SADAHIRO Tomoyuki <[email protected]>
|
---|
447 |
|
---|
448 | Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
|
---|
449 |
|
---|
450 | This module is free software; you can redistribute it
|
---|
451 | and/or modify it under the same terms as Perl itself.
|
---|
452 |
|
---|
453 | =head1 SEE ALSO
|
---|
454 |
|
---|
455 | =over 4
|
---|
456 |
|
---|
457 | =item http://www.unicode.org/reports/tr15/
|
---|
458 |
|
---|
459 | Unicode Normalization Forms - UAX #15
|
---|
460 |
|
---|
461 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
|
---|
462 |
|
---|
463 | Derived Normalization Properties
|
---|
464 |
|
---|
465 | =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
|
---|
466 |
|
---|
467 | Normalization Corrections
|
---|
468 |
|
---|
469 | =item http://www.unicode.org/review/pr-29.html
|
---|
470 |
|
---|
471 | Public Review Issue #29: Normalization Issue
|
---|
472 |
|
---|
473 | =item http://www.unicode.org/notes/tn5/
|
---|
474 |
|
---|
475 | Canonical Equivalence in Applications - UTN #5
|
---|
476 |
|
---|
477 | =back
|
---|
478 |
|
---|
479 | =cut
|
---|