source:
trunk/gsdl/perllib/cnseg.pm@
241
Last change on this file since 241 was 76, checked in by , 26 years ago | |
---|---|
|
|
File size: 1.2 KB |
Line | |
---|---|
1 | # This package segments a Chinese UTF-8 encoded Unicode |
2 | # string into words. |
3 | |
4 | package cnseg; |
5 | |
6 | use unicode; |
7 | |
8 | |
9 | # 'segment' takes a UTF-8 encoded Unicode Chinese-language |
10 | # string and places U+200B between words -- the ZERO |
11 | # WIDTH SPACE. Each line is treated as a separate |
12 | # paragraph, so lines in one paragraph should |
13 | # be joined before using this method (normally a single |
14 | # word might span more than one line within a paragraph). |
15 | # |
16 | # 'segment' is currently written in Perl, however, I (Rodger) |
17 | # plan to use C++ (via pipes) once a more complex (and useful!) |
18 | # algorithm is being used. Currently, each Chinese character |
19 | # is treated as a separate word. |
20 | |
# segment($in)
#
# Marks word boundaries in $in with U+200B (ZERO WIDTH SPACE), treating
# every Chinese character as a word of its own.
#
#   $in      - the text to segment; it is handed to unicode::utf82unicode,
#              whose result is used here as a reference to an array of
#              numeric code points.
#   returns  - whatever unicode::unicode2utf8 produces for the segmented
#              code-point array.
#
# A code point counts as Chinese when it lies in 0x4e00-0x9fa5 or in
# 0xf900-0xfa2d.  Every Chinese character is followed by a U+200B (also
# at the very end of the input), and is preceded by one only when the
# previously emitted code point was a non-Chinese character.
sub segment {
    my ($in) = @_;

    my $codepoints = &unicode::utf82unicode($in);
    my @segmented;

    # True while the output so far ends in a separator.  It starts out
    # true because nothing needs separating at the start of the text.
    my $after_break = 1;

    foreach my $code (@$codepoints) {
        my $is_chinese = ($code >= 0x4e00 && $code <= 0x9fa5)
                      || ($code >= 0xf900 && $code <= 0xfa2d);

        if (!$is_chinese) {
            # Non-Chinese characters are copied through untouched.
            push @segmented, $code;
            $after_break = 0;
            next;
        }

        # Close off any preceding non-Chinese run, then emit the
        # character together with its own trailing separator.
        push @segmented, 0x200b if !$after_break;
        push @segmented, $code, 0x200b;
        $after_break = 1;
    }

    return &unicode::unicode2utf8(\@segmented);
}
47 | |
48 | 1; |
Note:
See TracBrowser
for help on using the repository browser.