source: trunk/gsdl/perllib/cnseg.pm@ 241

Last change on this file since 241 was 76, checked in by rjmcnab, 26 years ago

Initial revision.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 1.2 KB
Line 
1# this package segments a chinese UTF-8 encoded Unicode
2# string into words.
3
4package cnseg;
5
6use unicode;
7
8
9# 'segment' takes a UTF-8 encoded Unicode Chinese-language
10# string and places U+200B between words -- the ZERO
11# WIDTH SPACE. Each line is treated as a separate
12# paragraph, so lines in one paragraph should
13# be joined before using this method (normally a single
14# word might span more than one line within a paragraph).
15#
16# 'segment' is currently written in Perl, however, I (Rodger)
17# plan to use C++ (via pipes) once a more complex (and useful!)
18# algorithm is being used. Currently, each Chinese character
19# is treated as a separate word.
20
# Split a UTF-8 encoded string into "words" by inserting U+200B (ZERO WIDTH
# SPACE) around every Chinese character, so that each Chinese character
# stands as its own word.
#
# Takes:   $in - the UTF-8 encoded text; it is handed to
#                unicode::utf82unicode, whose result is iterated as a
#                reference to a list of numeric code points.
# Returns: the result of unicode::unicode2utf8 applied to the segmented
#          code point list.
#
# Separator placement:
#   * one separator is emitted after every Chinese character, including
#     the last one in the input;
#   * one separator is emitted before a Chinese character that directly
#     follows a non-Chinese character;
#   * nothing is emitted before a Chinese character at the very start of
#     the input, and adjacent Chinese characters share a single separator.
sub segment {
    my ($in) = @_;

    my $zwsp = 0x200b;    # ZERO WIDTH SPACE, used as the word separator

    # Inclusive code point ranges that are treated as Chinese characters.
    my ($main_first,   $main_last)   = (0x4e00, 0x9fa5);
    my ($compat_first, $compat_last) = (0xf900, 0xfa2d);

    my $uniin = unicode::utf82unicode($in);
    my $out   = [];

    # True while the output so far already ends at a word boundary (either
    # nothing has been emitted yet or the last thing emitted was a
    # separator), so no leading separator is needed.
    my $space = 1;

    foreach my $c (@$uniin) {
        my $is_chinese = ($c >= $main_first   && $c <= $main_last)
                      || ($c >= $compat_first && $c <= $compat_last);

        if ($is_chinese) {
            push @$out, $zwsp unless $space;
            push @$out, $c, $zwsp;
            $space = 1;
        }
        else {
            # Non-Chinese characters are copied through unchanged.
            push @$out, $c;
            $space = 0;
        }
    }

    return unicode::unicode2utf8($out);
}
47
481;
Note: See TracBrowser for help on using the repository browser.