Context Navigation

cnseg.pm@ 241

Last change on this file since 241 was 76, checked in by rjmcnab, 26 years ago
Initial revision.
Property svn:executable set to `*`. Property svn:keywords set to `Author Date Id Revision`.
File size: 1.2 KB

Line
1	# this package segments a chinese UTF-8 encoded Unicode
2	# string into words.
3
4	package cnseg;
5
6	use unicode;
7
8
9	# 'segment' takes a UTF-8 encoded Unicode Chinese-language
10	# string and places U+200B between words -- the ZERO
11	# WIDTH SPACE. Each line is treated as a separate
12	# paragraph, so lines in one paragraph should
13	# be joined before using this method (normally a single
14	# word might span more than one line within a paragraph).
15	#
16	# 'segment' is currently written in Perl, however, I (Rodger)
17	# plan to use C++ (via pipes) once a more complex (and useful!)
18	# algorithm is being used. Currently, each Chinese character
19	# is treated as a separate word.
20
# segment($in)
#
# Inserts U+200B (ZERO WIDTH SPACE) around every Chinese character of $in
# so that each such character stands as its own word.
#
# Parameters:
#   $in - the string to segment; it is handed unchanged to
#         unicode::utf82unicode (presumably UTF-8 encoded text -- the
#         helper lives in another module and is not visible here).
#
# Returns:
#   whatever unicode::unicode2utf8 produces for the segmented list of
#   code points (presumably the UTF-8 encoded result string).
#
# A code point counts as Chinese when it lies in 0x4e00-0x9fa5 or
# 0xf900-0xfa2d.  Every Chinese character is followed by U+200B, including
# the last one in the input, and is preceded by U+200B only when the
# previous code point was not Chinese, so two separators are never emitted
# back to back.
sub segment {
    my ($in) = @_;

    # The helper's result is used as a reference to an array of numeric
    # code points: it is dereferenced and compared numerically below.
    my $codepoints = unicode::utf82unicode($in);
    my @segmented;

    # True while the output already ends on a word boundary.  It starts
    # true because no separator is wanted before the very first character.
    my $at_boundary = 1;

    for my $cp (@$codepoints) {
        my $is_chinese = ($cp >= 0x4e00 && $cp <= 0x9fa5)
                      || ($cp >= 0xf900 && $cp <= 0xfa2d);

        if ($is_chinese) {
            # Open a boundary if the previous character did not leave one,
            # then emit the character and close its word with a separator.
            push @segmented, 0x200b unless $at_boundary;
            push @segmented, $cp, 0x200b;
            $at_boundary = 1;
        }
        else {
            # Non-Chinese code points are copied through untouched; the
            # next Chinese character will need a leading separator.
            push @segmented, $cp;
            $at_boundary = 0;
        }
    }

    return unicode::unicode2utf8(\@segmented);
}
47
48	1;

Note: See TracBrowser for help on using the repository browser.