Context Navigation

cnseg.pm@ 32130

Last change on this file since 32130 was 25788, checked in by kjdon, 12 years ago
segmentation code was assuming strings in utf8 but we have changed to using unicode aware strings, so no conversion needed.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 3.4 KB

Line
1	###########################################################################
2	#
3	# cnseg.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26
27	# this package segments a Chinese Unicode string into words by
28	# inserting U+200B (ZERO WIDTH SPACE) between them.
29
30	package cnseg;
31
32	use strict;
33	use unicode;
34
35
# 'segment' takes a Chinese-language Unicode string and places
# U+200B -- the ZERO WIDTH SPACE -- between words. The input is
# examined one character at a time by code point (ord), so it must
# be a Perl character string, not UTF-8 encoded bytes.
#
# Each line is treated as a separate paragraph, so lines in one
# paragraph should be joined before using this method (normally a
# single word might span more than one line within a paragraph).
#
# 'segment' is currently written in Perl, however, I (Rodger)
# plan to use C++ (via pipes) once a more complex (and useful!)
# algorithm is being used. Currently, each Chinese character
# is treated as a separate word.
#
# Parameters:
#   $in - the string to segment. undef is treated as empty.
#
# Returns:
#   A copy of $in in which every character in the ranges
#   U+2E80..U+D7A3 and U+F900..U+FA6A is followed by U+200B, and is
#   also preceded by U+200B when the previous character was outside
#   those ranges. All other characters are copied unchanged. Note
#   that the result ends with U+200B when the last character of $in
#   is in one of the ranges.

sub segment {
    my ($in) = @_;

    # Nothing to segment; the result for undef is the same empty
    # string the character loop would produce for "".
    return "" unless defined $in;

    my $zwsp = chr(0x200b);    # ZERO WIDTH SPACE, the word separator
    my $out  = "";

    # True while $out is empty or already ends with a separator, so
    # that two adjacent CJK characters share a single U+200B.
    my $space = 1;    # start doesn't need a space

    foreach my $c (split(//, $in)) {
        my $cl = ord($c);
        if (($cl >= 0x2e80 && $cl <= 0xd7a3) ||
            ($cl >= 0xf900 && $cl <= 0xfa6a)) {    # main east asian codes
            # currently c++ receptionist code can't handle these large numbers
            # search terms need to be segmented the same way. Add these back
            # in when fix up c++
            # ($cl >= 0x20000 && $cl <= 0x2a6d6) || # cjk unified ideographs ext B
            # ($cl >= 0x2f800 && $cl <= 0x2fa1d)) { # cjk compatibility ideographs supplement

            # CJK character: make sure it is delimited on both sides
            $out .= $zwsp unless $space;
            $out .= $c . $zwsp;
            $space = 1;
        } else {
            # non-CJK character: copied through as-is
            $out .= $c;
            $space = 0;
        }
    }

    return $out;
}
79
# 'segment_old' applies the same rule as 'segment' -- U+200B (ZERO
# WIDTH SPACE) around every character in U+2E80..U+D7A3 and
# U+F900..U+FA6A -- but works on a list of code points instead of
# on the characters of a Perl string.
#
# The input is handed to unicode::utf82unicode, whose result is
# dereferenced as an array of numeric code points; the output array
# is handed to unicode::unicode2utf8 and that call's result is
# returned.
# NOTE(review): going by those helper names, input and output are
# UTF-8 encoded strings -- confirm against unicode.pm.
# NOTE(review): the name suggests this has been superseded by
# 'segment' -- confirm there are no remaining callers before
# removing it.
#
# Parameters:
#   $in - the string to segment (passed straight to
#         unicode::utf82unicode).
#
# Returns:
#   Whatever unicode::unicode2utf8 returns for the segmented list
#   of code points.
sub segment_old {
    my ($in) = @_;

    my $uniin = &unicode::utf82unicode($in);
    my $out   = [];

    # True while @$out is empty or already ends with a separator, so
    # that two adjacent CJK characters share a single U+200B.
    my $space = 1;    # start doesn't need a space

    foreach my $c (@$uniin) {
        if (($c >= 0x2e80 && $c <= 0xd7a3) ||
            ($c >= 0xf900 && $c <= 0xfa6a)) {    # main east asian codes
            # currently c++ receptionist code can't handle these large numbers
            # search terms need to be segmented the same way. Add these back
            # in when fix up c++
            # ($c >= 0x20000 && $c <= 0x2a6d6) || # cjk unified ideographs ext B
            # ($c >= 0x2f800 && $c <= 0x2fa1d)) { # cjk compatibility ideographs supplement

            # CJK character: make sure it is delimited on both sides
            push(@$out, 0x200b) unless $space;
            push(@$out, $c, 0x200b);
            $space = 1;
        } else {
            # non-Chinese character
            push(@$out, $c);
            $space = 0;
        }
    }

    return &unicode::unicode2utf8($out);
}
110
111	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cnseg.pm@ 32130

Download in other formats: