###########################################################################
#
# cnseg.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This package segments a Chinese UTF-8 encoded Unicode
# string into words.

package cnseg;

# Enable strictures: the rest of this module already declares all of
# its variables with `my' and uses fully-qualified calls into the
# `unicode' helper module, so this only catches future mistakes.
use strict;
use warnings;

use unicode;

# 'segment' takes a UTF-8 encoded Unicode Chinese-language
# string and places U+200B between words -- the ZERO
# WIDTH SPACE.  Each line is treated as a separate
# paragraph, so lines in one paragraph should
# be joined before using this method (normally a single
# word might span more than one line within a paragraph).
#
# 'segment' is currently written in Perl, however, I (Rodger)
# plan to use C++ (via pipes) once a more complex (and useful!)
# algorithm is being used.  Currently, each Chinese character
# is treated as a separate word.

# segment($utf8_text) -- insert U+200B (ZERO WIDTH SPACE) word breaks
# around every Chinese character of a UTF-8 encoded string, so each
# Chinese character is treated as a word in its own right.  Returns the
# UTF-8 encoded result.
sub segment {
    my ($utf8_text) = @_;

    # Work on an array of Unicode code points rather than raw UTF-8 bytes.
    my $codepoints = &unicode::utf82unicode($utf8_text);

    my @result;
    my $just_broke = 1;    # no break is needed at the very start

    for my $cp (@$codepoints) {
        # CJK Unified Ideographs (U+4E00..U+9FA5) and the CJK
        # Compatibility Ideographs range (U+F900..U+FA2D).
        my $is_chinese = ($cp >= 0x4e00 && $cp <= 0x9fa5)
                      || ($cp >= 0xf900 && $cp <= 0xfa2d);

        if ($is_chinese) {
            # Surround the character with word breaks, but avoid a
            # doubled break when the previous character already ended
            # with one.
            push (@result, 0x200b) unless $just_broke;
            push (@result, $cp, 0x200b);
            $just_broke = 1;
        }
        else {
            push (@result, $cp);
            $just_broke = 0;
        }
    }

    return &unicode::unicode2utf8(\@result);
}

1;