source: gsdl/trunk/perllib/plugins/CJKTextSegmenter.pm@ 16640

Last change on this file since 16640 was 16640, checked in by kjdon, 16 years ago

helper plugin to separate cjk text into individual characters

File size: 2.3 KB
Line 
1###########################################################################
2#
3# CJKTextSegmenter - helper plugin that segments Chinese/Japanese/Korean text
4# into single characters
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2008 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package CJKTextSegmenter;
29
30use cnseg;
31use PrintInfo;
32use strict;
33
# Set up inheritance at compile time: CJKTextSegmenter derives from PrintInfo.
BEGIN {
    @CJKTextSegmenter::ISA = qw(PrintInfo);
}
37
# Arguments contributed by this helper plugin: a single optional flag,
# 'separate_cjk'. The 'desc' values are brace-wrapped keys rather than
# literal descriptions (presumably resolved against a strings resource
# elsewhere -- confirm).
my $arguments = [
    {
        name => 'separate_cjk',
        desc => '{CJKTextSegmenter.separate_cjk}',
        type => 'flag',
        reqd => 'no',
    },
];

# Descriptor for this plugin; new() appends it to the caller's "OptList".
my $options = {
    name     => 'CJKTextSegmenter',
    desc     => '{CJKTextSegmenter.desc}',
    abstract => 'yes',
    inherits => 'yes',
    args     => $arguments,
};
49
50
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through unchanged to PrintInfo->new
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its options
#                      descriptor to its "OptList" entry
#
# Returns the object built by PrintInfo->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Record this class in the list of plugins being constructed.
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use the arrow form rather than indirect object syntax
    # ("new PrintInfo(...)"): this package defines its own sub new, and the
    # indirect form relies on parser heuristics to pick the right one.
    # NOTE(review): the trailing 1 presumably marks this as an auxiliary
    # (helper) plugin -- confirm against PrintInfo::new.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
64
65
# Segment the text of every section of a document into separate CJK
# characters, replacing each section's text with the segmented version.
#
# Parameters:
#   $self    - plugin object; nothing is done unless $self->{'separate_cjk'}
#              is true
#   $doc_obj - document object; must provide get_top_section,
#              get_next_section, get_text, delete_text and add_utf8_text
#
# Returns nothing useful.
sub separate_cjk_text {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    return unless $self->{'separate_cjk'};

    # Walk the sections until get_next_section reports there are no more.
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        my $segmented = cnseg::segment($doc_obj->get_text($section));

        # Clear the section first so the segmented text replaces the
        # original instead of being added after it.
        # NOTE(review): assumes add_utf8_text appends to existing section
        # text and that the doc object offers delete_text -- confirm
        # against doc.pm.
        $doc_obj->delete_text($section);
        $doc_obj->add_utf8_text($section, $segmented);
    }

    return;
}
82
831;
Note: See TracBrowser for help on using the repository browser.