root/gsdl/trunk/perllib/plugins/CJKTextSegmenter.pm @ 16646

Revision 16646, 2.5 KB (checked in by kjdon, 12 years ago)

now segments all metadata as well as text

Line 
1###########################################################################
2#
3# CJKTextSegmenter - helper plugin that segments Chinese/Japanese/Korean text
4# into single characters
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2008 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package CJKTextSegmenter;
29
30use cnseg;
31use PrintInfo;
32use strict;
33
# Establish the inheritance chain at compile time: this helper derives
# from PrintInfo.
BEGIN {
    @CJKTextSegmenter::ISA = qw(PrintInfo);
}
37
# Argument descriptions contributed by this helper: a single optional
# flag, separate_cjk.
# NOTE(review): the brace-wrapped 'desc' values look like lookup keys that
# are resolved to display strings elsewhere -- confirm against PrintInfo.
my $arguments = [
    {
        'name' => 'separate_cjk',
        'desc' => '{CJKTextSegmenter.separate_cjk}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option block describing this helper; 'args' refers to the argument
# list declared above.
my $options = {
    'name'     => 'CJKTextSegmenter',
    'desc'     => '{CJKTextSegmenter.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
49
50
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to PrintInfo's constructor
#   $hashArgOptLists - hash ref; this helper's argument descriptions are
#                      appended to its 'ArgList' entry and its option block
#                      to its 'OptList' entry
#
# Returns the object built by PrintInfo's constructor, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this class in the caller-supplied list.
    push @{$pluginlist}, $class;

    # Contribute this helper's arguments and options to the shared lists.
    push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    push @{ $hashArgOptLists->{'OptList'} }, $options;

    # Explicit class-method call rather than the indirect-object form
    # ("new PrintInfo(...)"), whose parse depends on which names Perl has
    # already seen -- fragile inside a package that defines its own new().
    # NOTE(review): the meaning of the trailing 1 is defined by
    # PrintInfo::new -- confirm there.
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}
64
65
# Runs cnseg::segment over the text and every metadata value of each
# section of a document, starting at the top section and following
# get_next_section until it returns undef.
#
# Parameters:
#   $self    - object whose 'separate_cjk' entry enables the segmentation
#   $doc_obj - document object; must provide get_top_section,
#              get_next_section, get_text, add_utf8_text and
#              get_all_metadata
#
# Does nothing when 'separate_cjk' is false. Returns nothing.
sub separate_cjk_text {
    my $self = shift;
    my ($doc_obj) = @_;

    # Segmentation is opt-in.
    return unless $self->{'separate_cjk'};

    my $section = $doc_obj->get_top_section();
    while (defined $section) {

        # Segment the section text and hand the result to the document.
        # NOTE(review): the existing text is not removed first; if
        # add_utf8_text appends rather than replaces, the section ends up
        # with both the original and the segmented text -- confirm
        # against the document class.
        my $text      = $doc_obj->get_text($section);
        my $segmented = cnseg::segment($text);
        $doc_obj->add_utf8_text($section, $segmented);

        # Segment each metadata value (element 1 of every entry) by
        # writing through the references returned by get_all_metadata.
        # NOTE(review): this only affects the document if those are the
        # live entries rather than copies -- confirm.
        my $metadataref = $doc_obj->get_all_metadata($section);
        foreach my $data (@{$metadataref}) {
            $data->[1] = cnseg::segment($data->[1]);
        }

        $section = $doc_obj->get_next_section($section);
    }

    return;
}
89
901;
Note: See TracBrowser for help on using the browser.