###########################################################################
#
# CJKTextSegmenter - helper plugin that segments Chinese/Japanese/Korean text
# into single characters
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2008 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package CJKTextSegmenter;

use cnseg;
use PrintInfo;
use strict;

# Establish the inheritance chain while the file is still being compiled,
# so that method lookups on CJKTextSegmenter fall back to PrintInfo.
BEGIN {
    @CJKTextSegmenter::ISA = qw(PrintInfo);
}

# Command-line style arguments contributed by this class: a single optional
# flag, 'separate_cjk'. The 'desc' values are brace-wrapped keys, not literal
# descriptions (presumably resolved against a string resource elsewhere --
# confirm).
my $arguments = [
    {
        'name' => 'separate_cjk',
        'desc' => '{CJKTextSegmenter.separate_cjk}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Option block describing this class itself; it carries the argument list
# above under 'args'.
my $options = {
    'name'     => 'CJKTextSegmenter',
    'desc'     => '{CJKTextSegmenter.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};


# Constructor.
#
# Arguments (following the class name):
#   $pluginlist      - array ref; the name of the class being constructed
#                      is appended to it
#   $inputargs       - handed through unchanged to PrintInfo's constructor
#   $hashArgOptLists - hash ref; this class's argument specs are appended to
#                      its "ArgList" array and its option block to its
#                      "OptList" array
#
# Returns the object produced by PrintInfo's constructor, re-blessed into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # Call the parent constructor with explicit method syntax. The indirect
    # form ("new PrintInfo(...)") is resolved by a parser heuristic and is
    # fragile in a package that defines its own 'new'.
    # The trailing 1 is forwarded as-is; its meaning is defined by PrintInfo
    # (presumably an "auxiliary class" marker -- confirm against PrintInfo).
    my $self = PrintInfo->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    return bless $self, $class;
}


# Segment the text and metadata of every section of a document.
#
# Does nothing unless this object's 'separate_cjk' option is set. Otherwise
# the sections are visited in the order given by get_top_section() /
# get_next_section(), and for each one cnseg::segment is applied to the
# section text and to the value (element 1) of every metadata entry returned
# by get_all_metadata().
#
# Arguments:
#   $doc_obj - the document object whose sections are rewritten
#
# Returns nothing.
sub separate_cjk_text {
    my ($self, $doc_obj) = @_;

    return unless $self->{'separate_cjk'};

    my $thissection = $doc_obj->get_top_section();
    while (defined $thissection) {

        # Segment the text. add_utf8_text appends to whatever text the
        # section already holds (see doc.pm), so the unsegmented text has to
        # be cleared first; otherwise the section would end up containing
        # the original text followed by the segmented copy.
        my $text = $doc_obj->get_text($thissection);
        $text = cnseg::segment($text);
        $doc_obj->delete_text($thissection);
        $doc_obj->add_utf8_text($thissection, $text);

        # And now the metadata: each value is overwritten through the
        # returned reference. NOTE(review): this relies on get_all_metadata
        # handing back the document's own entries rather than copies --
        # confirm against doc.pm.
        my $metadataref = $doc_obj->get_all_metadata($thissection);
        foreach my $data (@{$metadataref}) {
            $data->[1] = cnseg::segment($data->[1]);
        }

        $thissection = $doc_obj->get_next_section($thissection);
    }

    return;
}

1;