###########################################################################
#
# cjk.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# functions for dealing with CJK (and possibly other) complex character
# encodings

package cjk;

use util;


###########################################################################

# GB to Unicode
sub gb2unicode {
    return &cjk2unicode ("GBK", 0x25a1, @_);
}

sub unicode2gb {
    return &unicode2cjk ("GBK", 0xA1F5, @_);
}

###########################################################################

# EUC encoded Japanese to unicode (doesn't currently support JIS X
# 0212-1990 3-byte characters)
sub eucjp2unicode {
    my ($intext) = @_;

    if (!&loadmapencoding ("JIS-unicode")) {
	print STDERR "cjk: ERROR - could not load encoding JIS\n";
	return [];
    }

    my @outtext = ();
    my $encodename = "JIS-unicode";
    my $len = length($intext);
    my ($c1, $c2);
    my $i = 0;

    while ($i < $len) {
	$c1 = ord(substr($intext, $i, 1));
	if ($c1 < 0x80) {
	    # ASCII/JIS Roman character
	    $c1 = 0x00A5 if $c1 == 0x5C; # yen sign
	    $c1 = 0x203E if $c1 == 0x7E; # overline
	    push (@outtext, $c1);
	    $i ++;

	} elsif ($c1 == 0x8E) {
	    # half-width katakana character
	    $c2 = ord(substr($intext, $i+1, 1));
	    if ($c2 >= 0xA1 && $c2 <= 0xDF) {
		my $c = &transchar ($encodename, $c2);
		# put a black square if cannot translate
		$c = 0x25A1 if $c == 0;
		push (@outtext, $c);
	    } else {
		# error
		print STDERR "cjk: ERROR Invalid EUS-JP character\n";		
	    }
	    $i += 2;

	} elsif ($c1 >= 0xA1 && $c1 <= 0xFE) {
	    # double byte character
	    $c2 = ord(substr($intext, $i+1, 1));
	    if ($c2 >= 0xA1 && $c2 <= 0xFE) {
		$c1 = $c1 & 0x7F;
		$c2 = $c2 & 0x7F;
		my $c = &transchar ($encodename, ($c1 << 8) | $c2);

		# katakana midle dot seems to be the default character of choice
		# for this encoding
		$c = 0x30FB if $c == 0;
		push (@outtext, $c);
	    } else {
		# error
		print STDERR "cjk: ERROR Invalid EUS-JP character\n";
	    }
	    $i += 2;

	} elsif ($c1 == 0x8F) {
	    # three byte character - not supported - output a black square
	    print STDERR "cjk: WARNING EUS-JP string appears to contain 3 byte characters ";
	    print STDERR "which aren't supported\n";
	    push (@outtext, 0x25A1);
	    $i += 3;

	} else {
	    # error
	    print STDERR "cjk: ERROR Invalid EUS-JP character\n";
	    $i ++;
	}
    }
    return \@outtext;
}

# todo unicode2eucjp

###########################################################################

# UHC Korean to unicode
sub uhc2unicode {
    return &cjk2unicode ("UHC", 0x25a1, @_);
}

###########################################################################

# Shift-JIS to unicode
sub sjis2unicode {
    my ($intext) = @_;

    if (!&loadmapencoding ("SJIS-unicode")) {
	print STDERR "cjk: ERROR - could not load encoding SJIS\n";
	return [];
    }
    
    my @outtext = ();
    my $encodename = "SJIS-unicode";
    my $len = length($intext);
    my ($c1, $c2);
    my $i = 0;

    while ($i < $len) {
	$c1 = ord(substr($intext, $i, 1));

	if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
	    # Single byte half-width katakana character or
	    # JIS Roman yen or overline characters
	    my $c = &transchar ($encodename, $c1);
	    # - put a black square if cannot translate
	    $c = 0x25A1 if $c == 0;
	    push (@outtext, $c);
	    $i++;

	} elsif ($c1 < 0x80) {
	    # ASCII
	    push (@outtext, $c1);
	    $i ++;

	} elsif ($c1 < 0xEF) {
	    if ($i+1 < $len) {
		$c2 = ord(substr($intext, $i+1, 1));
		if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
		    # Double byte shift-jis character
		    my $c = &transchar ($encodename, ($c1 << 8) | $c2);
		    # put a black square if cannot translate
		    $c = 0x25A1 if $c == 0;
		    push (@outtext, $c);
		} else {
		    # error
		    print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
		}
		$i += 2;
	    } else {
		# error
		print STDERR "cjk: ERROR missing second half of Shift-JIS character\n";
		$i ++;
	    }
	} else {
	    # error
	    print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
	    $i ++;
	}
    }
    return \@outtext;
}

sub unicode2sjis {
    return &unicode2cjk ("SJIS", 0x81A0, @_,);
}

###########################################################################

# %translations is of the form:
#
# encodings{encodingname-encodingname}->blocktranslation
# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
#
# Any of the top translation blocks can point to an undefined
# value. This data structure aims to allow fast translation and 
# efficient storage.
%translations = ();

# @array256 is used for initialisation, there must be
# a better way...
@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);

# The following two functions may be used when converting between some cjk
# encodings and unicode. They treat all characters <= 0x80 as ascii and
# convert everything else using the mapping for $encoding.
# $dchar is the default character that's inserted if a character can't be
# translated.

# returns a pointer to unicode array
sub cjk2unicode {
    my ($encoding, $dchar, $intext) = @_;
    $dchar = 0x20 unless defined $dchar;

    if (!&loadmapencoding ("$encoding-unicode")) {
	print STDERR "cjk: ERROR - could not load encoding $encoding\n";
	return [];
    }
    
    my @outtext = ();
    my $encodename = "$encoding-unicode";
    my $len = length($intext);
    my ($c1, $c2);
    my $i = 0;

    while ($i < $len) {
	if (($c1 = ord(substr($intext, $i, 1))) >= 0x80) {
	    if ($i+1 < $len) {
		# double byte character
		$c2 = ord(substr($intext, $i+1, 1));
		my $c = &transchar ($encodename, ($c1 << 8) | $c2);
		$c = $dchar if $c == 0;
		push (@outtext, $c);
		$i += 2;
		
	    } else {
		# error
		print STDERR "cjk: ERROR missing second half of double byte character\n";
		$i++;
	    }
	    
	} else {
	    # normal ascii character
	    push (@outtext, $c1);
	    $i++;
	}
    }
    return \@outtext;
}

sub unicode2cjk {
    my ($encoding, $dchar, $intext) = @_;
    $dchar = 0x20 unless defined $dchar;

    # load the encoding (if it is not already loaded)
    if (!&loadmapencoding ("unicode-$encoding")) {
	print STDERR "cjk: ERROR - could not load encoding $encoding\n";
	return "";
    }
    
    # translate the string
    my ($num, $char);
    my $outtext = "";
    my $encodename = "unicode-$encoding";
    foreach $num (@$intext) {
	if ($num < 0x80) {
	    # normal ascii character
	    $outtext .= chr ($num);
	} else {
	    # mapped character
	    my $char = &transchar ($encodename, $num);
	    $char = $dchar if $char == 0;
	    $outtext .= chr ($char >> 8);
	    $outtext .= chr ($char & 0xff);
	}
    }
    return $outtext;
}

# returns 1 if successful, 0 if unsuccessful
sub loadmapencoding {
    my ($encoding) = @_;
    
    # check to see if the encoding has already been loaded
    return 1 if (defined $translations{$encoding});

    my $filename = $encoding;
    $filename =~ s/\-//;
    $filename =~ s/unicode/u/i;
    $filename =~ tr/A-Z/a-z/;
    $filename .= ".ump"; # unicode map file

    return 0 unless open (MAPFILE, &util::filename_cat ($ENV{'GSDLHOME'}, "unicode", $filename));
    binmode (MAPFILE);

    $translations{$encoding} = [@array256];
    my $block = $translations{$encoding};

    my ($in,$i,$j);
    while (read(MAPFILE, $in, 1) == 1) {
	$i = unpack ("C", $in);
	$block->[$i] = [@array256];
	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
	    my ($n1, $n2) = unpack ("CC", $in);
	    $block->[$i]->[$j] = ($n1*256) + $n2;
	}
    }

    close (MAPFILE);
}

# loadencoding expects the mapfile to contain (at least) two
# tab-separated fields. The first field is the mapped value
# and the second field is the unicode value.
#
# It returns 1 if successful, 0 if unsuccessful
sub loadencoding {
    my ($encoding, $mapfile) = @_;
    
    my $to = "$encoding-unicode";
    my $from = "unicode-$encoding";

    # check to see if the encoding has already been loaded
    if (defined $translations{$to} && defined $translations{$from}) {
	return 1;
    }

    return 0 unless open (MAPFILE, $mapfile);

    my ($line, @line);
    $translations{$to} = [@array256];
    $translations{$from} = [@array256];
    while (defined ($line = <MAPFILE>)) {
	chomp $line;
	# remove comments
	$line =~ s/\#.*$//;
	next unless $line =~ /\S/;

	# split the line into fields and do a few
	# simple sanity checks
	@line = split (/\t/, $line);
	next unless (scalar(@line) >= 2 &&
		     $line[0] =~ /^0x/ &&
		     $line[1] =~ /^0x/);

	my $char = hex($line[0]);
	my $unic = hex($line[1]);

	# might need this for some versions of gb but not gbk
#	$char = $char | 0x8080 unless ($encoding =~ /gbk/i);

	&addchartrans ($translations{$to}, $char, $unic);
	&addchartrans ($translations{$from}, $unic, $char);
    }

    close (MAPFILE);

    return 1;
}

# addchartrans adds one character translation to a translation block.
# It also simplifies the translation block if possible.
sub addchartrans {
    my ($block, $from, $to) = @_;
    my $i = 0;

    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    if (ref ($block->[$high]) ne "ARRAY") {
	$block->[$high] = [@array256];
    }
    $block->[$high]->[$low] = $to;
}

sub transchar {
    my ($encoding, $from) = @_;
    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    return 0 unless defined $translations{$encoding};

    my $block = $translations{$encoding};

    if (ref ($block->[$high]) ne "ARRAY") {
	return 0;
    }
    return $block->[$high]->[$low];
}

sub writemapfile {
    my ($encoding, $filename, $tounicode) = @_;

    $filename .= ".ump"; # unicode map file
    if ($tounicode) {
	$filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename);
    } else {
	$filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename);
    }

    die "translation not defined" if (!defined $translations{$encoding});
    my $block = $translations{$encoding};

    print "writing $filename\n";
    open (MAPFILE, ">" . $filename) || die;
    binmode (MAPFILE);

    my ($i, $j);
    for ($i=0; $i<256; $i++) {
	if (ref ($block->[$i]) eq "ARRAY") {
	    print MAPFILE pack ("C", $i);
	    for ($j=0; $j<256; $j++) {
		# unsigned short in network order
		print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256), 
				    $block->[$i]->[$j] % 256);
	    }
	}
    }
    close (MAPFILE);
}

sub makeencodingmapfile {
    my ($encoding, $txtmapfile) = @_;

    if (!&loadencoding ($encoding, $txtmapfile)) {
	die "couldn't load encoding $encoding";
    }

    # write out map files
    &writemapfile ("$encoding-unicode", $encoding, 1);
    &writemapfile ("unicode-$encoding", $encoding, 0);
}

1;