###########################################################################
#
# gb.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

#! /usr/local/bin/perl5 -w

# useful methods for dealing with the GuoBiao (GB) encoding

package gb;

use util;

# %encodings is of the form:
#
# encodings{encodingname-encodingname}->blocktranslation
# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
#
# Any of the top translation blocks can point to an undefined
# value. This data structure aims to allow fast translation and 
# efficient storage.

# @array256 is used for initialisation, there must be
# a better way...
@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);


%translations = ();


# gb2unicode uses the GBK translation table
sub gb2unicode {
    generalgb2unicode ("GBK", @_);
}

# returns a pointer to unicode array
sub generalgb2unicode {
    my ($encoding, $intext) = @_;

    if (!&loadmapencoding ("$encoding-unicode")) {
	print STDERR "gb: ERROR - could not load encoding $encoding\n";
	return [];
    }
    
    my @outtext = ();
    my $encodename = "$encoding-unicode";
    my $len = length($intext);
    my ($c1, $c2);
    my $i = 0;

    while ($i < $len) {
	if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) {
	    if ($i+1 < $len) {
		# found a GB character, put black square if cannot translate
		$c2 = ord(substr($intext, $i+1, 1));
		my $c = &transchar ($encodename, ($c1 << 8) | $c2);
		$c = 0x25a1 if $c == 0;
		push (@outtext, $c);
		$i += 2;
		
	    } else {
		# error
		print STDERR "gb: ERROR missing second half of GB character\n";
		$i++;
	    }
	    
	} else {
	    # normal ascii character
	    push (@outtext, $c1) if ($c1 < 0x80);
	    $i++;
	}
    }
    
    return \@outtext;
}


# unicode2gb uses the GBK translation table
sub unicode2gb {
    generalunicode2gb ("GBK", @_);
}

# returns a gb string. Note that this will not translate
# traditional forms of characters to their simplified forms
# Any traditional characters in the unicode will not be translated
sub generalunicode2gb {
    my ($encoding, $intext) = @_;

    # load the encoding (if it is not already loaded)
    if (!&loadmapencoding ("unicode-$encoding")) {
	print STDERR "gb: ERROR - could not load encoding $encoding\n";
	return "";
    }
    
    # translate the string
    my ($num, $gbc);
    my $outtext = "";
    my $encodename = "unicode-$encoding";
    foreach $num (@$intext) {
	if ($num < 0x80) {
	    # normal ascii character
	    $outtext .= chr ($num);
	} else {
	    # gb character
	    my $gbc = &transchar ($encodename, $num);
	    $gbc = 0xa1f5 if $gbc == 0;
	    $outtext .= chr ($gbc >> 8);
	    $outtext .= chr ($gbc & 0xff);
	}
    }

    return $outtext;
}


# It returns 1 if successful, 0 if unsuccessful
sub loadmapencoding {
    my ($encoding) = @_;
    
    # check to see if the encoding has already been loaded
    return 1 if (defined $translations{$encoding});

    my $filename = $encoding;
    $filename =~ s/\-//;
    $filename =~ s/unicode/u/i;
    $filename =~ tr/A-Z/a-z/;
    $filename .= ".ump"; # unicode map file

    return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename");
    binmode (MAPFILE); # f$#@!!! windows

    $translations{$encoding} = [@array256];
    my $block = $translations{$encoding};

    my ($in,$i,$j);
    while (read(MAPFILE, $in, 1) == 1) {
	$i = unpack ("C", $in);
	$block->[$i] = [@array256];
	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
	    my ($n1, $n2) = unpack ("CC", $in);
	    $block->[$i]->[$j] = ($n1*256) + $n2;
	}
    }

    close (MAPFILE);
}


# loadencoding expects the mapfile to contain (at least) two
# tab-seperated fields. The first field is the mapped value
# and the second field is the unicode value.
#
# It returns 1 if successful, 0 if unsuccessful
sub loadencoding {
    my ($encoding, $mapfile) = @_;
    
    my $to = "$encoding-unicode";
    my $from = "unicode-$encoding";

    # check to see if the encoding has already been loaded
    if (defined $translations{$to} && defined $translations{$from}) {
	return 1;
    }

    return 0 unless open (MAPFILE, $mapfile);

    my ($line, @line);
    $translations{$to} = [@array256];
    $translations{$from} = [@array256];
    while (defined ($line = <MAPFILE>)) {
	# remove comments
	$line =~ s/\#.*$//;
	next unless $line =~ /\S/;

	# split the line into fields and do a few
	# simple sanity checks
	@line = split (/\t/, $line);
	next unless (scalar(@line) >= 2 &&
		     $line[0] =~ /^0x/ &&
		     $line[1] =~ /^0x/);

	my $gbc = hex($line[0]);
	my $unic = hex($line[1]);

	# gbk has already had the necessary most significant
	# bits set (it is used in the encoding
	$gbc = $gbc | 0x8080 unless ($encoding =~ /gbk/i);

	&addchartrans ($translations{$to}, $gbc, $unic);
	&addchartrans ($translations{$from}, $unic, $gbc);
    }

    close (MAPFILE);

    return 1;
}

sub showfilled {
    my ($block) = @_;
    my $count = 0;
    foreach $el (@$block) {
	if (ref ($el) eq "ARRAY") {
	    print "1";
	    $count++;
	} else {
	    print "0";
	}
    }
    print "\n";
    print "total: $count\n";
}


# addchartrans adds one character translation to a translation block.
# It also simplifies the translation block if possible.
sub addchartrans {
    my ($block, $from, $to) = @_;
    my $i = 0;

    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    if (ref ($block->[$high]) ne "ARRAY") {
	$block->[$high] = [@array256];
    }
    $block->[$high]->[$low] = $to;
}


sub transchar {
    my ($encoding, $from) = @_;
    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    return 0 unless defined $translations{$encoding};

    my $block = $translations{$encoding};

    if (ref ($block->[$high]) ne "ARRAY") {
	return 0;
    }
    return $block->[$high]->[$low];
}


1;