###########################################################################
#
# unicode.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# useful functions for dealing with Unicode

# Unicode strings are stored as arrays of scalars as perl 
# lacks characters are 8-bit (currently)

package unicode;

%translations = ();

# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
# and returns a unicode array.
sub ascii2unicode {
    my ($in) = @_;
    my $out = [];

    my $i = 0;
    my $len = length($in);
    while ($i < $len) {
	push (@$out, ord(substr ($in, $i, 1)));	
	$i++;
    }

    return $out;
}

# arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
# and returns a unicode array
sub arabic2unicode {
    my ($in) = @_;
    my $out = [];

    my $i = 0;
    my $len = length($in);
    while ($i < $len) {
	my $c = ord(substr ($in, $i, 1));
	$c += (1567-191) if ($c >= 0x80);
	push (@$out, $c);
	$i++;
    }

    return $out;
}

# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
# and returns a unicode array. These encodings are similar to but not
# identical to the corresponding ISO-8859 encodings.
#
# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
sub windows2unicode {
    my ($encoding, $in) = @_;
    my $out = [];

    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 
				      "WINDOWS", "$encoding.TXT"); 
    return $out unless &loadmapping ($encoding, $mapfile);

    my $i = 0;
    my $len = length($in);
    while ($i < $len) {
	my $c = ord(substr ($in, $i, 1));
	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
	push (@$out, $c);
	$i++;
    }

    return $out;
}


# ascii2utf8 takes a (extended) ascii string and
# returns a UTF-8 encoded string. This is just
# a faster version of "&unicode2utf8(&ascii2unicode($str));"
sub ascii2utf8 {
    my ($in) = @_;
    my $out = "";

    my ($c);
    my $i = 0;
    my $len = length($in);
    while ($i < $len) {
	$c = ord (substr ($in, $i, 1));
	if ($c < 0x80) {
	    # ascii character
	    $out .= chr ($c);

	} else {
	    # extended ascii character
	    $out .= chr (0xc0 + (($c >> 6) & 0x1f));
	    $out .= chr (0x80 + ($c & 0x3f));
	}
	$i++;
    }

    return $out;
}


# unicode2utf8 takes a unicode array as input and encodes it
# using utf-8
sub unicode2utf8 {
    my ($in) = @_;
    my $out = "";
    
    foreach $num (@$in) {
	if ($num < 0x80) {
	    $out .= chr ($num);

	} elsif ($num < 0x800) {
	    $out .= chr (0xc0 + (($num >> 6) & 0x1f));
	    $out .= chr (0x80 + ($num & 0x3f));

	} elsif ($num < 0xFFFF) {
	    $out .= chr (0xe0 + (($num >> 12) & 0xf));
	    $out .= chr (0x80 + (($num >> 6) & 0x3f));
	    $out .= chr (0x80 + ($num & 0x3f));

	} else {
	    # error, don't encode anything
	    die;
	}
    }

    return $out;
}


# utf82unicode takes a utf-8 string and produces a unicode
# array
sub utf82unicode {
    my ($in) = @_;
    my $out = [];

    my $i = 0;
    my ($c1, $c2, $c3);
    $len = length($in);
    while ($i < $len) {
	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
	    # normal ascii character
	    push (@$out, $c1);

	} elsif ($c1 < 0xc0) {
	    # error, was expecting the first byte of an
	    # encoded character. Do nothing.

	} elsif ($c1 < 0xe0 && $i+1 < $len) {
	    # an encoded character with two bytes
	    $c2 = ord (substr ($in, $i+1, 1));
	    if ($c2 >= 0x80 && $c2 < 0xc0) {
		# everything looks ok
		push (@$out, ((($c1 & 0x1f) << 6) +
		      ($c2 & 0x3f)));
		$i++; # gobbled an extra byte
	    }

	} elsif ($c1 < 0xf0 && $i+2 < $len) {
	    # an encoded character with three bytes
	    $c2 = ord (substr ($in, $i+1, 1));
	    $c3 = ord (substr ($in, $i+2, 1));
	    if ($c2 >= 0x80 && $c2 < 0xc0 &&
		$c3 >= 0x80 && $c3 < 0xc0) {
		# everything looks ok
		push (@$out, ((($c1 & 0xf) << 12) +
		      (($c2 & 0x3f) << 6) +
		      ($c3 & 0x3f)));

		$i += 2; # gobbled an extra two bytes
	    }

	} else {
	    # error, only decode Unicode characters not full UCS.
	    # Do nothing.
	}

	$i++;
    }

    return $out;
}


# unicode2ucs2 takes a unicode array and produces a UCS-2
# unicode string (every two bytes forms a unicode character)
sub unicode2ucs2 {
    my ($in) = @_;
    my $out = "";

    foreach $num (@$in) {
	$out .= chr (($num & 0xff00) >> 8);
	$out .= chr ($num & 0xff);
    }

    return $out;
}


# ucs22unicode takes a UCS-2 string and produces a unicode array
sub ucs22unicode {
    my ($in) = @_;
    my $out = [];

    my $i = 0;
    my $len = length ($in);
    while ($i+1 < $len) {
	push (@$out, ord (substr($in, $i, 1)) << 8 +
	      ord (substr($in, $i+1, 1)));

	$i ++;
    }

    return $out;
}

# loadmapping expects the mapfile to contain (at least) two
# tab-separated fields. The first field is the mapped value
# and the second field is the unicode value.
#
# It returns 1 if successful, 0 if unsuccessful
sub loadmapping {
    my ($encoding, $mapfile) = @_;
    
    my $to = "$encoding-unicode";
    my $from = "unicode-$encoding";

    # check to see if the encoding has already been loaded
    if (defined $translations{$to} && defined $translations{$from}) {
	return 1;
    }

    if (!open (MAPFILE, $mapfile)) {
	print STDERR "ERROR: unable to load mapfile $mapfile\n";
	return 0;
    }

    my ($line, @line);
    $translations{$to} = {};
    $translations{$from} = {};
    while (defined ($line = <MAPFILE>)) {
	# remove comments
	$line =~ s/\#.*$//;
	next unless $line =~ /\S/;

	# split the line into fields and do a few
	# simple sanity checks
	@line = split (/\t/, $line);
	next unless (scalar(@line) >= 2 &&
		     $line[0] =~ /^0x/ &&
		     $line[1] =~ /^0x/);

	my $a = hex($line[0]);
	my $b = hex($line[1]);

	$translations{$to}->{$a} = $b;
	$translations{$from}->{$b} = $a;
    }

    close (MAPFILE);

    return 1;
}


1;