###########################################################################
#
# unicode.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# useful functions for dealing with Unicode

# Unicode strings are stored as arrays of scalars (one number per
# character) because perl characters are 8-bit (currently)

package unicode;


# ascii2unicode takes a (extended) ascii string and
# returns a unicode array.
sub ascii2unicode {
    # Arguments: $in - an (extended) ascii string.
    # Returns:   a reference to a new array holding the ordinal value
    #            of every character of $in, in order. An empty string
    #            gives a reference to an empty array.
    my ($in) = @_;

    # split with an empty pattern yields one field per character
    my @codes = map { ord ($_) } split (//, $in);

    return \@codes;
}


# ascii2utf8 takes a (extended) ascii string and
# returns a UTF-8 encoded string. This is just
# a faster version of "&unicode2utf8(&ascii2unicode($str));"
sub ascii2utf8 {
    # Arguments: $in - an (extended) ascii string.
    # Returns:   a string in which every character below 0x80 is
    #            copied unchanged and every other character is
    #            replaced by a two-byte UTF-8 sequence.
    my ($in) = @_;

    my @encoded = map {
        my $code = ord ($_);
        ($code < 0x80)
            # plain ascii, one byte
            ? chr ($code)
            # extended ascii: 110xxxxx 10xxxxxx
            : chr (0xc0 | (($code >> 6) & 0x1f)) . chr (0x80 | ($code & 0x3f));
    } split (//, $in);

    return join ("", @encoded);
}


# unicode2utf8 takes a unicode array as input and encodes it
# using utf-8
sub unicode2utf8 {
    # Arguments: $in - a reference to an array of unicode values.
    # Returns:   a string holding the UTF-8 encoding of those values,
    #            in order.
    # Dies with a message if a value is greater than 0xFFFF, as only
    # one, two and three byte sequences are generated.
    my ($in) = @_;
    my $out = "";

    # the loop variable is lexical so that no package global is used
    foreach my $num (@$in) {
        if ($num < 0x80) {
            # one byte: 0xxxxxxx
            $out .= chr ($num);

        } elsif ($num < 0x800) {
            # two bytes: 110xxxxx 10xxxxxx
            $out .= chr (0xc0 + (($num >> 6) & 0x1f));
            $out .= chr (0x80 + ($num & 0x3f));

        } elsif ($num < 0x10000) {
            # three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            # (0xFFFF itself still fits in three bytes)
            $out .= chr (0xe0 + (($num >> 12) & 0xf));
            $out .= chr (0x80 + (($num >> 6) & 0x3f));
            $out .= chr (0x80 + ($num & 0x3f));

        } else {
            # error, don't encode anything
            die sprintf ("unicode2utf8: cannot encode 0x%X, " .
                         "only values up to 0xFFFF are supported", $num);
        }
    }

    return $out;
}


# utf82unicode takes a utf-8 string and produces a unicode
# array
sub utf82unicode {
    # Arguments: $in - a string of UTF-8 encoded bytes.
    # Returns:   a reference to an array of the decoded unicode values.
    # Only one, two and three byte sequences are decoded. Bytes that
    # cannot be decoded (stray continuation bytes, lead bytes without
    # valid continuation bytes, sequences cut off by the end of the
    # string, lead bytes of longer sequences) are skipped silently.
    my ($in) = @_;
    my $out = [];

    my $i = 0;
    my ($c1, $c2, $c3);
    # lexical, so that a call does not overwrite a package global $len
    my $len = length($in);
    while ($i < $len) {
        if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
            # normal ascii character
            push (@$out, $c1);

        } elsif ($c1 < 0xc0) {
            # error, was expecting the first byte of an
            # encoded character. Do nothing.

        } elsif ($c1 < 0xe0 && $i+1 < $len) {
            # an encoded character with two bytes
            $c2 = ord (substr ($in, $i+1, 1));
            if ($c2 >= 0x80 && $c2 < 0xc0) {
                # everything looks ok
                push (@$out, ((($c1 & 0x1f) << 6) +
                      ($c2 & 0x3f)));
                $i++; # gobbled an extra byte
            }
            # otherwise only the lead byte is skipped and the
            # following byte is examined on its own

        } elsif ($c1 < 0xf0 && $i+2 < $len) {
            # an encoded character with three bytes
            $c2 = ord (substr ($in, $i+1, 1));
            $c3 = ord (substr ($in, $i+2, 1));
            if ($c2 >= 0x80 && $c2 < 0xc0 &&
                $c3 >= 0x80 && $c3 < 0xc0) {
                # everything looks ok
                push (@$out, ((($c1 & 0xf) << 12) +
                      (($c2 & 0x3f) << 6) +
                      ($c3 & 0x3f)));

                $i += 2; # gobbled an extra two bytes
            }

        } else {
            # error, only decode Unicode characters not full UCS
            # (or the sequence was cut off by the end of the string).
            # Do nothing.
        }

        $i++;
    }

    return $out;
}


# unicode2ucs2 takes a unicode array and produces a UCS-2
# unicode string (every two bytes forms a unicode character)
sub unicode2ucs2 {
    # Arguments: $in - a reference to an array of unicode values.
    # Returns:   a string with two bytes per value, the high byte
    #            first. Only the low 16 bits of each value are kept.
    my ($in) = @_;
    my $out = "";

    # the loop variable is lexical so that no package global is used
    foreach my $num (@$in) {
        $out .= chr (($num >> 8) & 0xff);   # high byte
        $out .= chr ($num & 0xff);          # low byte
    }

    return $out;
}


# ucs22unicode takes a UCS-2 string and produces a unicode array
sub ucs22unicode {
    # Arguments: $in - a UCS-2 string, two bytes per character with
    #            the high byte first (the layout written by
    #            unicode2ucs2).
    # Returns:   a reference to an array of the unicode values.
    # A single byte left over at the end of the string is ignored.
    my ($in) = @_;
    my $out = [];

    my $len = length ($in);
    # step two bytes at a time so that the pairs do not overlap
    for (my $i = 0; $i + 1 < $len; $i += 2) {
        my $high = ord (substr ($in, $i, 1));
        my $low = ord (substr ($in, $i + 1, 1));

        # the shift is parenthesised because "+" binds tighter
        # than "<<"
        push (@$out, ($high << 8) + $low);
    }

    return $out;
}


1;