Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 593

Last change on this file since 593 was 537, checked in by sjboddie, 25 years ago
added GPL headers
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 4.4 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# unicode.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[71]	26	# useful functions for dealing with Unicode
	27
	28	# Unicode strings are stored as arrays of scalars as perl
	29	# lacks characters are 8-bit (currently)
	30
	31	package unicode;
	32
	33
[73]	34
	35	# ascii2unicode takes a (extended) ascii string and
	36	# returns a unicode array.
	37	sub ascii2unicode {
	38	my ($in) = @_;
	39	my $out = [];
	40
	41	my $i = 0;
	42	my $len = length($in);
	43	while ($i < $len) {
	44	push (@$out, ord(substr ($in, $i, 1)));
	45	$i++;
	46	}
	47
	48	return $out;
	49	}
	50
	51
	52	# ascii2utf8 takes a (extended) ascii string and
	53	# returns a UTF-8 encoded string. This is just
	54	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
	55	sub ascii2utf8 {
	56	my ($in) = @_;
	57	my $out = "";
	58
	59	my ($c);
	60	my $i = 0;
	61	my $len = length($in);
	62	while ($i < $len) {
	63	$c = ord (substr ($in, $i, 1));
	64	if ($c < 0x80) {
	65	# ascii character
	66	$out .= chr ($c);
	67
	68	} else {
	69	# extended ascii character
	70	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
	71	$out .= chr (0x80 + ($c & 0x3f));
	72	}
	73	$i++;
	74	}
	75
	76	return $out;
	77	}
	78
	79
[71]	80	# unicode2utf8 takes a unicode array as input and encodes it
	81	# using utf-8
	82	sub unicode2utf8 {
	83	my ($in) = @_;
	84	my $out = "";
	85
	86	foreach $num (@$in) {
	87	if ($num < 0x80) {
	88	$out .= chr ($num);
	89
	90	} elsif ($num < 0x800) {
	91	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
	92	$out .= chr (0x80 + ($num & 0x3f));
	93
	94	} elsif ($num < 0xFFFF) {
	95	$out .= chr (0xe0 + (($num >> 12) & 0xf));
	96	$out .= chr (0x80 + (($num >> 6) & 0x3f));
	97	$out .= chr (0x80 + ($num & 0x3f));
	98
	99	} else {
	100	# error, don't encode anything
	101	die;
	102	}
	103	}
	104
	105	return $out;
	106	}
	107
	108
	109	# utf82unicode takes a utf-8 string and produces a unicode
	110	# array
	111	sub utf82unicode {
	112	my ($in) = @_;
	113	my $out = [];
	114
	115	my $i = 0;
	116	my ($c1, $c2, $c3);
	117	$len = length($in);
	118	while ($i < $len) {
	119	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
	120	# normal ascii character
	121	push (@$out, $c1);
	122
	123	} elsif ($c1 < 0xc0) {
	124	# error, was expecting the first byte of an
	125	# encoded character. Do nothing.
	126
	127	} elsif ($c1 < 0xe0 && $i+1 < $len) {
	128	# an encoded character with two bytes
	129	$c2 = ord (substr ($in, $i+1, 1));
	130	if ($c2 >= 0x80 && $c2 < 0xc0) {
	131	# everything looks ok
	132	push (@$out, ((($c1 & 0x1f) << 6) +
	133	($c2 & 0x3f)));
	134	$i++; # gobbled an extra byte
	135	}
	136
	137	} elsif ($c1 < 0xf0 && $i+2 < $len) {
	138	# an encoded character with three bytes
	139	$c2 = ord (substr ($in, $i+1, 1));
	140	$c3 = ord (substr ($in, $i+2, 1));
	141	if ($c2 >= 0x80 && $c2 < 0xc0 &&
	142	$c3 >= 0x80 && $c3 < 0xc0) {
	143	# everything looks ok
	144	push (@$out, ((($c1 & 0xf) << 12) +
	145	(($c2 & 0x3f) << 6) +
	146	($c3 & 0x3f)));
	147
	148	$i += 2; # gobbled an extra two bytes
	149	}
	150
	151	} else {
	152	# error, only decode Unicode characters not full UCS.
	153	# Do nothing.
	154	}
	155
	156	$i++;
	157	}
	158
	159	return $out;
	160	}
	161
	162
	163	# unicode2ucs2 takes a unicode array and produces a UCS-2
	164	# unicode string (every two bytes forms a unicode character)
	165	sub unicode2ucs2 {
	166	my ($in) = @_;
	167	my $out = "";
	168
	169	foreach $num (@$in) {
	170	$out .= chr (($num & 0xff00) >> 8);
	171	$out .= chr ($num & 0xff);
	172	}
	173
	174	return $out;
	175	}
	176
	177
	178	# ucs22unicode takes a UCS-2 string and produces a unicode array
	179	sub ucs22unicode {
	180	my ($in) = @_;
	181	my $out = [];
	182
	183	my $i = 0;
	184	my $len = length ($in);
	185	while ($i+1 < $len) {
	186	push (@$out, ord (substr($in, $i, 1)) << 8 +
	187	ord (substr($in, $i+1, 1)));
	188
	189	$i ++;
	190	}
	191
	192	return $out;
	193	}
	194
	195
	196	1;
	197

Note: See TracBrowser for help on using the repository browser.

Download in other formats: