Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 1240

Last change on this file since 1240 was 1227, checked in by sjboddie, 24 years ago
Modified the perl code for importing arabic encoded documents. Plugins now support a windows_1256 and an iso_8859_6 encoding. I was briefly under the impression that these two encodings were similar enough to be treated the same. It turns out they're not. It appears that the Windows codepage 1256 is the most commonly used Arabic encoding so "arabic" is a synonym for windows_1256.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.6 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# unicode.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[71]	26	# useful functions for dealing with Unicode
	27
	28	# Unicode strings are stored as arrays of scalars as perl
	29	# lacks characters are 8-bit (currently)
	30
	31	package unicode;
	32
[1227]	33	%translations = ();
[71]	34
[1223]	35	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
	36	# and returns a unicode array.
[73]	37	sub ascii2unicode {
	38	my ($in) = @_;
	39	my $out = [];
	40
	41	my $i = 0;
	42	my $len = length($in);
	43	while ($i < $len) {
	44	push (@$out, ord(substr ($in, $i, 1)));
	45	$i++;
	46	}
	47
	48	return $out;
	49	}
	50
[1223]	51	# arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
	52	# and returns a unicode array
	53	sub arabic2unicode {
	54	my ($in) = @_;
	55	my $out = [];
[73]	56
[1223]	57	my $i = 0;
	58	my $len = length($in);
	59	while ($i < $len) {
	60	my $c = ord(substr ($in, $i, 1));
	61	$c += (1567-191) if ($c >= 0x80);
	62	push (@$out, $c);
	63	$i++;
	64	}
	65
	66	return $out;
	67	}
	68
[1227]	69	# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
	70	# and returns a unicode array. These encodings are similar to but not
	71	# identical to the corresponding ISO-8859 encodings.
	72	#
	73	# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
	74	sub windows2unicode {
	75	my ($encoding, $in) = @_;
	76	my $out = [];
	77
	78	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
	79	"WINDOWS", "$encoding.TXT");
	80	return $out unless &loadmapping ($encoding, $mapfile);
	81
	82	my $i = 0;
	83	my $len = length($in);
	84	while ($i < $len) {
	85	my $c = ord(substr ($in, $i, 1));
	86	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
	87	push (@$out, $c);
	88	$i++;
	89	}
	90
	91	return $out;
	92	}
	93
	94
[73]	95	# ascii2utf8 takes a (extended) ascii string and
	96	# returns a UTF-8 encoded string. This is just
	97	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
	98	sub ascii2utf8 {
	99	my ($in) = @_;
	100	my $out = "";
	101
	102	my ($c);
	103	my $i = 0;
	104	my $len = length($in);
	105	while ($i < $len) {
	106	$c = ord (substr ($in, $i, 1));
	107	if ($c < 0x80) {
	108	# ascii character
	109	$out .= chr ($c);
	110
	111	} else {
	112	# extended ascii character
	113	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
	114	$out .= chr (0x80 + ($c & 0x3f));
	115	}
	116	$i++;
	117	}
	118
	119	return $out;
	120	}
	121
	122
[71]	123	# unicode2utf8 takes a unicode array as input and encodes it
	124	# using utf-8
	125	sub unicode2utf8 {
	126	my ($in) = @_;
	127	my $out = "";
	128
	129	foreach $num (@$in) {
	130	if ($num < 0x80) {
	131	$out .= chr ($num);
	132
	133	} elsif ($num < 0x800) {
	134	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
	135	$out .= chr (0x80 + ($num & 0x3f));
	136
	137	} elsif ($num < 0xFFFF) {
	138	$out .= chr (0xe0 + (($num >> 12) & 0xf));
	139	$out .= chr (0x80 + (($num >> 6) & 0x3f));
	140	$out .= chr (0x80 + ($num & 0x3f));
	141
	142	} else {
	143	# error, don't encode anything
	144	die;
	145	}
	146	}
	147
	148	return $out;
	149	}
	150
	151
	152	# utf82unicode takes a utf-8 string and produces a unicode
	153	# array
	154	sub utf82unicode {
	155	my ($in) = @_;
	156	my $out = [];
	157
	158	my $i = 0;
	159	my ($c1, $c2, $c3);
	160	$len = length($in);
	161	while ($i < $len) {
	162	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
	163	# normal ascii character
	164	push (@$out, $c1);
	165
	166	} elsif ($c1 < 0xc0) {
	167	# error, was expecting the first byte of an
	168	# encoded character. Do nothing.
	169
	170	} elsif ($c1 < 0xe0 && $i+1 < $len) {
	171	# an encoded character with two bytes
	172	$c2 = ord (substr ($in, $i+1, 1));
	173	if ($c2 >= 0x80 && $c2 < 0xc0) {
	174	# everything looks ok
	175	push (@$out, ((($c1 & 0x1f) << 6) +
	176	($c2 & 0x3f)));
	177	$i++; # gobbled an extra byte
	178	}
	179
	180	} elsif ($c1 < 0xf0 && $i+2 < $len) {
	181	# an encoded character with three bytes
	182	$c2 = ord (substr ($in, $i+1, 1));
	183	$c3 = ord (substr ($in, $i+2, 1));
	184	if ($c2 >= 0x80 && $c2 < 0xc0 &&
	185	$c3 >= 0x80 && $c3 < 0xc0) {
	186	# everything looks ok
	187	push (@$out, ((($c1 & 0xf) << 12) +
	188	(($c2 & 0x3f) << 6) +
	189	($c3 & 0x3f)));
	190
	191	$i += 2; # gobbled an extra two bytes
	192	}
	193
	194	} else {
	195	# error, only decode Unicode characters not full UCS.
	196	# Do nothing.
	197	}
	198
	199	$i++;
	200	}
	201
	202	return $out;
	203	}
	204
	205
	206	# unicode2ucs2 takes a unicode array and produces a UCS-2
	207	# unicode string (every two bytes forms a unicode character)
	208	sub unicode2ucs2 {
	209	my ($in) = @_;
	210	my $out = "";
	211
	212	foreach $num (@$in) {
	213	$out .= chr (($num & 0xff00) >> 8);
	214	$out .= chr ($num & 0xff);
	215	}
	216
	217	return $out;
	218	}
	219
	220
	221	# ucs22unicode takes a UCS-2 string and produces a unicode array
	222	sub ucs22unicode {
	223	my ($in) = @_;
	224	my $out = [];
	225
	226	my $i = 0;
	227	my $len = length ($in);
	228	while ($i+1 < $len) {
	229	push (@$out, ord (substr($in, $i, 1)) << 8 +
	230	ord (substr($in, $i+1, 1)));
	231
	232	$i ++;
	233	}
	234
	235	return $out;
	236	}
	237
[1227]	238	# loadmapping expects the mapfile to contain (at least) two
	239	# tab-separated fields. The first field is the mapped value
	240	# and the second field is the unicode value.
	241	#
	242	# It returns 1 if successful, 0 if unsuccessful
	243	sub loadmapping {
	244	my ($encoding, $mapfile) = @_;
	245
	246	my $to = "$encoding-unicode";
	247	my $from = "unicode-$encoding";
[71]	248
[1227]	249	# check to see if the encoding has already been loaded
	250	if (defined $translations{$to} && defined $translations{$from}) {
	251	return 1;
	252	}
	253
	254	if (!open (MAPFILE, $mapfile)) {
	255	print STDERR "ERROR: unable to load mapfile $mapfile\n";
	256	return 0;
	257	}
	258
	259	my ($line, @line);
	260	$translations{$to} = {};
	261	$translations{$from} = {};
	262	while (defined ($line = <MAPFILE>)) {
	263	# remove comments
	264	$line =~ s/\#.*$//;
	265	next unless $line =~ /\S/;
	266
	267	# split the line into fields and do a few
	268	# simple sanity checks
	269	@line = split (/\t/, $line);
	270	next unless (scalar(@line) >= 2 &&
	271	$line[0] =~ /^0x/ &&
	272	$line[1] =~ /^0x/);
	273
	274	my $a = hex($line[0]);
	275	my $b = hex($line[1]);
	276
	277	$translations{$to}->{$a} = $b;
	278	$translations{$from}->{$b} = $a;
	279	}
	280
	281	close (MAPFILE);
	282
	283	return 1;
	284	}
	285
	286
[71]	287	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: