Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 1868

Last change on this file since 1868 was 1868, checked in by sjboddie, 23 years ago
Made a bunch of changes to the building code to support lots of new languages and encodings. It's still kind of a mess but should be fixed up over the weekend.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	%translations = ();
34
35	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36	# and returns a unicode array.
37	sub ascii2unicode {
38	my ($in) = @_;
39	my $out = [];
40
41	my $i = 0;
42	my $len = length($in);
43	while ($i < $len) {
44	push (@$out, ord(substr ($in, $i, 1)));
45	$i++;
46	}
47
48	return $out;
49	}
50
51	# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
52	# and returns a unicode array. These encodings are similar to but not
53	# identical to the corresponding ISO-8859 encodings.
54	#
55	# $encoding should be the code page name (e.g. '1252')
56	#
57	# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
58	sub windows2unicode {
59	my ($encoding, $in) = @_;
60	my $out = [];
61
62	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
63	"WINDOWS", "$encoding.TXT");
64	return $out unless &loadmapping ($encoding, $mapfile);
65
66	my $i = 0;
67	my $len = length($in);
68	while ($i < $len) {
69	my $c = ord(substr ($in, $i, 1));
70	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
71	push (@$out, $c);
72	$i++;
73	}
74
75	return $out;
76	}
77
78	# iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
79	# and returns a unicode array. This function is much like windows2unicode()
80	# except that only characters >= 0xA0 are read from the mapping file (since
81	# all characters below that are the same for all iso-8859 character sets
82	# and therefore already the same as unicode).
83	#
84	# Note that while this function will work for iso-8859-1 (latin 1) it'll be
85	# much faster to use ascii2unicode() or ascii2utf8()
86	#
87	# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
88	# encoding is
89	#
90	# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
91	sub iso2unicode {
92	my ($encoding, $in) = @_;
93	my $out = [];
94
95	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
96	"ISO_8859", "$encoding.TXT");
97	return $out unless &loadmapping ($encoding, $mapfile);
98
99	my $i = 0;
100	my $len = length($in);
101	while ($i < $len) {
102	my $c = ord(substr ($in, $i, 1));
103	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
104	push (@$out, $c);
105	$i++;
106	}
107
108	return $out;
109	}
110
111	# cyrillic2unicode is basically identical to windows2unicode, the only
112	# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
113	#
114	# values for $encoding may be 'koi8_r' or 'koi8_u'
115	sub cyrillic2unicode {
116	my ($encoding, $in) = @_;
117	my $out = [];
118
119	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
120	"CYRILLIC", "$encoding.txt");
121	return $out unless &loadmapping ($encoding, $mapfile);
122
123	my $i = 0;
124	my $len = length($in);
125	while ($i < $len) {
126	my $c = ord(substr ($in, $i, 1));
127	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
128	push (@$out, $c);
129	$i++;
130	}
131
132	return $out;
133	}
134
135	# iscii2unicode is basically identical to iso2unicode, the only
136	# difference being that the map files live in unicode/MAPPINGS/ISCII
137	#
138	# values for $encoding may be 'Devanagari' only at present
139	sub iscii2unicode {
140	my ($encoding, $in) = @_;
141	my $out = [];
142
143	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
144	"ISCII", "$encoding.txt");
145	return $out unless &loadmapping ($encoding, $mapfile);
146
147	my $i = 0;
148	my $len = length($in);
149	while ($i < $len) {
150	my $c = ord(substr ($in, $i, 1));
151	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
152	push (@$out, $c);
153	$i++;
154	}
155
156	return $out;
157	}
158
159	# ascii2utf8 takes a (extended) ascii string and
160	# returns a UTF-8 encoded string. This is just
161	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
162	sub ascii2utf8 {
163	my ($in) = @_;
164	my $out = "";
165
166	my ($c);
167	my $i = 0;
168	my $len = length($in);
169	while ($i < $len) {
170	$c = ord (substr ($in, $i, 1));
171	if ($c < 0x80) {
172	# ascii character
173	$out .= chr ($c);
174
175	} else {
176	# extended ascii character
177	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
178	$out .= chr (0x80 + ($c & 0x3f));
179	}
180	$i++;
181	}
182
183	return $out;
184	}
185
186
187	# unicode2utf8 takes a unicode array as input and encodes it
188	# using utf-8
189	sub unicode2utf8 {
190	my ($in) = @_;
191	my $out = "";
192
193	foreach $num (@$in) {
194	next unless defined $num;
195	if ($num < 0x80) {
196	$out .= chr ($num);
197
198	} elsif ($num < 0x800) {
199	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
200	$out .= chr (0x80 + ($num & 0x3f));
201
202	} elsif ($num < 0xFFFF) {
203	$out .= chr (0xe0 + (($num >> 12) & 0xf));
204	$out .= chr (0x80 + (($num >> 6) & 0x3f));
205	$out .= chr (0x80 + ($num & 0x3f));
206
207	} else {
208	# error, don't encode anything
209	die;
210	}
211	}
212
213	return $out;
214	}
215
216
217	# utf82unicode takes a utf-8 string and produces a unicode
218	# array
219	sub utf82unicode {
220	my ($in) = @_;
221	my $out = [];
222
223	my $i = 0;
224	my ($c1, $c2, $c3);
225	$len = length($in);
226	while ($i < $len) {
227	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
228	# normal ascii character
229	push (@$out, $c1);
230
231	} elsif ($c1 < 0xc0) {
232	# error, was expecting the first byte of an
233	# encoded character. Do nothing.
234
235	} elsif ($c1 < 0xe0 && $i+1 < $len) {
236	# an encoded character with two bytes
237	$c2 = ord (substr ($in, $i+1, 1));
238	if ($c2 >= 0x80 && $c2 < 0xc0) {
239	# everything looks ok
240	push (@$out, ((($c1 & 0x1f) << 6) +
241	($c2 & 0x3f)));
242	$i++; # gobbled an extra byte
243	}
244
245	} elsif ($c1 < 0xf0 && $i+2 < $len) {
246	# an encoded character with three bytes
247	$c2 = ord (substr ($in, $i+1, 1));
248	$c3 = ord (substr ($in, $i+2, 1));
249	if ($c2 >= 0x80 && $c2 < 0xc0 &&
250	$c3 >= 0x80 && $c3 < 0xc0) {
251	# everything looks ok
252	push (@$out, ((($c1 & 0xf) << 12) +
253	(($c2 & 0x3f) << 6) +
254	($c3 & 0x3f)));
255
256	$i += 2; # gobbled an extra two bytes
257	}
258
259	} else {
260	# error, only decode Unicode characters not full UCS.
261	# Do nothing.
262	}
263
264	$i++;
265	}
266
267	return $out;
268	}
269
270
271	# unicode2ucs2 takes a unicode array and produces a UCS-2
272	# unicode string (every two bytes forms a unicode character)
273	sub unicode2ucs2 {
274	my ($in) = @_;
275	my $out = "";
276
277	foreach $num (@$in) {
278	$out .= chr (($num & 0xff00) >> 8);
279	$out .= chr ($num & 0xff);
280	}
281
282	return $out;
283	}
284
285
286	# ucs22unicode takes a UCS-2 string and produces a unicode array
287	sub ucs22unicode {
288	my ($in) = @_;
289	my $out = [];
290
291	my $i = 0;
292	my $len = length ($in);
293	while ($i+1 < $len) {
294	push (@$out, ord (substr($in, $i, 1)) << 8 +
295	ord (substr($in, $i+1, 1)));
296
297	$i ++;
298	}
299
300	return $out;
301	}
302
303	# loadmapping expects the mapfile to contain (at least) two
304	# tab-separated fields. The first field is the mapped value
305	# and the second field is the unicode value.
306	#
307	# It returns 1 if successful, 0 if unsuccessful
308	sub loadmapping {
309	my ($encoding, $mapfile) = @_;
310
311	my $to = "$encoding-unicode";
312	my $from = "unicode-$encoding";
313
314	# check to see if the encoding has already been loaded
315	if (defined $translations{$to} && defined $translations{$from}) {
316	return 1;
317	}
318
319	if (!open (MAPFILE, $mapfile)) {
320	print STDERR "ERROR: unable to load mapfile $mapfile\n";
321	return 0;
322	}
323
324	my ($line, @line);
325	$translations{$to} = {};
326	$translations{$from} = {};
327	while (defined ($line = <MAPFILE>)) {
328	# remove comments
329	$line =~ s/\#.*$//;
330	next unless $line =~ /\S/;
331
332	# split the line into fields and do a few
333	# simple sanity checks
334	@line = split (/\t/, $line);
335	next unless (scalar(@line) >= 2 &&
336	$line[0] =~ /^0x/ &&
337	$line[1] =~ /^0x/);
338
339	my $a = hex($line[0]);
340	my $b = hex($line[1]);
341
342	$translations{$to}->{$a} = $b;
343	$translations{$from}->{$b} = $a;
344	}
345
346	close (MAPFILE);
347
348	return 1;
349	}
350
351
352
353
354
355
356
357
358
359	####################################################################################################
360
361
362	# %translations is of the form:
363	#
364	# encodings{encodingname-encodingname}->blocktranslation
365	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
366	#
367	# Any of the top translation blocks can point to an undefined
368	# value. This data structure aims to allow fast translation and
369	# efficient storage.
370	%translations = ();
371
372	# @array256 is used for initialisation, there must be
373	# a better way...
374	@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
375	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
376	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
377	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
378	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
379	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
380	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
381	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
382	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
383	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
384	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
385	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
386	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
387	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
388	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
389	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
390
391	$encodings = {
392	'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
393	'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
394
395	'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
396	'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
397
398	'iso_8859_3' => {'fullname' => 'Latin3',
399	'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
400
401	'iso_8859_4' => {'fullname' => 'Latin4',
402	'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
403
404	'iso_8859_5' => {'fullname' => 'Cyrillic',
405	'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
406
407	'iso_8859_6' => {'fullname' => 'Arabic',
408	'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
409
410	'iso_8859_7' => {'fullname' => 'Greek',
411	'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
412
413	'iso_8859_8' => {'fullname' => 'Hebrew',
414	'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
415
416	'iso_8859_9' => {'fullname' => 'Latin5',
417	'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
418
419	'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
420	'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
421
422	'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
423	'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
424
425	'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
426	'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
427
428	'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
429	'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
430
431	'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
432	'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
433
434	'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
435	'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
436
437	'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
438	'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
439
440	'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
441	'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
442
443	'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
444	'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
445
446	'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
447	'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
448
449	'koi8_r' => {'fullname' => 'Cyrillic',
450	'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
451
452	'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
453	'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
454
455	'iscii_de' => {'fullname' => 'ISCII Devanagari',
456	'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
457	};
458
459	# returns a pointer to unicode array
460	sub simple2unicode {
461	my ($encoding, $intext) = @_;
462
463	if (!defined ($encodings->{$encoding})) {
464	print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
465	return [];
466	}
467
468	my $info = $encodings->{$encoding};
469	my $encodename = "$encoding-unicode";
470	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
471	$info->{'mapfile'});
472
473	if (!&loadmapencoding ($encodename, $mapfile)) {
474	print STDERR "unicode: ERROR - could not load encoding $encodename\n";
475	return [];
476	}
477
478	my @outtext = ();
479	my $len = length($intext);
480	my ($c);
481	my $i = 0;
482
483	while ($i < $len) {
484	if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
485	# normal ascii character
486	push (@outtext, $c);
487	} else {
488	push (@outtext, &transchar ($encodename, $c));
489	}
490	$i ++;
491	}
492	return \@outtext;
493	}
494
495	# returns 1 if successful, 0 if unsuccessful
496	sub loadmapencoding {
497	my ($encoding, $mapfile) = @_;
498
499	# check to see if the encoding has already been loaded
500	return 1 if (defined $translations{$encoding});
501
502	return 0 unless open (MAPFILE, $mapfile);
503	binmode (MAPFILE);
504
505	$translations{$encoding} = [@array256];
506	my $block = $translations{$encoding};
507
508	my ($in,$i,$j);
509	while (read(MAPFILE, $in, 1) == 1) {
510	$i = unpack ("C", $in);
511	$block->[$i] = [@array256];
512	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
513	my ($n1, $n2) = unpack ("CC", $in);
514	$block->[$i]->[$j] = ($n1*256) + $n2;
515	}
516	}
517
518	close (MAPFILE);
519	}
520
521	sub transchar {
522	my ($encoding, $from) = @_;
523	my $high = ($from / 256) % 256;
524	my $low = $from % 256;
525
526	return 0 unless defined $translations{$encoding};
527
528	my $block = $translations{$encoding};
529
530	if (ref ($block->[$high]) ne "ARRAY") {
531	return 0;
532	}
533	return $block->[$high]->[$low];
534	}
535
536
537
538
539	1;
540

Note: See TracBrowser for help on using the repository browser.

Download in other formats: