source: trunk/gsdl/perllib/unicode.pm@ 1868

Last change on this file since 1868 was 1868, checked in by sjboddie, 23 years ago

Made a bunch of changes to the building code to support lots of new
languages and encodings. It's still kind of a mess but should be fixed
up over the weekend.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33%translations = ();
34
35# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36# and returns a unicode array.
37sub ascii2unicode {
38 my ($in) = @_;
39 my $out = [];
40
41 my $i = 0;
42 my $len = length($in);
43 while ($i < $len) {
44 push (@$out, ord(substr ($in, $i, 1)));
45 $i++;
46 }
47
48 return $out;
49}
50
51# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
52# and returns a unicode array. These encodings are similar to but not
53# identical to the corresponding ISO-8859 encodings.
54#
55# $encoding should be the code page name (e.g. '1252')
56#
57# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
58sub windows2unicode {
59 my ($encoding, $in) = @_;
60 my $out = [];
61
62 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
63 "WINDOWS", "$encoding.TXT");
64 return $out unless &loadmapping ($encoding, $mapfile);
65
66 my $i = 0;
67 my $len = length($in);
68 while ($i < $len) {
69 my $c = ord(substr ($in, $i, 1));
70 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
71 push (@$out, $c);
72 $i++;
73 }
74
75 return $out;
76}
77
78# iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
79# and returns a unicode array. This function is much like windows2unicode()
80# except that only characters >= 0xA0 are read from the mapping file (since
81# all characters below that are the same for all iso-8859 character sets
82# and therefore already the same as unicode).
83#
84# Note that while this function will work for iso-8859-1 (latin 1) it'll be
85# much faster to use ascii2unicode() or ascii2utf8()
86#
87# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
88# encoding is
89#
90# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
91sub iso2unicode {
92 my ($encoding, $in) = @_;
93 my $out = [];
94
95 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
96 "ISO_8859", "$encoding.TXT");
97 return $out unless &loadmapping ($encoding, $mapfile);
98
99 my $i = 0;
100 my $len = length($in);
101 while ($i < $len) {
102 my $c = ord(substr ($in, $i, 1));
103 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
104 push (@$out, $c);
105 $i++;
106 }
107
108 return $out;
109}
110
111# cyrillic2unicode is basically identical to windows2unicode, the only
112# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
113#
114# values for $encoding may be 'koi8_r' or 'koi8_u'
115sub cyrillic2unicode {
116 my ($encoding, $in) = @_;
117 my $out = [];
118
119 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
120 "CYRILLIC", "$encoding.txt");
121 return $out unless &loadmapping ($encoding, $mapfile);
122
123 my $i = 0;
124 my $len = length($in);
125 while ($i < $len) {
126 my $c = ord(substr ($in, $i, 1));
127 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
128 push (@$out, $c);
129 $i++;
130 }
131
132 return $out;
133}
134
135# iscii2unicode is basically identical to iso2unicode, the only
136# difference being that the map files live in unicode/MAPPINGS/ISCII
137#
138# values for $encoding may be 'Devanagari' only at present
139sub iscii2unicode {
140 my ($encoding, $in) = @_;
141 my $out = [];
142
143 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
144 "ISCII", "$encoding.txt");
145 return $out unless &loadmapping ($encoding, $mapfile);
146
147 my $i = 0;
148 my $len = length($in);
149 while ($i < $len) {
150 my $c = ord(substr ($in, $i, 1));
151 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
152 push (@$out, $c);
153 $i++;
154 }
155
156 return $out;
157}
158
159# ascii2utf8 takes a (extended) ascii string and
160# returns a UTF-8 encoded string. This is just
161# a faster version of "&unicode2utf8(&ascii2unicode($str));"
162sub ascii2utf8 {
163 my ($in) = @_;
164 my $out = "";
165
166 my ($c);
167 my $i = 0;
168 my $len = length($in);
169 while ($i < $len) {
170 $c = ord (substr ($in, $i, 1));
171 if ($c < 0x80) {
172 # ascii character
173 $out .= chr ($c);
174
175 } else {
176 # extended ascii character
177 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
178 $out .= chr (0x80 + ($c & 0x3f));
179 }
180 $i++;
181 }
182
183 return $out;
184}
185
186
187# unicode2utf8 takes a unicode array as input and encodes it
188# using utf-8
189sub unicode2utf8 {
190 my ($in) = @_;
191 my $out = "";
192
193 foreach $num (@$in) {
194 next unless defined $num;
195 if ($num < 0x80) {
196 $out .= chr ($num);
197
198 } elsif ($num < 0x800) {
199 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
200 $out .= chr (0x80 + ($num & 0x3f));
201
202 } elsif ($num < 0xFFFF) {
203 $out .= chr (0xe0 + (($num >> 12) & 0xf));
204 $out .= chr (0x80 + (($num >> 6) & 0x3f));
205 $out .= chr (0x80 + ($num & 0x3f));
206
207 } else {
208 # error, don't encode anything
209 die;
210 }
211 }
212
213 return $out;
214}
215
216
217# utf82unicode takes a utf-8 string and produces a unicode
218# array
219sub utf82unicode {
220 my ($in) = @_;
221 my $out = [];
222
223 my $i = 0;
224 my ($c1, $c2, $c3);
225 $len = length($in);
226 while ($i < $len) {
227 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
228 # normal ascii character
229 push (@$out, $c1);
230
231 } elsif ($c1 < 0xc0) {
232 # error, was expecting the first byte of an
233 # encoded character. Do nothing.
234
235 } elsif ($c1 < 0xe0 && $i+1 < $len) {
236 # an encoded character with two bytes
237 $c2 = ord (substr ($in, $i+1, 1));
238 if ($c2 >= 0x80 && $c2 < 0xc0) {
239 # everything looks ok
240 push (@$out, ((($c1 & 0x1f) << 6) +
241 ($c2 & 0x3f)));
242 $i++; # gobbled an extra byte
243 }
244
245 } elsif ($c1 < 0xf0 && $i+2 < $len) {
246 # an encoded character with three bytes
247 $c2 = ord (substr ($in, $i+1, 1));
248 $c3 = ord (substr ($in, $i+2, 1));
249 if ($c2 >= 0x80 && $c2 < 0xc0 &&
250 $c3 >= 0x80 && $c3 < 0xc0) {
251 # everything looks ok
252 push (@$out, ((($c1 & 0xf) << 12) +
253 (($c2 & 0x3f) << 6) +
254 ($c3 & 0x3f)));
255
256 $i += 2; # gobbled an extra two bytes
257 }
258
259 } else {
260 # error, only decode Unicode characters not full UCS.
261 # Do nothing.
262 }
263
264 $i++;
265 }
266
267 return $out;
268}
269
270
271# unicode2ucs2 takes a unicode array and produces a UCS-2
272# unicode string (every two bytes forms a unicode character)
273sub unicode2ucs2 {
274 my ($in) = @_;
275 my $out = "";
276
277 foreach $num (@$in) {
278 $out .= chr (($num & 0xff00) >> 8);
279 $out .= chr ($num & 0xff);
280 }
281
282 return $out;
283}
284
285
286# ucs22unicode takes a UCS-2 string and produces a unicode array
287sub ucs22unicode {
288 my ($in) = @_;
289 my $out = [];
290
291 my $i = 0;
292 my $len = length ($in);
293 while ($i+1 < $len) {
294 push (@$out, ord (substr($in, $i, 1)) << 8 +
295 ord (substr($in, $i+1, 1)));
296
297 $i ++;
298 }
299
300 return $out;
301}
302
303# loadmapping expects the mapfile to contain (at least) two
304# tab-separated fields. The first field is the mapped value
305# and the second field is the unicode value.
306#
307# It returns 1 if successful, 0 if unsuccessful
308sub loadmapping {
309 my ($encoding, $mapfile) = @_;
310
311 my $to = "$encoding-unicode";
312 my $from = "unicode-$encoding";
313
314 # check to see if the encoding has already been loaded
315 if (defined $translations{$to} && defined $translations{$from}) {
316 return 1;
317 }
318
319 if (!open (MAPFILE, $mapfile)) {
320 print STDERR "ERROR: unable to load mapfile $mapfile\n";
321 return 0;
322 }
323
324 my ($line, @line);
325 $translations{$to} = {};
326 $translations{$from} = {};
327 while (defined ($line = <MAPFILE>)) {
328 # remove comments
329 $line =~ s/\#.*$//;
330 next unless $line =~ /\S/;
331
332 # split the line into fields and do a few
333 # simple sanity checks
334 @line = split (/\t/, $line);
335 next unless (scalar(@line) >= 2 &&
336 $line[0] =~ /^0x/ &&
337 $line[1] =~ /^0x/);
338
339 my $a = hex($line[0]);
340 my $b = hex($line[1]);
341
342 $translations{$to}->{$a} = $b;
343 $translations{$from}->{$b} = $a;
344 }
345
346 close (MAPFILE);
347
348 return 1;
349}
350
351
352
353
354
355
356
357
358
359####################################################################################################
360
361
362# %translations is of the form:
363#
364# encodings{encodingname-encodingname}->blocktranslation
365# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
366#
367# Any of the top translation blocks can point to an undefined
368# value. This data structure aims to allow fast translation and
369# efficient storage.
370%translations = ();
371
372# @array256 is used for initialisation, there must be
373# a better way...
374@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
375 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
376 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
377 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
378 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
379 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
380 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
381 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
382 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
383 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
384 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
385 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
386 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
387 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
388 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
389 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
390
391$encodings = {
392 'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
393 'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
394
395 'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
396 'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
397
398 'iso_8859_3' => {'fullname' => 'Latin3',
399 'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
400
401 'iso_8859_4' => {'fullname' => 'Latin4',
402 'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
403
404 'iso_8859_5' => {'fullname' => 'Cyrillic',
405 'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
406
407 'iso_8859_6' => {'fullname' => 'Arabic',
408 'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
409
410 'iso_8859_7' => {'fullname' => 'Greek',
411 'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
412
413 'iso_8859_8' => {'fullname' => 'Hebrew',
414 'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
415
416 'iso_8859_9' => {'fullname' => 'Latin5',
417 'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
418
419 'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
420 'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
421
422 'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
423 'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
424
425 'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
426 'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
427
428 'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
429 'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
430
431 'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
432 'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
433
434 'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
435 'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
436
437 'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
438 'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
439
440 'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
441 'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
442
443 'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
444 'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
445
446 'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
447 'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
448
449 'koi8_r' => {'fullname' => 'Cyrillic',
450 'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
451
452 'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
453 'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
454
455 'iscii_de' => {'fullname' => 'ISCII Devanagari',
456 'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
457};
458
459# returns a pointer to unicode array
460sub simple2unicode {
461 my ($encoding, $intext) = @_;
462
463 if (!defined ($encodings->{$encoding})) {
464 print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
465 return [];
466 }
467
468 my $info = $encodings->{$encoding};
469 my $encodename = "$encoding-unicode";
470 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
471 $info->{'mapfile'});
472
473 if (!&loadmapencoding ($encodename, $mapfile)) {
474 print STDERR "unicode: ERROR - could not load encoding $encodename\n";
475 return [];
476 }
477
478 my @outtext = ();
479 my $len = length($intext);
480 my ($c);
481 my $i = 0;
482
483 while ($i < $len) {
484 if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
485 # normal ascii character
486 push (@outtext, $c);
487 } else {
488 push (@outtext, &transchar ($encodename, $c));
489 }
490 $i ++;
491 }
492 return \@outtext;
493}
494
495# returns 1 if successful, 0 if unsuccessful
496sub loadmapencoding {
497 my ($encoding, $mapfile) = @_;
498
499 # check to see if the encoding has already been loaded
500 return 1 if (defined $translations{$encoding});
501
502 return 0 unless open (MAPFILE, $mapfile);
503 binmode (MAPFILE);
504
505 $translations{$encoding} = [@array256];
506 my $block = $translations{$encoding};
507
508 my ($in,$i,$j);
509 while (read(MAPFILE, $in, 1) == 1) {
510 $i = unpack ("C", $in);
511 $block->[$i] = [@array256];
512 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
513 my ($n1, $n2) = unpack ("CC", $in);
514 $block->[$i]->[$j] = ($n1*256) + $n2;
515 }
516 }
517
518 close (MAPFILE);
519}
520
521sub transchar {
522 my ($encoding, $from) = @_;
523 my $high = ($from / 256) % 256;
524 my $low = $from % 256;
525
526 return 0 unless defined $translations{$encoding};
527
528 my $block = $translations{$encoding};
529
530 if (ref ($block->[$high]) ne "ARRAY") {
531 return 0;
532 }
533 return $block->[$high]->[$low];
534}
535
536
537
538
5391;
540
Note: See TracBrowser for help on using the repository browser.