source: tags/gsdl-2_51-distribution/gsdl/perllib/unicode.pm@ 7622

Last change on this file since 7622 was 4229, checked in by mdewsnip, 21 years ago

Replaced unicode2koi8r with a more generic version: unicode2singlebyte. This can be used to convert unicode to any single byte encoding - (KOI8-R, the Kazakh encoding, etc.).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use util;
37
38# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
39# and returns a unicode array.
40sub ascii2unicode {
41 my ($in) = @_;
42 my $out = [];
43
44 my $i = 0;
45 my $len = length($in);
46 while ($i < $len) {
47 push (@$out, ord(substr ($in, $i, 1)));
48 $i++;
49 }
50
51 return $out;
52}
53
54# ascii2utf8 takes a reference to an (extended) ascii string and returns a
55# UTF-8 encoded string. This is just a faster version of
56# "&unicode2utf8(&ascii2unicode($str));"
57sub ascii2utf8 {
58 my ($in) = @_;
59 my $out = "";
60
61 if (!defined($in)|| !defined($$in)) {
62 return $out;
63 }
64
65 my ($c);
66 my $i = 0;
67 my $len = length($$in);
68 while ($i < $len) {
69 $c = ord (substr ($$in, $i, 1));
70 if ($c < 0x80) {
71 # ascii character
72 $out .= chr ($c);
73
74 } else {
75 # extended ascii character
76 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
77 $out .= chr (0x80 + ($c & 0x3f));
78 }
79 $i++;
80 }
81
82 return $out;
83}
84
85# unicode2utf8 takes a unicode array as input and encodes it
86# using utf-8
87sub unicode2utf8 {
88 my ($in) = @_;
89 my $out = "";
90
91 foreach $num (@$in) {
92 next unless defined $num;
93 if ($num < 0x80) {
94 $out .= chr ($num);
95
96 } elsif ($num < 0x800) {
97 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
98 $out .= chr (0x80 + ($num & 0x3f));
99
100 } elsif ($num < 0xFFFF) {
101 $out .= chr (0xe0 + (($num >> 12) & 0xf));
102 $out .= chr (0x80 + (($num >> 6) & 0x3f));
103 $out .= chr (0x80 + ($num & 0x3f));
104
105 } else {
106 # error, don't encode anything
107 die;
108 }
109 }
110 return $out;
111}
112
113# utf82unicode takes a utf-8 string and produces a unicode
114# array
115sub utf82unicode {
116 my ($in) = @_;
117 my $out = [];
118
119 my $i = 0;
120 my ($c1, $c2, $c3);
121 $len = length($in);
122 while ($i < $len) {
123 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
124 # normal ascii character
125 push (@$out, $c1);
126
127 } elsif ($c1 < 0xc0) {
128 # error, was expecting the first byte of an
129 # encoded character. Do nothing.
130
131 } elsif ($c1 < 0xe0 && $i+1 < $len) {
132 # an encoded character with two bytes
133 $c2 = ord (substr ($in, $i+1, 1));
134 if ($c2 >= 0x80 && $c2 < 0xc0) {
135 # everything looks ok
136 push (@$out, ((($c1 & 0x1f) << 6) +
137 ($c2 & 0x3f)));
138 $i++; # gobbled an extra byte
139 }
140
141 } elsif ($c1 < 0xf0 && $i+2 < $len) {
142 # an encoded character with three bytes
143 $c2 = ord (substr ($in, $i+1, 1));
144 $c3 = ord (substr ($in, $i+2, 1));
145 if ($c2 >= 0x80 && $c2 < 0xc0 &&
146 $c3 >= 0x80 && $c3 < 0xc0) {
147 # everything looks ok
148 push (@$out, ((($c1 & 0xf) << 12) +
149 (($c2 & 0x3f) << 6) +
150 ($c3 & 0x3f)));
151
152 $i += 2; # gobbled an extra two bytes
153 }
154
155 } else {
156 # error, only decode Unicode characters not full UCS.
157 # Do nothing.
158 }
159
160 $i++;
161 }
162
163 return $out;
164}
165
166# unicode2ucs2 takes a unicode array and produces a UCS-2
167# unicode string (every two bytes forms a unicode character)
168sub unicode2ucs2 {
169 my ($in) = @_;
170 my $out = "";
171
172 foreach $num (@$in) {
173 $out .= chr (($num & 0xff00) >> 8);
174 $out .= chr ($num & 0xff);
175 }
176
177 return $out;
178}
179
180# ucs22unicode takes a UCS-2 string and produces a unicode array
181sub ucs22unicode {
182 my ($in) = @_;
183 my $out = [];
184
185 my $i = 0;
186 my $len = length ($in);
187 while ($i+1 < $len) {
188 push (@$out, ord (substr($in, $i, 1)) << 8 +
189 ord (substr($in, $i+1, 1)));
190
191 $i ++;
192 }
193
194 return $out;
195}
196
197# takes a reference to a string and returns a reference to a unicode array
198sub convert2unicode {
199 my ($encoding, $textref) = @_;
200
201 if (!defined $encodings::encodings->{$encoding}) {
202 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
203 return [];
204 }
205
206 my $encodename = "$encoding-unicode";
207 my $enc_info = $encodings::encodings->{$encoding};
208 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
209 "to_uc", $enc_info->{'mapfile'});
210 if (!&loadmapencoding ($encodename, $mapfile)) {
211 print STDERR "unicode: ERROR - could not load encoding $encodename\n";
212 return [];
213 }
214
215 if (defined $enc_info->{'converter'}) {
216 my $converter = $enc_info->{'converter'};
217 return &$converter ($encodename, $textref);
218 }
219
220 if ($translations{$encodename}->{'count'} == 1) {
221 return &singlebyte2unicode ($encodename, $textref);
222 } else {
223 return &doublebyte2unicode ($encodename, $textref);
224 }
225}
226
227# singlebyte2unicode converts simple 8 bit encodings where characters below
228# 0x80 are normal ascii characters and the rest are decoded using the
229# appropriate mapping files.
230#
231# Examples of encodings that may be converted using singlebyte2unicode are
232# the iso-8859 and windows-125* series.
233sub singlebyte2unicode {
234 my ($encodename, $textref) = @_;
235
236 my @outtext = ();
237 my $len = length($$textref);
238 my ($c);
239 my $i = 0;
240
241 while ($i < $len) {
242 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
243 # normal ascii character
244 push (@outtext, $c);
245 } else {
246 $c = &transchar ($encodename, $c);
247 # put a black square if cannot translate
248 $c = 0x25A1 if $c == 0;
249 push (@outtext, $c);
250 }
251 $i ++;
252 }
253 return \@outtext;
254}
255
256# doublebyte2unicode converts simple two byte encodings where characters
257# below code point 0x80 are single-byte characters and the rest are
258# double-byte characters.
259#
260# Examples of encodings that may be converted using doublebyte2unicode are
261# CJK encodings like GB encoded Chinese and UHC Korean.
262#
263# Note that no error checking is performed to make sure that the input text
264# is valid for the given encoding.
265#
266# Also, encodings that may contain characters of more than two bytes are
267# not supported (any EUC encoded text may in theory contain 3-byte
268# characters but in practice only one and two byte characters are used).
269sub doublebyte2unicode {
270 my ($encodename, $textref) = @_;
271
272 my @outtext = ();
273 my $len = length($$textref);
274 my ($c1, $c2);
275 my $i = 0;
276
277 while ($i < $len) {
278 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
279 if ($i+1 < $len) {
280 # double-byte character
281 $c2 = ord(substr($$textref, $i+1, 1));
282 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
283 # put a black square if cannot translate
284 $c = 0x25A1 if $c == 0;
285 push (@outtext, $c);
286 $i += 2;
287
288 } else {
289 # error
290 print STDERR "unicode: ERROR missing second half of double-byte character\n";
291 $i++;
292 }
293
294 } else {
295 # single-byte character
296 push (@outtext, $c1);
297 $i++;
298 }
299 }
300 return \@outtext;
301}
302
303# Shift-JIS to unicode
304# We can't use doublebyte2unicode for Shift-JIS because it uses some
305# single-byte characters above code point 0x80 (i.e. half-width katakana
306# characters in the range 0xA1-0xDF)
307sub shiftjis2unicode {
308 my ($encodename, $textref) = @_;
309
310 my @outtext = ();
311 my $len = length($$textref);
312 my ($c1, $c2);
313 my $i = 0;
314
315 while ($i < $len) {
316 $c1 = ord(substr($$textref, $i, 1));
317
318 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
319 # Single-byte half-width katakana character or
320 # JIS Roman yen or overline characters
321 my $c = &transchar ($encodename, $c1);
322 # - put a black square if cannot translate
323 $c = 0x25A1 if $c == 0;
324 push (@outtext, $c);
325 $i++;
326
327 } elsif ($c1 < 0x80) {
328 # ASCII
329 push (@outtext, $c1);
330 $i ++;
331
332 } elsif ($c1 < 0xEF) {
333 if ($i+1 < $len) {
334 $c2 = ord(substr($$textref, $i+1, 1));
335 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
336 # Double-byte shift-jis character
337 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
338 # put a black square if cannot translate
339 $c = 0x25A1 if $c == 0;
340 push (@outtext, $c);
341 } else {
342 # error
343 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
344 }
345 $i += 2;
346 } else {
347 # error
348 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
349 $i ++;
350 }
351 } else {
352 # error
353 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
354 $i ++;
355 }
356 }
357 return \@outtext;
358}
359
360sub transchar {
361 my ($encoding, $from) = @_;
362 my $high = ($from / 256) % 256;
363 my $low = $from % 256;
364
365 return 0 unless defined $translations{$encoding};
366
367 my $block = $translations{$encoding}->{'map'};
368
369 if (ref ($block->[$high]) ne "ARRAY") {
370 return 0;
371 }
372 return $block->[$high]->[$low];
373}
374
375# %translations is of the form:
376#
377# encodings{encodingname-encodingname}->{'map'}->blocktranslation
378# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
379#
380# Any of the top translation blocks can point to an undefined
381# value. This data structure aims to allow fast translation and
382# efficient storage.
383%translations = ();
384
385# @array256 is used for initialisation, there must be
386# a better way...
387# What about this?: @array256 = (0) x 256;
388@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
389 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
390 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
391 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
392 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
404
405# returns 1 if successful, 0 if unsuccessful
406sub loadmapencoding {
407 my ($encoding, $mapfile) = @_;
408
409 # check to see if the encoding has already been loaded
410 return 1 if (defined $translations{$encoding});
411
412 return 0 unless open (MAPFILE, $mapfile);
413 binmode (MAPFILE);
414
415 $translations{$encoding} = {'map' => [@array256], 'count' => 0};
416 my $block = $translations{$encoding};
417
418 my ($in,$i,$j);
419 while (read(MAPFILE, $in, 1) == 1) {
420 $i = unpack ("C", $in);
421 $block->{'map'}->[$i] = [@array256];
422 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
423 my ($n1, $n2) = unpack ("CC", $in);
424 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
425 }
426 $block->{'count'} ++;
427 }
428
429 close (MAPFILE);
430}
431
432# unicode2singlebyte converts unicode to simple 8 bit encodings where
433# characters below 0x80 are normal ascii characters and the rest are encoded
434# using the appropriate mapping files.
435#
436# Examples of encodings that may be converted using unicode2singlebyte are
437# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
438sub unicode2singlebyte {
439 my ($uniref, $encoding) = @_;
440
441 my $outtext = "";
442 my $encodename = "unicode-$encoding";
443 my $enc_info = $encodings::encodings->{"$encoding"};
444 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
445 "from_uc", $enc_info->{'mapfile'});
446 if (!&loadmapencoding ($encodename, $mapfile)) {
447 print STDERR "unicode: ERROR - could not load encoding $encodename\n";
448 return "";
449 }
450
451 foreach my $c (@$uniref) {
452 if ($c < 0x80) {
453 # normal ascii character
454 $outtext .= chr($c);
455 } else {
456 # extended ascii character
457 $c = &transchar ($encodename, $c);
458
459 # put a question mark if cannot translate
460 if ($c == 0) {
461 $outtext .= "?";
462 } else {
463 $outtext .= chr($c);
464 }
465 }
466 }
467 return $outtext;
468}
469
4701;
Note: See TracBrowser for help on using the repository browser.