source: gsdl/trunk/perllib/unicode.pm@ 18201

Last change on this file since 18201 was 18201, checked in by ak19, 15 years ago

When associated files are renamed with the URLencoded versions of their original filenames, the spaces are no longer URL encoded, as this conflicted with mp3, wmv and possibly other media file formats being opened in external or browser-embedded apps

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.4 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38
39# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
40# and returns a unicode array.
41sub ascii2unicode {
42 my ($in) = @_;
43 my $out = [];
44
45 my $i = 0;
46 my $len = length($in);
47 while ($i < $len) {
48 push (@$out, ord(substr ($in, $i, 1)));
49 $i++;
50 }
51
52 return $out;
53}
54
55# ascii2utf8 takes a reference to an (extended) ascii string and returns a
56# UTF-8 encoded string. This is just a faster version of
57# "&unicode2utf8(&ascii2unicode($str));"
58# "Extended ascii" really means "iso_8859_1"
59sub ascii2utf8 {
60 my ($in) = @_;
61 my $out = "";
62
63 if (!defined($in)|| !defined($$in)) {
64 return $out;
65 }
66
67 my ($c);
68 my $i = 0;
69 my $len = length($$in);
70 while ($i < $len) {
71 $c = ord (substr ($$in, $i, 1));
72 if ($c < 0x80) {
73 # ascii character
74 $out .= chr ($c);
75
76 } else {
77 # extended ascii character
78 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
79 $out .= chr (0x80 + ($c & 0x3f));
80 }
81 $i++;
82 }
83
84 return $out;
85}
86
87# unicode2utf8 takes a unicode array as input and encodes it
88# using utf-8
89sub unicode2utf8 {
90 my ($in) = @_;
91 my $out = "";
92
93 foreach my $num (@$in) {
94 next unless defined $num;
95 if ($num < 0x80) {
96 $out .= chr ($num);
97
98 } elsif ($num < 0x800) {
99 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
100 $out .= chr (0x80 + ($num & 0x3f));
101
102 } elsif ($num < 0xFFFF) {
103 $out .= chr (0xe0 + (($num >> 12) & 0xf));
104 $out .= chr (0x80 + (($num >> 6) & 0x3f));
105 $out .= chr (0x80 + ($num & 0x3f));
106
107 } else {
108 # error, don't encode anything
109 die;
110 }
111 }
112 return $out;
113}
114
115# utf82unicode takes a utf-8 string and produces a unicode
116# array
117sub utf82unicode {
118 my ($in) = @_;
119 my $out = [];
120
121 my $i = 0;
122 my ($c1, $c2, $c3);
123 my $len = length($in);
124 while ($i < $len) {
125 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
126 # normal ascii character
127 push (@$out, $c1);
128
129 } elsif ($c1 < 0xc0) {
130 # error, was expecting the first byte of an
131 # encoded character. Do nothing.
132
133 } elsif ($c1 < 0xe0 && $i+1 < $len) {
134 # an encoded character with two bytes
135 $c2 = ord (substr ($in, $i+1, 1));
136 if ($c2 >= 0x80 && $c2 < 0xc0) {
137 # everything looks ok
138 push (@$out, ((($c1 & 0x1f) << 6) +
139 ($c2 & 0x3f)));
140 $i++; # gobbled an extra byte
141 }
142
143 } elsif ($c1 < 0xf0 && $i+2 < $len) {
144 # an encoded character with three bytes
145 $c2 = ord (substr ($in, $i+1, 1));
146 $c3 = ord (substr ($in, $i+2, 1));
147 if ($c2 >= 0x80 && $c2 < 0xc0 &&
148 $c3 >= 0x80 && $c3 < 0xc0) {
149 # everything looks ok
150 push (@$out, ((($c1 & 0xf) << 12) +
151 (($c2 & 0x3f) << 6) +
152 ($c3 & 0x3f)));
153
154 $i += 2; # gobbled an extra two bytes
155 }
156
157 } else {
158 # error, only decode Unicode characters not full UCS.
159 # Do nothing.
160 }
161
162 $i++;
163 }
164
165 return $out;
166}
167
168# unicode2ucs2 takes a unicode array and produces a UCS-2
169# unicode string (every two bytes forms a unicode character)
170sub unicode2ucs2 {
171 my ($in) = @_;
172 my $out = "";
173
174 foreach my $num (@$in) {
175 $out .= chr (($num & 0xff00) >> 8);
176 $out .= chr ($num & 0xff);
177 }
178
179 return $out;
180}
181
182# ucs22unicode takes a UCS-2 string and produces a unicode array
183sub ucs22unicode {
184 my ($in) = @_;
185 my $out = [];
186
187 my $i = 0;
188 my $len = length ($in);
189 while ($i+1 < $len) {
190 push (@$out, ord (substr($in, $i, 1)) << 8 +
191 ord (substr($in, $i+1, 1)));
192
193 $i ++;
194 }
195
196 return $out;
197}
198
199# takes a reference to a string and returns a reference to a unicode array
200sub convert2unicode {
201 my ($encoding, $textref) = @_;
202
203 if (!defined $encodings::encodings->{$encoding}) {
204 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
205 return [];
206 }
207
208 my $encodename = "$encoding-unicode";
209 my $enc_info = $encodings::encodings->{$encoding};
210 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
211 "to_uc", $enc_info->{'mapfile'});
212 if (!&loadmapencoding ($encodename, $mapfile)) {
213 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
214 return [];
215 }
216
217 if (defined $enc_info->{'converter'}) {
218 my $converter = $enc_info->{'converter'};
219 return &$converter ($encodename, $textref);
220 }
221
222 if ($unicode::translations{$encodename}->{'count'} == 1) {
223 return &singlebyte2unicode ($encodename, $textref);
224 } else {
225 return &doublebyte2unicode ($encodename, $textref);
226 }
227}
228
229# singlebyte2unicode converts simple 8 bit encodings where characters below
230# 0x80 are normal ascii characters and the rest are decoded using the
231# appropriate mapping files.
232#
233# Examples of encodings that may be converted using singlebyte2unicode are
234# the iso-8859 and windows-125* series.
235sub singlebyte2unicode {
236 my ($encodename, $textref) = @_;
237
238 my @outtext = ();
239 my $len = length($$textref);
240 my ($c);
241 my $i = 0;
242
243 while ($i < $len) {
244 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
245 # normal ascii character
246 push (@outtext, $c);
247 } else {
248 $c = &transchar ($encodename, $c);
249 # put a black square if cannot translate
250 $c = 0x25A1 if $c == 0;
251 push (@outtext, $c);
252 }
253 $i ++;
254 }
255 return \@outtext;
256}
257
258# doublebyte2unicode converts simple two byte encodings where characters
259# below code point 0x80 are single-byte characters and the rest are
260# double-byte characters.
261#
262# Examples of encodings that may be converted using doublebyte2unicode are
263# CJK encodings like GB encoded Chinese and UHC Korean.
264#
265# Note that no error checking is performed to make sure that the input text
266# is valid for the given encoding.
267#
268# Also, encodings that may contain characters of more than two bytes are
269# not supported (any EUC encoded text may in theory contain 3-byte
270# characters but in practice only one and two byte characters are used).
271sub doublebyte2unicode {
272 my ($encodename, $textref) = @_;
273
274 my @outtext = ();
275 my $len = length($$textref);
276 my ($c1, $c2);
277 my $i = 0;
278
279 while ($i < $len) {
280 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
281 if ($i+1 < $len) {
282 # double-byte character
283 $c2 = ord(substr($$textref, $i+1, 1));
284 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
285 # put a black square if cannot translate
286 $c = 0x25A1 if $c == 0;
287 push (@outtext, $c);
288 $i += 2;
289
290 } else {
291 # error
292 print STDERR "unicode: ERROR missing second half of double-byte character\n";
293 $i++;
294 }
295
296 } else {
297 # single-byte character
298 push (@outtext, $c1);
299 $i++;
300 }
301 }
302 return \@outtext;
303}
304
305# Shift-JIS to unicode
306# We can't use doublebyte2unicode for Shift-JIS because it uses some
307# single-byte characters above code point 0x80 (i.e. half-width katakana
308# characters in the range 0xA1-0xDF)
309sub shiftjis2unicode {
310 my ($encodename, $textref) = @_;
311
312 my @outtext = ();
313 my $len = length($$textref);
314 my ($c1, $c2);
315 my $i = 0;
316
317 while ($i < $len) {
318 $c1 = ord(substr($$textref, $i, 1));
319
320 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
321 # Single-byte half-width katakana character or
322 # JIS Roman yen or overline characters
323 my $c = &transchar ($encodename, $c1);
324 # - put a black square if cannot translate
325 $c = 0x25A1 if $c == 0;
326 push (@outtext, $c);
327 $i++;
328
329 } elsif ($c1 < 0x80) {
330 # ASCII
331 push (@outtext, $c1);
332 $i ++;
333
334 } elsif ($c1 < 0xEF) {
335 if ($i+1 < $len) {
336 $c2 = ord(substr($$textref, $i+1, 1));
337 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
338 # Double-byte shift-jis character
339 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
340 # put a black square if cannot translate
341 $c = 0x25A1 if $c == 0;
342 push (@outtext, $c);
343 } else {
344 # error
345 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
346 }
347 $i += 2;
348 } else {
349 # error
350 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
351 $i ++;
352 }
353 } else {
354 # error
355 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
356 $i ++;
357 }
358 }
359 return \@outtext;
360}
361
362sub transchar {
363 my ($encoding, $from) = @_;
364 my $high = ($from / 256) % 256;
365 my $low = $from % 256;
366
367 return 0 unless defined $unicode::translations{$encoding};
368
369 my $block = $unicode::translations{$encoding}->{'map'};
370
371 if (ref ($block->[$high]) ne "ARRAY") {
372 return 0;
373 }
374 return $block->[$high]->[$low];
375}
376
377# %translations is of the form:
378#
379# encodings{encodingname-encodingname}->{'map'}->blocktranslation
380# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
381#
382# Any of the top translation blocks can point to an undefined
383# value. This data structure aims to allow fast translation and
384# efficient storage.
385%unicode::translations = ();
386
387# @array256 is used for initialisation, there must be
388# a better way...
389# What about this?: @array256 = (0) x 256;
390@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
391 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
392 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
404 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
405 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
406
407# returns 1 if successful, 0 if unsuccessful
408sub loadmapencoding {
409 my ($encoding, $mapfile) = @_;
410
411 # check to see if the encoding has already been loaded
412 return 1 if (defined $unicode::translations{$encoding});
413
414 if (! -r $mapfile || -d $mapfile) {
415 return 0;
416 }
417 return 0 unless open (MAPFILE, $mapfile);
418 binmode (MAPFILE);
419
420 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
421 my $block = $unicode::translations{$encoding};
422
423 my ($in,$i,$j);
424 while (1) {
425 my $ret=read(MAPFILE, $in, 1);
426 if (!defined($ret)) { # error
427 print STDERR "unicode.pm: error reading mapfile: $!\n";
428 last;
429 }
430 if ($ret != 1) { last }
431 $i = unpack ("C", $in);
432 $block->{'map'}->[$i] = [@unicode::array256];
433 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
434 my ($n1, $n2) = unpack ("CC", $in);
435 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
436 }
437 $block->{'count'} ++;
438 }
439
440 close (MAPFILE);
441}
442
443# unicode2singlebyte converts unicode to simple 8 bit encodings where
444# characters below 0x80 are normal ascii characters and the rest are encoded
445# using the appropriate mapping files.
446#
447# Examples of encodings that may be converted using unicode2singlebyte are
448# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
449sub unicode2singlebyte {
450 my ($uniref, $encoding) = @_;
451
452 my $outtext = "";
453 my $encodename = "unicode-$encoding";
454
455 if (!exists $encodings::encodings->{$encoding}) {
456 print STDERR "unicode.pm: ERROR - unsupported encoding "
457 . "'$encoding' requested\n";
458 return "";
459 }
460
461 my $enc_info = $encodings::encodings->{$encoding};
462 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
463 "from_uc", $enc_info->{'mapfile'});
464 if (!&loadmapencoding ($encodename, $mapfile)) {
465 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
466 return "";
467 }
468
469 foreach my $c (@$uniref) {
470 if ($c < 0x80) {
471 # normal ascii character
472 $outtext .= chr($c);
473 } else {
474 # extended ascii character
475 $c = &transchar ($encodename, $c);
476
477 # put a question mark if cannot translate
478 if ($c == 0) {
479 $outtext .= "?";
480 } else {
481 $outtext .= chr($c);
482 }
483 }
484 }
485 return $outtext;
486}
487
488
489# this makes sure that the referenced input string is utf8 encoded, and
490# will change/remove bytes that aren't.
491# returns 0 if the text was already utf8, or 1 if text modified to become utf8
492sub ensure_utf8 {
493 my $stringref=shift;
494
495 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
496 return $stringref;
497 }
498
499 my $value=$$stringref;
500
501 my $non_utf8_found = 0;
502 $value =~ m/^/g; # to set \G
503 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
504 my $highbytes=$1;
505 my $highbyteslength=length($highbytes);
506 # make sure this block of high bytes is utf-8
507 $highbytes =~ /^/g; # set pos()
508 my $byte_replaced = 0;
509 while ($highbytes =~
510 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
511 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
512 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
513 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
514 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
515 )*([\x80-\xff])? !xg
516 ) {
517 # this highbyte is "out-of-place" for valid utf-8
518 my $badbyte=$1;
519 if (!defined $badbyte) {next} # hit end of string
520 my $pos=pos($highbytes);
521 # replace bad byte. assume iso-8859-1 -> utf-8
522 # ascii2utf8 does "extended ascii"... ie iso-8859-1
523 my $replacement=&unicode::ascii2utf8(\$badbyte);
524 substr($highbytes, $pos-1, 1, $replacement);
525 # update the position to continue searching (for \G)
526 pos($highbytes) = $pos+length($replacement)-1;
527 $byte_replaced = 1;
528 }
529 if ($byte_replaced) {
530 # replace this block of high bytes in the $value
531 $non_utf8_found = 1;
532 my $replength=length($highbytes); # we've changed the length
533 my $textpos=pos($value); # pos at end of last match
534 # replace bad bytes with good bytes
535 substr($value, $textpos-$highbyteslength,
536 $highbyteslength, $highbytes);
537 # update the position to continue searching (for \G)
538 pos($value)=$textpos+($replength-$highbyteslength)+1;
539 }
540 }
541
542 $$stringref = $value;
543 return $non_utf8_found;
544}
545
546# Returns true (1) if the given string is utf8 and false (0) if it isn't.
547# Does not modify the string parameter.
548sub check_is_utf8 {
549 my $value=shift;
550
551 if (!defined($value)) {
552 return 0; # not utf8 because it is undefined
553 }
554
555 $value =~ m/^/g; # to set \G
556 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
557 my $highbytes=$1;
558 # make sure this block of high bytes is utf-8
559 $highbytes =~ /^/g; # set pos()
560 while ($highbytes =~
561 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
562 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
563 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
564 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
565 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
566 )*([\x80-\xff])? !xg
567 ) {
568 my $badbyte=$1;
569 if (defined $badbyte) { # not end of string
570 return 0; # non-utf8 found
571 }
572 }
573 }
574
575 return 1;
576}
577
578sub url_encode {
579 my ($text) = @_;
580 if (!&is_url_encoded($text)) {
581 $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
582 }
583 return $text;
584}
585
586sub url_decode {
587 my ($text) = @_;
588
589 $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
590 return $text;
591}
592
593sub is_url_encoded {
594 my ($text) = @_;
595 return ($text =~ m/\%([A-F0-9]{2})/);
596}
597
598# When a filename on the filesystem is already URL-encoded, the
599# URL to it will have %25s in in place of every % sign, so that
600# URLs in html pages can refer to the URL-encoded filename.
601# This method changes the URL reference back into the actual
602# (URL-encoded) filename on the filesystem by replacing %25 with %.
603sub url_to_filename {
604 my ($text) =@_;
605 $text =~ s/%25/%/g;
606 return $text;
607}
608
609# When a filename on the filesystem is already URL-encoded, the
610# URL to it will have %25s in in place of every % sign, so that
611# URLs in html pages can refer to the URL-encoded filename.
612# Given a (URL-encoded) filename on the filesystem, this subroutine
613# returns the URL reference string for it by replacing % with %25.
614# The output string will be the same as the input string if the input
615# already contains one or more %25s. This is to prevent processing
616# a url more than once this way.
617sub filename_to_url {
618 my ($text) =@_;
619
620 if($text !~ m/%25/) {
621 $text =~ s/%/%25/g;
622 }
623 return $text;
624}
625
626
627sub substr
628{
629 my ($utf8_string, $offset, $length) = @_;
630
631 my @unicode_string = @{&utf82unicode($utf8_string)};
632 my $unicode_string_length = scalar(@unicode_string);
633
634 my $substr_start = $offset;
635 if ($substr_start >= $unicode_string_length) {
636 return "";
637 }
638
639 my $substr_end = $offset + $length - 1;
640 if ($substr_end >= $unicode_string_length) {
641 $substr_end = $unicode_string_length - 1;
642 }
643
644 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
645 return &unicode2utf8(\@unicode_substring);
646}
647
648
6491;
Note: See TracBrowser for help on using the repository browser.